diff --git a/model_examples/MapTR/.gitignore b/model_examples/MapTR/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..16b77b82098599629a4433a8a8296c8a798e7bd3 --- /dev/null +++ b/model_examples/MapTR/.gitignore @@ -0,0 +1,147 @@ +/*.sh +change_submit.py +cluster_submit.yaml + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +work_dirs +test +val +ckpts +data +.Python +build/ +ckpts/ +data/ +ckpts +data +test/ +val/ +work_dirs/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +cluster.sh diff --git a/model_examples/MapTR/LICENSE b/model_examples/MapTR/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..c0b2dc0c52bd8e21e4bd2d3504489b4b90ff113b --- /dev/null +++ b/model_examples/MapTR/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Hust Vision Lab + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/model_examples/MapTR/README.md b/model_examples/MapTR/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e921a7a03d0584497571e4e1344d2f5b70b92366 --- /dev/null +++ b/model_examples/MapTR/README.md @@ -0,0 +1,191 @@ +# MapTR for PyTorch + +## 目录 + +- [简介](#简介) + - [模型介绍](#模型介绍) + - [支持任务列表](#支持任务列表) + - [代码实现](#代码实现) +- [MapTR](#MapTR) + - [准备训练环境](#准备训练环境) + - [快速开始](#快速开始) + - [训练任务](#训练任务) +- [公网地址说明](#公网地址说明) +- [变更说明](#变更说明) +- [FAQ](#FAQ) + +# 简介 + +## 模型介绍 + +MapTR是一种高效的端到端Transformer模型,用于在线构建矢量化高清地图(HD Map)。高清地图在自动驾驶系统中是规划的基础和关键组件,提供了丰富而精确的环境信息。MapTR提出了一种统一的置换等价建模方法,将地图元素表示为等价置换组的点集,这样不仅可以准确描述地图元素的形状,还能稳定学习过程。此外,MapTR设计了一个分层查询嵌入方案,以灵活地编码结构化地图信息,并执行分层二分匹配来学习地图元素。 + +## 支持任务列表 + +本仓已经支持以下模型任务类型 + +| 模型 | 任务列表 | 是否支持 | +| :---------: | :------: | :------: | +| MapTR | 训练 | ✔ | + +## 代码实现 + +- 参考实现: + + ``` + url=https://github.com/hustvl/MapTR + commit_id=fa420a2e756c9e19b876bdf2f6d33a097d84be73 + ``` + +# MapTR + +## 准备训练环境 + +### 安装环境 + +**表 1** 三方库版本支持表 + +| 三方库 | 支持版本 | +| :-----: | :------: | +| PyTorch | 2.1 | + +### 安装昇腾环境 + +请参考昇腾社区中《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)》文档搭建昇腾环境,本仓已支持表2中软件版本。 + +**表 2** 昇腾软件版本支持表 + +| 软件类型 | 支持版本 | +| :---------------: | :------: | +| FrameworkPTAdapter | 6.0.RC3 | +| CANN | 8.0.RC3 | +| 昇腾NPU固件 | 24.1.RC3 | +| 昇腾NPU驱动 | 24.1.RC3 | + +- 安装mmdet3d + + - 在模型根目录下,克隆mmdet3d仓,并进入mmdetection3d目录 + + ``` + git clone -b v1.0.0rc4 https://github.com/open-mmlab/mmdetection3d.git + cd mmdetection3d + ``` + - 在mmdetection3d目录下,修改代码 + + (1)删除requirements/runtime.txt中第3行 numba==0.53.0 + + (2)修改`mmdet3d/__init__.py`中第22行 mmcv_maximum_version = '1.7.0'为mmcv_maximum_version = '1.7.2' + - 安装包 + + ``` + pip install -v -e . + ``` +- 安装mmcv + + - 在模型根目录下,克隆mmcv仓,并进入mmcv目录安装 + + ``` + git clone -b 1.x https://github.com/open-mmlab/mmcv + cd mmcv + MMCV_WITH_OPS=1 pip install -e .
-v + + MMCV_WITH_OPS=1 MAX_JOBS=8 FORCE_NPU=1 python setup.py build_ext + MMCV_WITH_OPS=1 FORCE_NPU=1 python setup.py develop + ``` +- 安装mxDriving加速库,安装branch_v6.0.0-RC3分支,具体方法参考[原仓](https://gitee.com/ascend/mxDriving)。 +- 在模型根目录下执行以下命令,安装模型对应PyTorch版本需要的依赖。 + + ``` + pip install -r requirement.txt + ``` +- 在当前python环境下执行`pip show pip`,得到三方包安装路径Location,记作location_path,在模型根目录下执行以下命令来替换patch。 + + ``` + bash replace_patch.sh --packages_path=location_path + ``` + +### 准备数据集 + +- 根据原仓**Prepare Dataset**章节准备数据集,数据集目录及结构如下: + +``` +MapTR +├── ckpts/ +│ ├── resnet50-19c8e357.pth +├── data/ +│ ├── can_bus/ +│ ├── nuscenes/ +│ │ ├── lidarseg/ +│ │ ├── maps/ +│ │ ├── panoptic/ +│ │ ├── samples/ +│ │ ├── v1.0-test/ +│ │ ├── v1.0-trainval/ +│ │ ├── nuscenes_infos_temporal_test_mono3d.coco.json +│ │ ├── nuscenes_infos_temporal_train_mono3d.coco.json +│ │ ├── nuscenes_infos_temporal_val_mono3d.coco.json +│ │ ├── nuscenes_map_anns_val.json +│ │ ├── nuscenes_infos_temporal_test.pkl +│ │ ├── nuscenes_infos_temporal_train.pkl +│ │ ├── nuscenes_infos_temporal_val.pkl +├── patch/ +├── projects/ +├── test/ +├── tools/ +``` + +> **说明:** +> nuscenes数据集下的文件,通过运行以下指令生成: +``` +python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes --version v1.0 --canbus ./data +``` + +### 准备预训练权重 + +- 在模型根目录下,执行以下指令下载预训练权重: +``` +mkdir ckpts +cd ckpts +wget https://download.pytorch.org/models/resnet50-19c8e357.pth +``` + +## 快速开始 + +### 训练任务 + +本任务主要提供单机的8卡训练脚本。 + +#### 开始训练 + +1.
在模型根目录下,运行训练脚本。 + + 该模型支持单机8卡训练。 + + - 单机8卡精度训练 + + ``` + bash test/train_8p.sh + ``` + + - 单机8卡性能训练 + + ``` + bash test/train_8p_performance.sh + ``` + +#### 训练结果 + +| 芯片 | 卡数 | global batch size | Precision | epoch | mAP | 性能-单步迭代耗时(ms) | +| ------------- | :--: | :---------------: | :-------: | :---: | :----: | :-------------------: | +| 竞品A | 8p | 32 | fp32 | 24 | 48.7 | 710 | +| Atlas 800T A2 | 8p | 32 | fp32 | 24 | 48.5 | 1100 | + + +# 变更说明 + +2024.11.08:首次发布 + + +# FAQ + +无 \ No newline at end of file diff --git a/model_examples/MapTR/patch/mmcv/_functions.py b/model_examples/MapTR/patch/mmcv/_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..460c42a3f622a0340be407e04a5f1f295491b0d4 --- /dev/null +++ b/model_examples/MapTR/patch/mmcv/_functions.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Union + +import torch +from torch import Tensor +from torch.nn.parallel._functions import _get_stream + + +def scatter(input: Union[List, Tensor], + devices: List, + streams: Optional[List] = None) -> Union[List, Tensor]: + """Scatters tensor across multiple GPUs.""" + if streams is None: + streams = [None] * len(devices) + + if isinstance(input, list): + chunk_size = (len(input) - 1) // len(devices) + 1 + outputs = [ + scatter(input[i], [devices[i // chunk_size]], + [streams[i // chunk_size]]) for i in range(len(input)) + ] + return outputs + elif isinstance(input, Tensor): + output = input.contiguous() + # TODO: copy to a pinned buffer first (if copying from CPU) + stream = streams[0] if output.numel() > 0 else None + if devices != [-1]: + with torch.cuda.device(devices[0]), torch.cuda.stream(stream): + output = output.cuda(devices[0], non_blocking=True) + + return output + else: + raise Exception(f'Unknown type {type(input)}.') + + +def synchronize_stream(output: Union[List, Tensor], devices: List, + streams: List) -> None: + if isinstance(output, list): + chunk_size = len(output) 
// len(devices) + for i in range(len(devices)): + for j in range(chunk_size): + synchronize_stream(output[i * chunk_size + j], [devices[i]], + [streams[i]]) + elif isinstance(output, Tensor): + if output.numel() != 0: + with torch.cuda.device(devices[0]): + main_stream = torch.cuda.current_stream() + main_stream.wait_stream(streams[0]) + output.record_stream(main_stream) + else: + raise Exception(f'Unknown type {type(output)}.') + + +def get_input_device(input: Union[List, Tensor]) -> int: + if isinstance(input, list): + for item in input: + input_device = get_input_device(item) + if input_device != -1: + return input_device + return -1 + elif isinstance(input, Tensor): + return input.get_device() if input.is_cuda else -1 + else: + raise Exception(f'Unknown type {type(input)}.') + + +class Scatter: + + @staticmethod + def forward(target_gpus: List[int], input: Union[List, Tensor]) -> tuple: + input_device = get_input_device(input) + streams = None + if input_device == -1 and target_gpus != [-1]: + # Perform CPU to GPU copies in a background stream + streams = [_get_stream(torch.device("cuda", device)) for device in target_gpus] + + outputs = scatter(input, target_gpus, streams) + # Synchronize with the copy stream + if streams is not None: + synchronize_stream(outputs, target_gpus, streams) + + return tuple(outputs) if isinstance(outputs, list) else (outputs, ) \ No newline at end of file diff --git a/model_examples/MapTR/patch/mmcv/deform_conv.py b/model_examples/MapTR/patch/mmcv/deform_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..87f03a3d216015be70ba4fa096aaca5d5c6ad39a --- /dev/null +++ b/model_examples/MapTR/patch/mmcv/deform_conv.py @@ -0,0 +1,500 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Tuple, Union + +import torch +import torch_npu +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair, _single + +from mmcv.utils import IS_MLU_AVAILABLE, deprecated_api_warning +from ..cnn import CONV_LAYERS +from ..utils import ext_loader, print_log +from .modulated_deform_conv import ModulatedDeformConv2dFunction + +ext_module = ext_loader.load_ext('_ext', [ + 'deform_conv_forward', 'deform_conv_backward_input', + 'deform_conv_backward_parameters' +]) + + +class DeformConv2dFunction(Function): + + @staticmethod + def symbolic(g, + input, + offset, + weight, + stride, + padding, + dilation, + groups, + deform_groups, + bias=False, + im2col_step=32): + return g.op( + 'mmcv::MMCVDeformConv2d', + input, + offset, + weight, + stride_i=stride, + padding_i=padding, + dilation_i=dilation, + groups_i=groups, + deform_groups_i=deform_groups, + bias_i=bias, + im2col_step_i=im2col_step) + + @staticmethod + def _npu_backward(ctx, grad_output): + input_tensor, weight, offset_out, offset_all, sort_index_for_npu_bp = \ + ctx.saved_tensors + grad_input, grad_weight, grad_offset_all, grad_bias = \ + torch_npu.npu_deformable_conv2dbk( #3 torch.npu_deformable_conv2dbk + input_tensor, grad_output, offset_out, weight, offset_all, + kernel_size=[weight.shape[3], weight.shape[2]], + stride=[1, 1, ctx.stride[0], ctx.stride[1]], + padding=[ctx.padding[0], ctx.padding[0], ctx.padding[1], + ctx.padding[1]], + dilation=[1, 1, ctx.dilation[0], ctx.dilation[1]], + groups=ctx.groups, deformable_groups=ctx.deform_groups, + modulated=True) + grad_offset = grad_offset_all.index_select(1, sort_index_for_npu_bp) + return grad_input, grad_offset, grad_weight, \ + None, None, None, None, None, None, None + + @staticmethod + def forward(ctx, + input: Tensor, + offset: Tensor, + weight: Tensor, + stride: 
Union[int, Tuple[int, ...]] = 1, + padding: Union[int, Tuple[int, ...]] = 0, + dilation: Union[int, Tuple[int, ...]] = 1, + groups: int = 1, + deform_groups: int = 1, + bias: bool = False, + im2col_step: int = 32) -> Tensor: + if input is not None and input.dim() != 4: + raise ValueError( + f'Expected 4D tensor as input, got {input.dim()}D tensor \ + instead.') + assert bias is False, 'Only support bias is False.' + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.groups = groups + ctx.deform_groups = deform_groups + ctx.im2col_step = im2col_step + ctx.device = input.device.type + + # When pytorch version >= 1.6.0, amp is adopted for fp16 mode; + # amp won't cast the type of model (float32), but "offset" is cast + # to float16 by nn.Conv2d automatically, leading to the type + # mismatch with input (when it is float32) or weight. + # The flag for whether to use fp16 or amp is the type of "offset", + # we cast weight and input to temporarily support fp16 and amp + # whatever the pytorch version is. 
+ input = input.type_as(offset) + weight = weight.type_as(input) + if ctx.device == 'npu': + mask_shape, _ = torch.chunk(offset, 2, dim=1) + mask = torch.ones_like(mask_shape).to(input.device) + bias = input.new_empty(0) + output = ModulatedDeformConv2dFunction._npu_forward( + ctx, input, offset, mask, weight, bias) + return output + ctx.save_for_backward(input, offset, weight) + + output = input.new_empty( + DeformConv2dFunction._output_size(ctx, input, weight)) + + ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones + + cur_im2col_step = min(ctx.im2col_step, input.size(0)) + assert (input.size(0) % cur_im2col_step + ) == 0, 'batch size must be divisible by im2col_step' + ext_module.deform_conv_forward( + input, + weight, + offset, + output, + ctx.bufs_[0], + ctx.bufs_[1], + kW=weight.size(3), + kH=weight.size(2), + dW=ctx.stride[1], + dH=ctx.stride[0], + padW=ctx.padding[1], + padH=ctx.padding[0], + dilationW=ctx.dilation[1], + dilationH=ctx.dilation[0], + group=ctx.groups, + deformable_group=ctx.deform_groups, + im2col_step=cur_im2col_step) + return output + + @staticmethod + @once_differentiable + def backward( + ctx, grad_output: Tensor + ) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor], None, + None, None, None, None, None, None]: + if ctx.device == 'npu': + return DeformConv2dFunction._npu_backward(ctx, grad_output) + input, offset, weight = ctx.saved_tensors + + grad_input = grad_offset = grad_weight = None + + cur_im2col_step = min(ctx.im2col_step, input.size(0)) + assert (input.size(0) % cur_im2col_step + ) == 0, 'batch size must be divisible by im2col_step' + + grad_output = grad_output.contiguous() + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + ext_module.deform_conv_backward_input( + input, + offset, + grad_output, + grad_input, + grad_offset, + weight, + ctx.bufs_[0], + kW=weight.size(3), + kH=weight.size(2), + dW=ctx.stride[1], 
+ dH=ctx.stride[0], + padW=ctx.padding[1], + padH=ctx.padding[0], + dilationW=ctx.dilation[1], + dilationH=ctx.dilation[0], + group=ctx.groups, + deformable_group=ctx.deform_groups, + im2col_step=cur_im2col_step) + + if ctx.needs_input_grad[2]: + grad_weight = torch.zeros_like(weight) + ext_module.deform_conv_backward_parameters( + input, + offset, + grad_output, + grad_weight, + ctx.bufs_[0], + ctx.bufs_[1], + kW=weight.size(3), + kH=weight.size(2), + dW=ctx.stride[1], + dH=ctx.stride[0], + padW=ctx.padding[1], + padH=ctx.padding[0], + dilationW=ctx.dilation[1], + dilationH=ctx.dilation[0], + group=ctx.groups, + deformable_group=ctx.deform_groups, + scale=1, + im2col_step=cur_im2col_step) + + return grad_input, grad_offset, grad_weight, \ + None, None, None, None, None, None, None + + @staticmethod + def _output_size(ctx, input, weight): + channels = weight.size(0) + output_size = (input.size(0), channels) + for d in range(input.dim() - 2): + in_size = input.size(d + 2) + pad = ctx.padding[d] + kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1 + stride_ = ctx.stride[d] + output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) + if not all(map(lambda s: s > 0, output_size)): + raise ValueError( + 'convolution input is too small (output would be ' + + 'x'.join(map(str, output_size)) + ')') + return output_size + + +deform_conv2d = DeformConv2dFunction.apply + + +class DeformConv2d(nn.Module): + r"""Deformable 2D convolution. + + Applies a deformable 2D convolution over an input signal composed of + several input planes. DeformConv2d was described in the paper + `Deformable Convolutional Networks + `_ + + Note: + The argument ``im2col_step`` was added in version 1.3.17, which means + number of samples processed by the ``im2col_cuda_kernel`` per call. + It enables users to define ``batch_size`` and ``im2col_step`` more + flexibly and solved `issue mmcv#1440 + `_. + + Args: + in_channels (int): Number of channels in the input image. 
+ out_channels (int): Number of channels produced by the convolution. + kernel_size(int, tuple): Size of the convolving kernel. + stride(int, tuple): Stride of the convolution. Default: 1. + padding (int or tuple): Zero-padding added to both sides of the input. + Default: 0. + dilation (int or tuple): Spacing between kernel elements. Default: 1. + groups (int): Number of blocked connections from input. + channels to output channels. Default: 1. + deform_groups (int): Number of deformable group partitions. + bias (bool): If True, adds a learnable bias to the output. + Default: False. + im2col_step (int): Number of samples processed by im2col_cuda_kernel + per call. It will work when ``batch_size`` > ``im2col_step``, but + ``batch_size`` must be divisible by ``im2col_step``. Default: 32. + `New in version 1.3.17.` + """ + + @deprecated_api_warning({'deformable_groups': 'deform_groups'}, + cls_name='DeformConv2d') + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, ...]], + stride: Union[int, Tuple[int, ...]] = 1, + padding: Union[int, Tuple[int, ...]] = 0, + dilation: Union[int, Tuple[int, ...]] = 1, + groups: int = 1, + deform_groups: int = 1, + bias: bool = False, + im2col_step: int = 32) -> None: + super().__init__() + + assert not bias, \ + f'bias={bias} is not supported in DeformConv2d.' 
+ assert in_channels % groups == 0, \ + f'in_channels {in_channels} cannot be divisible by groups {groups}' + assert out_channels % groups == 0, \ + f'out_channels {out_channels} cannot be divisible by groups \ + {groups}' + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deform_groups = deform_groups + self.im2col_step = im2col_step + # enable compatibility with nn.Conv2d + self.transposed = False + self.output_padding = _single(0) + + # only weight, no bias + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // self.groups, + *self.kernel_size)) + + self.reset_parameters() + + def reset_parameters(self): + # switch the initialization of `self.weight` to the standard kaiming + # method described in `Delving deep into rectifiers: Surpassing + # human-level performance on ImageNet classification` - He, K. et al. + # (2015), using a uniform distribution + nn.init.kaiming_uniform_(self.weight, nonlinearity='relu') + + def forward(self, x: Tensor, offset: Tensor) -> Tensor: + """Deformable Convolutional forward function. + + Args: + x (Tensor): Input feature, shape (B, C_in, H_in, W_in) + offset (Tensor): Offset for deformable convolution, shape + (B, deform_groups*kernel_size[0]*kernel_size[1]*2, + H_out, W_out), H_out, W_out are equal to the output's. + + An offset is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`. + The spatial arrangement is like: + + .. code:: text + + (x0, y0) (x1, y1) (x2, y2) + (x3, y3) (x4, y4) (x5, y5) + (x6, y6) (x7, y7) (x8, y8) + + Returns: + Tensor: Output of the layer. 
+ """ + # To fix an assert error in deform_conv_cuda.cpp:128 + # input image is smaller than kernel + input_pad = (x.size(2) < self.kernel_size[0]) or (x.size(3) < + self.kernel_size[1]) + if input_pad: + pad_h = max(self.kernel_size[0] - x.size(2), 0) + pad_w = max(self.kernel_size[1] - x.size(3), 0) + x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous() + offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant', 0) + offset = offset.contiguous() + out = deform_conv2d(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deform_groups, + False, self.im2col_step) + if input_pad: + out = out[:, :, :out.size(2) - pad_h, :out.size(3) - + pad_w].contiguous() + return out + + def __repr__(self): + s = self.__class__.__name__ + s += f'(in_channels={self.in_channels},\n' + s += f'out_channels={self.out_channels},\n' + s += f'kernel_size={self.kernel_size},\n' + s += f'stride={self.stride},\n' + s += f'padding={self.padding},\n' + s += f'dilation={self.dilation},\n' + s += f'groups={self.groups},\n' + s += f'deform_groups={self.deform_groups},\n' + # bias is not supported in DeformConv2d. + s += 'bias=False)' + return s + + +@CONV_LAYERS.register_module('DCN') +class DeformConv2dPack(DeformConv2d): + """A Deformable Conv Encapsulation that acts as normal Conv layers. + + The offset tensor is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`. + The spatial arrangement is like: + + .. code:: text + + (x0, y0) (x1, y1) (x2, y2) + (x3, y3) (x4, y4) (x5, y5) + (x6, y6) (x7, y7) (x8, y8) + + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int or tuple[int]): Same as nn.Conv2d. + stride (int or tuple[int]): Same as nn.Conv2d. + padding (int or tuple[int]): Same as nn.Conv2d. + dilation (int or tuple[int]): Same as nn.Conv2d. + groups (int): Same as nn.Conv2d. + bias (bool or str): If specified as `auto`, it will be decided by the + norm_cfg. 
Bias will be set as True if norm_cfg is None, otherwise + False. + """ + + _version = 2 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.conv_offset = nn.Conv2d( + self.in_channels, + self.deform_groups * 2 * self.kernel_size[0] * self.kernel_size[1], + kernel_size=self.kernel_size, + stride=_pair(self.stride), + padding=_pair(self.padding), + dilation=_pair(self.dilation), + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset.weight.data.zero_() + self.conv_offset.bias.data.zero_() + + def forward(self, x: Tensor) -> Tensor: # type: ignore + offset = self.conv_offset(x) + return deform_conv2d(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deform_groups, + False, self.im2col_step) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + version = local_metadata.get('version', None) + + if version is None or version < 2: + # the key is different in early versions + # In version < 2, DeformConvPack loads previous benchmark models. 
+ if (prefix + 'conv_offset.weight' not in state_dict + and prefix[:-1] + '_offset.weight' in state_dict): + state_dict[prefix + 'conv_offset.weight'] = state_dict.pop( + prefix[:-1] + '_offset.weight') + if (prefix + 'conv_offset.bias' not in state_dict + and prefix[:-1] + '_offset.bias' in state_dict): + state_dict[prefix + + 'conv_offset.bias'] = state_dict.pop(prefix[:-1] + + '_offset.bias') + + if version is not None and version > 1: + print_log( + f'DeformConv2dPack {prefix.rstrip(".")} is upgraded to ' + 'version 2.', + logger='root') + + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) + + +if IS_MLU_AVAILABLE: + import torchvision + from torchvision.ops import deform_conv2d as tv_deform_conv2d + + from mmcv.utils import digit_version + + @CONV_LAYERS.register_module('DCN', force=True) + class DeformConv2dPack_MLU(DeformConv2d): + """This class is the DCN implementation of the MLU device. The MLU + backend support of the operator has been implemented in torchvision. + The mmcv registration mechanism is used for multiplexing here. The + torchvision implementation of DCN is called. + + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int or tuple[int]): Same as nn.Conv2d. + stride (int): Same as nn.Conv2d, while tuple is not supported. + padding (int): Same as nn.Conv2d, while tuple is not supported. + dilation (int): Same as nn.Conv2d, while tuple is not supported. + groups (int): Same as nn.Conv2d. + bias (bool or str): If specified as `auto`, it will be decided by + the norm_cfg. Bias will be set as True if norm_cfg is None, + otherwise False. + im2col_step (int): Number of samples processed by + im2col_cuda_kernel per call. It will work when ``batch_size`` + > ``im2col_step``, but ``batch_size`` must be divisible by + ``im2col_step``. Default: 32. `New in version 1.7.2. 
+ Currently not supported on MLU devices.` + """ + + def __init__(self, *args, **kwargs): + assert digit_version(torchvision.__version__) >= digit_version( + '0.10.0a0'), 'the version of torchvision should be >= 0.10.0' + super().__init__(*args, **kwargs) + + self.conv_offset = nn.Conv2d( + self.in_channels, + self.deform_groups * 2 * self.kernel_size[0] * + self.kernel_size[1], + kernel_size=self.kernel_size, + stride=_pair(self.stride), + padding=_pair(self.padding), + dilation=_pair(self.dilation), + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset.weight.data.zero_() + self.conv_offset.bias.data.zero_() + + def forward(self, x: Tensor) -> Tensor: # type: ignore + cur_im2col_step = min(self.im2col_step, x.size(0)) + assert (x.size(0) % cur_im2col_step + ) == 0, 'batch size must be divisible by im2col_step' + offset = self.conv_offset(x) + x = x.type_as(offset) + weight = self.weight.type_as(x) + return tv_deform_conv2d(x, offset, weight, None, self.stride, + self.padding, self.dilation) diff --git a/model_examples/MapTR/patch/mmcv/distributed.py b/model_examples/MapTR/patch/mmcv/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..f0dfecc9c2e2cfbc8e7173c31fcce169e3e61388 --- /dev/null +++ b/model_examples/MapTR/patch/mmcv/distributed.py @@ -0,0 +1,166 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, List, Tuple + +import torch +from torch.nn.parallel.distributed import (DistributedDataParallel, + _find_tensors) + +from mmcv import print_log +from mmcv.utils import TORCH_VERSION, digit_version +from .scatter_gather import ScatterInputs, scatter_kwargs + + +class MMDistributedDataParallel(DistributedDataParallel): + """The DDP module that supports DataContainer. + + MMDDP has two main differences with PyTorch DDP: + + - It supports a custom type :class:`DataContainer` which allows more + flexible control of input data. 
+ - It implement two APIs ``train_step()`` and ``val_step()``. + """ + + def to_kwargs(self, inputs: ScatterInputs, kwargs: ScatterInputs, + device_id: int) -> Tuple[tuple, tuple]: + # Use `self.to_kwargs` instead of `self.scatter` in pytorch1.8 + # to move all tensors to device_id + return scatter_kwargs(inputs, kwargs, [device_id], dim=self.dim) + + def scatter(self, inputs: ScatterInputs, kwargs: ScatterInputs, + device_ids: List[int]) -> Tuple[tuple, tuple]: + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) + + def train_step(self, *inputs, **kwargs): + """train_step() API for module wrapped by DistributedDataParallel. + + This method is basically the same as + ``DistributedDataParallel.forward()``, while replacing + ``self.module.forward()`` with ``self.module.train_step()``. + It is compatible with PyTorch 1.1 - 1.5. + """ + + # In PyTorch >= 1.7, ``reducer._rebuild_buckets()`` is moved from the + # end of backward to the beginning of forward. + if ('parrots' not in TORCH_VERSION + and digit_version(TORCH_VERSION) >= digit_version('1.7') + and self.reducer._rebuild_buckets()): + print_log( + 'Reducer buckets have been rebuilt in this iteration.', + logger='mmcv') + + if ('parrots' not in TORCH_VERSION + and digit_version(TORCH_VERSION) >= digit_version('1.11.0a0')): + if self._check_sync_bufs_pre_fwd(): + self._sync_buffers() + else: + if (getattr(self, 'require_forward_param_sync', False) + and self.require_forward_param_sync): + self._sync_params() + + if self.device_ids: + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) + if len(self.device_ids) == 1: + output = self.module.train_step(*inputs[0], **kwargs[0]) + else: + outputs = self.parallel_apply( + self._module_copies[:len(inputs)], inputs, kwargs) + output = self.gather(outputs, self.output_device) + else: + output = self.module.train_step(*inputs, **kwargs) + + if ('parrots' not in TORCH_VERSION + and digit_version(TORCH_VERSION) >= digit_version('1.11.0a0')): + if 
self._check_sync_bufs_post_fwd(): + self._sync_buffers() + + if (torch.is_grad_enabled() + and getattr(self, 'require_backward_grad_sync', False) + and self.require_backward_grad_sync): + if self.find_unused_parameters: + self.reducer.prepare_for_backward(list(_find_tensors(output))) + else: + self.reducer.prepare_for_backward([]) + else: + if ('parrots' not in TORCH_VERSION + and digit_version(TORCH_VERSION) > digit_version('1.2')): + self.require_forward_param_sync = False + return output + + def val_step(self, *inputs, **kwargs): + """val_step() API for module wrapped by DistributedDataParallel. + + This method is basically the same as + ``DistributedDataParallel.forward()``, while replacing + ``self.module.forward()`` with ``self.module.val_step()``. + It is compatible with PyTorch 1.1 - 1.5. + """ + # In PyTorch >= 1.7, ``reducer._rebuild_buckets()`` is moved from the + # end of backward to the beginning of forward. + if ('parrots' not in TORCH_VERSION + and digit_version(TORCH_VERSION) >= digit_version('1.7') + and self.reducer._rebuild_buckets()): + print_log( + 'Reducer buckets have been rebuilt in this iteration.', + logger='mmcv') + + if ('parrots' not in TORCH_VERSION + and digit_version(TORCH_VERSION) >= digit_version('1.11.0a0')): + if self._check_sync_bufs_pre_fwd(): + self._sync_buffers() + else: + if (getattr(self, 'require_forward_param_sync', False) + and self.require_forward_param_sync): + self._sync_params() + + if self.device_ids: + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) + if len(self.device_ids) == 1: + output = self.module.val_step(*inputs[0], **kwargs[0]) + else: + outputs = self.parallel_apply( + self._module_copies[:len(inputs)], inputs, kwargs) + output = self.gather(outputs, self.output_device) + else: + output = self.module.val_step(*inputs, **kwargs) + + if ('parrots' not in TORCH_VERSION + and digit_version(TORCH_VERSION) >= digit_version('1.11.0a0')): + if self._check_sync_bufs_post_fwd(): + 
self._sync_buffers() + + if (torch.is_grad_enabled() + and getattr(self, 'require_backward_grad_sync', False) + and self.require_backward_grad_sync): + if self.find_unused_parameters: + self.reducer.prepare_for_backward(list(_find_tensors(output))) + else: + self.reducer.prepare_for_backward([]) + else: + if ('parrots' not in TORCH_VERSION + and digit_version(TORCH_VERSION) > digit_version('1.2')): + self.require_forward_param_sync = False + return output + + def _run_ddp_forward(self, *inputs, **kwargs) -> Any: + """Processes inputs and runs ``self.module.forward``. + + Pytorch 1.12.0 performs ``self.module.forward`` in ``_run_ddp_forward`` + and deprecates using ``DistributedDataParallel.to_kwargs`` to + process inputs, which leads to inputs cannot be processed by + :meth:`MMDistributedDataParallel.to_kwargs` anymore. Therefore, + ``MMDistributedDataParallel`` overrides this method to call + :meth:`to_kwargs` explicitly. + + See more information in ``_. # noqa: E501 + + Returns: + Any: Forward result of :attr:`module`. 
+ """ + module_to_run = self.module + + if self.device_ids: + inputs, kwargs = self.to_kwargs( # type: ignore + inputs, kwargs, self.device_ids[0]) + return module_to_run(*inputs[0], **kwargs[0]) # type: ignore + else: + return module_to_run(*inputs, **kwargs) diff --git a/model_examples/MapTR/patch/mmdet/__init__.py b/model_examples/MapTR/patch/mmdet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2d0ed2efe47b5b370f6b664936052ab663fd6449 --- /dev/null +++ b/model_examples/MapTR/patch/mmdet/__init__.py @@ -0,0 +1,28 @@ +import mmcv + +from .version import __version__, short_version + + +def digit_version(version_str): + digit_version = [] + for x in version_str.split('.'): + if x.isdigit(): + digit_version.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + digit_version.append(int(patch_version[0]) - 1) + digit_version.append(int(patch_version[1])) + return digit_version + + +mmcv_minimum_version = '1.3.8' +mmcv_maximum_version = '1.7.2' +mmcv_version = digit_version(mmcv.__version__) + + +assert (mmcv_version >= digit_version(mmcv_minimum_version) + and mmcv_version <= digit_version(mmcv_maximum_version)), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.' 
+ +__all__ = ['__version__', 'short_version'] diff --git a/model_examples/MapTR/patch/mmdet/resnet.py b/model_examples/MapTR/patch/mmdet/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..8e2da57a4786bef50afcf257b1aeea91cf3448c3 --- /dev/null +++ b/model_examples/MapTR/patch/mmdet/resnet.py @@ -0,0 +1,675 @@ +import warnings + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer, build_plugin_layer +from mmcv.runner import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from ..utils import ResLayer + +from mx_driving.fused import npu_add_relu, npu_max_pool2d #3 + +class BasicBlock(BaseModule): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + init_cfg=None): + super(BasicBlock, self).__init__(init_cfg) + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' 
+ + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, planes, planes, 3, padding=1, bias=False) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + # out += identity #3 + out = npu_add_relu(out, identity) #3 + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + # out = self.relu(out) #3 + + return out + + +class Bottleneck(BaseModule): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + init_cfg=None): + """Bottleneck block for ResNet. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. 
+ """ + super(Bottleneck, self).__init__(init_cfg) + assert style in ['pytorch', 'caffe'] + assert dcn is None or isinstance(dcn, dict) + assert plugins is None or isinstance(plugins, list) + if plugins is not None: + allowed_position = ['after_conv1', 'after_conv2', 'after_conv3'] + assert all(p['position'] in allowed_position for p in plugins) + + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.dcn = dcn + self.with_dcn = dcn is not None + self.plugins = plugins + self.with_plugins = plugins is not None + + if self.with_plugins: + # collect plugins for conv1/conv2/conv3 + self.after_conv1_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv1' + ] + self.after_conv2_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv2' + ] + self.after_conv3_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv3' + ] + + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + conv_cfg, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + else: + assert self.conv_cfg is None, 
'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + dcn, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + planes, + planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + if self.with_plugins: + self.after_conv1_plugin_names = self.make_block_plugins( + planes, self.after_conv1_plugins) + self.after_conv2_plugin_names = self.make_block_plugins( + planes, self.after_conv2_plugins) + self.after_conv3_plugin_names = self.make_block_plugins( + planes * self.expansion, self.after_conv3_plugins) + + def make_block_plugins(self, in_channels, plugins): + """make plugins for block. + + Args: + in_channels (int): Input channels of plugin. + plugins (list[dict]): List of plugins cfg to build. + + Returns: + list[str]: List of the names of plugin. 
+ """ + assert isinstance(plugins, list) + plugin_names = [] + for plugin in plugins: + plugin = plugin.copy() + name, layer = build_plugin_layer( + plugin, + in_channels=in_channels, + postfix=plugin.pop('postfix', '')) + assert not hasattr(self, name), f'duplicate plugin {name}' + self.add_module(name, layer) + plugin_names.append(name) + return plugin_names + + def forward_plugin(self, x, plugin_names): + out = x + for name in plugin_names: + out = getattr(self, name)(x) + return out + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + @property + def norm3(self): + """nn.Module: normalization layer after the third convolution layer""" + return getattr(self, self.norm3_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + # out += identity #3 + out = npu_add_relu(out, identity) #3 + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + # out = self.relu(out) #3 + + return out + + +@BACKBONES.register_module() +class ResNet(BaseModule): + """ResNet backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. 
+ stem_channels (int | None): Number of stem channels. If not specified, + it will be the same as `base_channels`. Default: None. + base_channels (int): Number of base channels of res layer. Default: 64. + in_channels (int): Number of input image channels. Default: 3. + num_stages (int): Resnet stages. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - position (str, required): Position inside block to insert + plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + + Example: + >>> from mmdet.models import ResNet + >>> import torch + >>> self = ResNet(depth=18) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 64, 8, 8) + (1, 128, 4, 4) + (1, 256, 2, 2) + (1, 512, 1, 1) + """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + in_channels=3, + stem_channels=None, + base_channels=64, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + dcn=None, + stage_with_dcn=(False, False, False, False), + plugins=None, + with_cp=False, + zero_init_residual=True, + pretrained=None, + init_cfg=None): + super(ResNet, self).__init__(init_cfg) + self.zero_init_residual = zero_init_residual + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + + block_init_cfg = None + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + block = self.arch_settings[depth][0] + if self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', + val=0, + override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + 
type='Constant', + val=0, + override=dict(name='norm3')) + else: + raise TypeError('pretrained must be a str or None') + + self.depth = depth + if stem_channels is None: + stem_channels = base_channels + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert num_stages >= 1 and num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.dcn = dcn + self.stage_with_dcn = stage_with_dcn + if dcn is not None: + assert len(stage_with_dcn) == num_stages + self.plugins = plugins + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.inplanes = stem_channels + + self._make_stem_layer(in_channels, stem_channels) + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + dcn = self.dcn if self.stage_with_dcn[i] else None + if plugins is not None: + stage_plugins = self.make_stage_plugins(plugins, i) + else: + stage_plugins = None + planes = base_channels * 2**i + res_layer = self.make_res_layer( + block=self.block, + inplanes=self.inplanes, + planes=planes, + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + dcn=dcn, + plugins=stage_plugins, + init_cfg=block_init_cfg) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = self.block.expansion * base_channels * 2**( + 
len(self.stage_blocks) - 1) + + def make_stage_plugins(self, plugins, stage_idx): + """Make plugins for ResNet ``stage_idx`` th stage. + + Currently we support to insert ``context_block``, + ``empirical_attention_block``, ``nonlocal_block`` into the backbone + like ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of + Bottleneck. + + An example of plugins format could be: + + Examples: + >>> plugins=[ + ... dict(cfg=dict(type='xxx', arg1='xxx'), + ... stages=(False, True, True, True), + ... position='after_conv2'), + ... dict(cfg=dict(type='yyy'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='1'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='2'), + ... stages=(True, True, True, True), + ... position='after_conv3') + ... ] + >>> self = ResNet(depth=18) + >>> stage_plugins = self.make_stage_plugins(plugins, 0) + >>> assert len(stage_plugins) == 3 + + Suppose ``stage_idx=0``, the structure of blocks in the stage would be: + + .. code-block:: none + + conv1-> conv2->conv3->yyy->zzz1->zzz2 + + Suppose 'stage_idx=1', the structure of blocks in the stage would be: + + .. code-block:: none + + conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2 + + If stages is missing, the plugin would be applied to all stages. + + Args: + plugins (list[dict]): List of plugins cfg to build. The postfix is + required if multiple same type plugins are inserted. 
+ stage_idx (int): Index of stage to build + + Returns: + list[dict]: Plugins for current stage + """ + stage_plugins = [] + for plugin in plugins: + plugin = plugin.copy() + stages = plugin.pop('stages', None) + assert stages is None or len(stages) == self.num_stages + # whether to insert plugin into current stage + if stages is None or stages[stage_idx]: + stage_plugins.append(plugin) + + return stage_plugins + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``.""" + return ResLayer(**kwargs) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def _make_stem_layer(self, in_channels, stem_channels): + if self.deep_stem: + self.stem = nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels // 2, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels // 2)[1], + nn.ReLU(inplace=True), + build_conv_layer( + self.conv_cfg, + stem_channels // 2, + stem_channels // 2, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels // 2)[1], + nn.ReLU(inplace=True), + build_conv_layer( + self.conv_cfg, + stem_channels // 2, + stem_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels)[1], + nn.ReLU(inplace=True)) + else: + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + if self.deep_stem: + self.stem.eval() + for param in self.stem.parameters(): + param.requires_grad = False + else: + 
self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def forward(self, x): + """Forward function.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + # x = self.maxpool(x) #3 + x = npu_max_pool2d(x, 3, 2, 1) #3 + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + freezed.""" + super(ResNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + +@BACKBONES.register_module() +class ResNetV1d(ResNet): + r"""ResNetV1d variant described in `Bag of Tricks + `_. + + Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in + the input stem with three 3x3 convs. And in the downsampling block, a 2x2 + avg_pool with stride 2 is added before conv, whose stride is changed to 1. + """ + + def __init__(self, **kwargs): + super(ResNetV1d, self).__init__( + deep_stem=True, avg_down=True, **kwargs) diff --git a/model_examples/MapTR/patch/mmdet3d/__init__.py b/model_examples/MapTR/patch/mmdet3d/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..870d7db0544318c6e47a511d914e0795a7554f5d --- /dev/null +++ b/model_examples/MapTR/patch/mmdet3d/__init__.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
import mmcv

import mmdet
import mmseg
from .version import __version__, short_version


def digit_version(version_str):
    """Convert a dotted version string into a list of ints for comparison.

    Purely numeric components are converted with ``int``. A component that
    contains ``rc`` (e.g. ``0rc1``) contributes two entries: the number in
    front of ``rc`` minus one, followed by the number after ``rc``; this
    makes ``'1.0.0rc1'`` compare lower than ``'1.0.0'``. Components that are
    neither numeric nor contain ``rc`` are skipped.

    Args:
        version_str (str): Version string such as ``'1.5.2'`` or
            ``'1.0.0rc1'``.

    Returns:
        list[int]: Numeric components in order of appearance.
    """
    parts = []
    for token in version_str.split('.'):
        if token.isdigit():
            parts.append(int(token))
            continue
        if 'rc' not in token:
            # Not a plain number and not a release candidate: ignored.
            continue
        pieces = token.split('rc')
        parts.extend((int(pieces[0]) - 1, int(pieces[1])))
    return parts


# Supported mmcv range; both bounds are inclusive.
mmcv_minimum_version = '1.5.2'
mmcv_maximum_version = '1.7.2'
mmcv_version = digit_version(mmcv.__version__)

assert (digit_version(mmcv_minimum_version) <= mmcv_version
        <= digit_version(mmcv_maximum_version)), (
    f'MMCV=={mmcv.__version__} is used but incompatible. '
    f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.')

# Supported mmdet range; both bounds are inclusive.
mmdet_minimum_version = '2.14.0'
mmdet_maximum_version = '3.0.0'
mmdet_version = digit_version(mmdet.__version__)

assert (digit_version(mmdet_minimum_version) <= mmdet_version
        <= digit_version(mmdet_maximum_version)), (
    f'MMDET=={mmdet.__version__} is used but incompatible. '
    f'Please install mmdet>={mmdet_minimum_version}, '
    f'<={mmdet_maximum_version}.')

# Supported mmseg range; both bounds are inclusive.
mmseg_minimum_version = '0.14.1'
mmseg_maximum_version = '1.0.0'
mmseg_version = digit_version(mmseg.__version__)

assert (digit_version(mmseg_minimum_version) <= mmseg_version
        <= digit_version(mmseg_maximum_version)), (
    f'MMSEG=={mmseg.__version__} is used but incompatible. '
    f'Please install mmseg>={mmseg_minimum_version}, '
    f'<={mmseg_maximum_version}.')
+ +__all__ = ['__version__', 'short_version'] diff --git a/model_examples/MapTR/patch/mmseg/__init__.py b/model_examples/MapTR/patch/mmseg/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ed005aab6048cc8aad4ffdf7df0be4797253b6d --- /dev/null +++ b/model_examples/MapTR/patch/mmseg/__init__.py @@ -0,0 +1,30 @@ +import mmcv + +from .version import __version__, version_info + +MMCV_MIN = '1.3.1' +MMCV_MAX = '1.7.2' + + +def digit_version(version_str): + digit_version = [] + for x in version_str.split('.'): + if x.isdigit(): + digit_version.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + digit_version.append(int(patch_version[0]) - 1) + digit_version.append(int(patch_version[1])) + return digit_version + + +mmcv_min_version = digit_version(MMCV_MIN) +mmcv_max_version = digit_version(MMCV_MAX) +mmcv_version = digit_version(mmcv.__version__) + + +assert (mmcv_min_version <= mmcv_version <= mmcv_max_version), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_min_version}, <={mmcv_max_version}.' + +__all__ = ['__version__', 'version_info'] diff --git a/model_examples/MapTR/patch/nuscenes/data_classes.py b/model_examples/MapTR/patch/nuscenes/data_classes.py new file mode 100644 index 0000000000000000000000000000000000000000..b95d6ce2738dca7ea37c99b08c48eb289e3c9844 --- /dev/null +++ b/model_examples/MapTR/patch/nuscenes/data_classes.py @@ -0,0 +1,425 @@ +# nuScenes dev-kit. +# Code written by Oscar Beijbom, 2019. + +from collections import defaultdict +from typing import List, Dict, Tuple + +import numpy as np + +from nuscenes.eval.common.data_classes import MetricData, EvalBox +from nuscenes.eval.common.utils import center_distance +from nuscenes.eval.detection.constants import DETECTION_NAMES, ATTRIBUTE_NAMES, TP_METRICS + + +class DetectionConfig: + """ Data class that specifies the detection evaluation settings. 
""" + + def __init__(self, + class_range: Dict[str, int], + dist_fcn: str, + dist_ths: List[float], + dist_th_tp: float, + min_recall: float, + min_precision: float, + max_boxes_per_sample: int, + mean_ap_weight: int): + + assert set(class_range.keys()) == set(DETECTION_NAMES), "Class count mismatch." + assert dist_th_tp in dist_ths, "dist_th_tp must be in set of dist_ths." + + self.class_range = class_range + self.dist_fcn = dist_fcn + self.dist_ths = dist_ths + self.dist_th_tp = dist_th_tp + self.min_recall = min_recall + self.min_precision = min_precision + self.max_boxes_per_sample = max_boxes_per_sample + self.mean_ap_weight = mean_ap_weight + + self.class_names = self.class_range.keys() + + def __eq__(self, other): + eq = True + for key in self.serialize().keys(): + eq = eq and np.array_equal(getattr(self, key), getattr(other, key)) + return eq + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'class_range': self.class_range, + 'dist_fcn': self.dist_fcn, + 'dist_ths': self.dist_ths, + 'dist_th_tp': self.dist_th_tp, + 'min_recall': self.min_recall, + 'min_precision': self.min_precision, + 'max_boxes_per_sample': self.max_boxes_per_sample, + 'mean_ap_weight': self.mean_ap_weight + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized dictionary. """ + return cls(content['class_range'], + content['dist_fcn'], + content['dist_ths'], + content['dist_th_tp'], + content['min_recall'], + content['min_precision'], + content['max_boxes_per_sample'], + content['mean_ap_weight']) + + @property + def dist_fcn_callable(self): + """ Return the distance function corresponding to the dist_fcn string. """ + if self.dist_fcn == 'center_distance': + return center_distance + else: + raise Exception('Error: Unknown distance function %s!' 
% self.dist_fcn) + + +class DetectionMetricData(MetricData): + """ This class holds accumulated and interpolated data required to calculate the detection metrics. """ + + nelem = 101 + + def __init__(self, + recall: np.array, + precision: np.array, + confidence: np.array, + trans_err: np.array, + vel_err: np.array, + scale_err: np.array, + orient_err: np.array, + attr_err: np.array): + + # Assert lengths. + assert len(recall) == self.nelem + assert len(precision) == self.nelem + assert len(confidence) == self.nelem + assert len(trans_err) == self.nelem + assert len(vel_err) == self.nelem + assert len(scale_err) == self.nelem + assert len(orient_err) == self.nelem + assert len(attr_err) == self.nelem + + # Assert ordering. + assert all(confidence == sorted(confidence, reverse=True)) # Confidences should be descending. + assert all(recall == sorted(recall)) # Recalls should be ascending. + + # Set attributes explicitly to help IDEs figure out what is going on. + self.recall = recall + self.precision = precision + self.confidence = confidence + self.trans_err = trans_err + self.vel_err = vel_err + self.scale_err = scale_err + self.orient_err = orient_err + self.attr_err = attr_err + + def __eq__(self, other): + eq = True + for key in self.serialize().keys(): + eq = eq and np.array_equal(getattr(self, key), getattr(other, key)) + return eq + + @property + def max_recall_ind(self): + """ Returns index of max recall achieved. """ + + # Last instance of confidence > 0 is index of max achieved recall. + non_zero = np.nonzero(self.confidence)[0] + if len(non_zero) == 0: # If there are no matches, all the confidence values will be zero. + max_recall_ind = 0 + else: + max_recall_ind = non_zero[-1] + + return max_recall_ind + + @property + def max_recall(self): + """ Returns max recall achieved. """ + + return self.recall[self.max_recall_ind] + + def serialize(self): + """ Serialize instance into json-friendly format. 
""" + return { + 'recall': self.recall.tolist(), + 'precision': self.precision.tolist(), + 'confidence': self.confidence.tolist(), + 'trans_err': self.trans_err.tolist(), + 'vel_err': self.vel_err.tolist(), + 'scale_err': self.scale_err.tolist(), + 'orient_err': self.orient_err.tolist(), + 'attr_err': self.attr_err.tolist(), + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. """ + return cls(recall=np.array(content['recall']), + precision=np.array(content['precision']), + confidence=np.array(content['confidence']), + trans_err=np.array(content['trans_err']), + vel_err=np.array(content['vel_err']), + scale_err=np.array(content['scale_err']), + orient_err=np.array(content['orient_err']), + attr_err=np.array(content['attr_err'])) + + @classmethod + def no_predictions(cls): + """ Returns a md instance corresponding to having no predictions. """ + return cls(recall=np.linspace(0, 1, cls.nelem), + precision=np.zeros(cls.nelem), + confidence=np.zeros(cls.nelem), + trans_err=np.ones(cls.nelem), + vel_err=np.ones(cls.nelem), + scale_err=np.ones(cls.nelem), + orient_err=np.ones(cls.nelem), + attr_err=np.ones(cls.nelem)) + + @classmethod + def random_md(cls): + """ Returns an md instance corresponding to a random results. """ + return cls(recall=np.linspace(0, 1, cls.nelem), + precision=np.random.random(cls.nelem), + confidence=np.linspace(0, 1, cls.nelem)[::-1], + trans_err=np.random.random(cls.nelem), + vel_err=np.random.random(cls.nelem), + scale_err=np.random.random(cls.nelem), + orient_err=np.random.random(cls.nelem), + attr_err=np.random.random(cls.nelem)) + + +class DetectionMetrics: + """ Stores average precision and true positive metric results. Provides properties to summarize. 
""" + + def __init__(self, cfg: DetectionConfig): + + self.cfg = cfg + self._label_aps = defaultdict(lambda: defaultdict(float)) + self._label_tp_errors = defaultdict(lambda: defaultdict(float)) + self.eval_time = None + + def add_label_ap(self, detection_name: str, dist_th: float, ap: float) -> None: + self._label_aps[detection_name][dist_th] = ap + + def get_label_ap(self, detection_name: str, dist_th: float) -> float: + return self._label_aps[detection_name][dist_th] + + def add_label_tp(self, detection_name: str, metric_name: str, tp: float): + self._label_tp_errors[detection_name][metric_name] = tp + + def get_label_tp(self, detection_name: str, metric_name: str) -> float: + return self._label_tp_errors[detection_name][metric_name] + + def add_runtime(self, eval_time: float) -> None: + self.eval_time = eval_time + + @property + def mean_dist_aps(self) -> Dict[str, float]: + """ Calculates the mean over distance thresholds for each label. """ + return {class_name: np.mean(list(d.values())) for class_name, d in self._label_aps.items()} + + @property + def mean_ap(self) -> float: + """ Calculates the mean AP by averaging over distance thresholds and classes. """ + return float(np.mean(list(self.mean_dist_aps.values()))) + + @property + def tp_errors(self) -> Dict[str, float]: + """ Calculates the mean true positive error across all classes for each metric. """ + errors = {} + for metric_name in TP_METRICS: + class_errors = [] + for detection_name in self.cfg.class_names: + class_errors.append(self.get_label_tp(detection_name, metric_name)) + + errors[metric_name] = float(np.nanmean(class_errors)) + + return errors + + @property + def tp_scores(self) -> Dict[str, float]: + scores = {} + tp_errors = self.tp_errors + for metric_name in TP_METRICS: + + # We convert the true positive errors to "scores" by 1-error. + score = 1.0 - tp_errors[metric_name] + + # Some of the true positive errors are unbounded, so we bound the scores to min 0. 
+ score = max(0.0, score) + + scores[metric_name] = score + + return scores + + @property + def nd_score(self) -> float: + """ + Compute the nuScenes detection score (NDS, weighted sum of the individual scores). + :return: The NDS. + """ + # Summarize. + total = float(self.cfg.mean_ap_weight * self.mean_ap + np.sum(list(self.tp_scores.values()))) + + # Normalize. + total = total / float(self.cfg.mean_ap_weight + len(self.tp_scores.keys())) + + return total + + def serialize(self): + return { + 'label_aps': self._label_aps, + 'mean_dist_aps': self.mean_dist_aps, + 'mean_ap': self.mean_ap, + 'label_tp_errors': self._label_tp_errors, + 'tp_errors': self.tp_errors, + 'tp_scores': self.tp_scores, + 'nd_score': self.nd_score, + 'eval_time': self.eval_time, + 'cfg': self.cfg.serialize() + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized dictionary. """ + + cfg = DetectionConfig.deserialize(content['cfg']) + + metrics = cls(cfg=cfg) + metrics.add_runtime(content['eval_time']) + + for detection_name, label_aps in content['label_aps'].items(): + for dist_th, ap in label_aps.items(): + metrics.add_label_ap(detection_name=detection_name, dist_th=float(dist_th), ap=float(ap)) + + for detection_name, label_tps in content['label_tp_errors'].items(): + for metric_name, tp in label_tps.items(): + metrics.add_label_tp(detection_name=detection_name, metric_name=metric_name, tp=float(tp)) + + return metrics + + def __eq__(self, other): + eq = True + eq = eq and self._label_aps == other._label_aps + eq = eq and self._label_tp_errors == other._label_tp_errors + eq = eq and self.eval_time == other.eval_time + eq = eq and self.cfg == other.cfg + + return eq + + +class DetectionBox(EvalBox): + """ Data class used during detection evaluation. 
Can be a prediction or ground truth.""" + + def __init__(self, + sample_token: str = "", + translation: Tuple[float, float, float] = (0, 0, 0), + size: Tuple[float, float, float] = (0, 0, 0), + rotation: Tuple[float, float, float, float] = (0, 0, 0, 0), + velocity: Tuple[float, float] = (0, 0), + ego_translation: [float, float, float] = (0, 0, 0), # Translation to ego vehicle in meters. + num_pts: int = -1, # Nbr. LIDAR or RADAR inside the box. Only for gt boxes. + detection_name: str = 'car', # The class name used in the detection challenge. + detection_score: float = -1.0, # GT samples do not have a score. + attribute_name: str = ''): # Box attribute. Each box can have at most 1 attribute. + + super().__init__(sample_token, translation, size, rotation, velocity, ego_translation, num_pts) + + assert detection_name is not None, 'Error: detection_name cannot be empty!' + assert detection_name in DETECTION_NAMES, 'Error: Unknown detection_name %s' % detection_name + + assert attribute_name in ATTRIBUTE_NAMES or attribute_name == '', \ + 'Error: Unknown attribute_name %s' % attribute_name + + assert isinstance(detection_score, float), 'Error: detection_score must be a float!' + assert not np.any(np.isnan(detection_score)), 'Error: detection_score may not be NaN!' + + # Assign. + self.detection_name = detection_name + self.detection_score = detection_score + self.attribute_name = attribute_name + + def __eq__(self, other): + return (self.sample_token == other.sample_token and + self.translation == other.translation and + self.size == other.size and + self.rotation == other.rotation and + self.velocity == other.velocity and + self.ego_translation == other.ego_translation and + self.num_pts == other.num_pts and + self.detection_name == other.detection_name and + self.detection_score == other.detection_score and + self.attribute_name == other.attribute_name) + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. 
""" + return { + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. """ + return cls(sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name']) + + +class DetectionMetricDataList: + """ This stores a set of MetricData in a dict indexed by (name, match-distance). """ + + def __init__(self): + self.md = {} + + def __getitem__(self, key): + return self.md[key] + + def __eq__(self, other): + eq = True + for key in self.md.keys(): + eq = eq and self[key] == other[key] + return eq + + def get_class_data(self, detection_name: str) -> List[Tuple[DetectionMetricData, float]]: + """ Get all the MetricData entries for a certain detection_name. """ + return [(md, dist_th) for (name, dist_th), md in self.md.items() if name == detection_name] + + def get_dist_data(self, dist_th: float) -> List[Tuple[DetectionMetricData, str]]: + """ Get all the MetricData entries for a certain match_distance. 
""" + return [(md, detection_name) for (detection_name, dist), md in self.md.items() if dist == dist_th] + + def set(self, detection_name: str, match_distance: float, data: DetectionMetricData): + """ Sets the MetricData entry for a certain detection_name and match_distance. """ + self.md[(detection_name, match_distance)] = data + + def serialize(self) -> dict: + return {key[0] + ':' + str(key[1]): value.serialize() for key, value in self.md.items()} + + @classmethod + def deserialize(cls, content: dict): + mdl = cls() + for key, md in content.items(): + name, distance = key.split(':') + mdl.set(name, float(distance), DetectionMetricData.deserialize(md)) + return mdl diff --git a/model_examples/MapTR/projects/__init__.py b/model_examples/MapTR/projects/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/coco_instance.py b/model_examples/MapTR/projects/configs/_base_/datasets/coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..f6ea4f4562a8118275a444879a884717b55caa15 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/coco_instance.py @@ -0,0 +1,48 @@ +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', 
keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/kitti-3d-3class.py b/model_examples/MapTR/projects/configs/_base_/datasets/kitti-3d-3class.py new file mode 100644 index 0000000000000000000000000000000000000000..1822af4209432eb45e105112a165668fac87b6c5 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/kitti-3d-3class.py @@ -0,0 +1,140 @@ +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6)) + +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://kitti_data/')) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + translation_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_range=[-0.78539816, 0.78539816]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=1, pipeline=eval_pipeline) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/kitti-3d-car.py b/model_examples/MapTR/projects/configs/_base_/datasets/kitti-3d-car.py new file mode 100644 index 0000000000000000000000000000000000000000..1e81226e2dfdb0e4e802daa8bf0c9f9d19adb125 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/kitti-3d-car.py @@ -0,0 +1,138 @@ +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car'] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + 
data_root=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), + classes=class_names, + sample_groups=dict(Car=15)) + +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://kitti_data/')) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + translation_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_range=[-0.78539816, 0.78539816]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + 
class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=1, pipeline=eval_pipeline) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/lyft-3d.py b/model_examples/MapTR/projects/configs/_base_/datasets/lyft-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..71baff04c5b5345ab3d7340607c3496a8befc5fa --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-80, -80, -5, 80, 80, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'LyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_test.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/nuim_instance.py b/model_examples/MapTR/projects/configs/_base_/datasets/nuim_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..82fce56bf6f2ad2578a0426e71fc13c2feb8bf97 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/nuim_instance.py @@ -0,0 +1,59 @@ +dataset_type = 'CocoDataset' +data_root = 'data/nuimages/' +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1280, 720), (1920, 1080)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1600, 900), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-train.json', + img_prefix=data_root, + classes=class_names, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-val.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline), + test=dict( + 
type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-val.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/nus-3d.py b/model_examples/MapTR/projects/configs/_base_/datasets/nus-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..154817175df8de5768c1d56bc35efaa0da99415c --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/nus-3d.py @@ -0,0 +1,142 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-50, -50, -5, 50, 50, 3] +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR')) +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/nus-mono3d.py b/model_examples/MapTR/projects/configs/_base_/datasets/nus-mono3d.py new file mode 100644 index 0000000000000000000000000000000000000000..1363a94ce4fbb3b1014e61dd52bc36408f119ce1 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/nus-mono3d.py @@ -0,0 +1,100 @@ +dataset_type = 'CustomNuScenesMonoDataset' +data_root = 'data/nuscenes/' +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=True, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + dict(type='Resize', img_scale=(1600, 900), keep_ratio=True), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d', + 'gt_labels_3d', 'centers2d', 'depths' + ]), +] +test_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='MultiScaleFlipAug', + scale_factor=1.0, + flip=False, + transforms=[ + dict(type='RandomFlip3D'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + 
dict(type='Collect3D', keys=['img']), + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + box_type_3d='Camera'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera')) +evaluation = dict(interval=2) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/range100_lyft-3d.py b/model_examples/MapTR/projects/configs/_base_/datasets/range100_lyft-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..efa63ea3f0d351198d609785d971c19d96532844 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/range100_lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-100, -100, -5, 100, 100, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 
'LyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + 
type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_test.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/s3dis-3d-5class.py b/model_examples/MapTR/projects/configs/_base_/datasets/s3dis-3d-5class.py new file mode 100644 index 0000000000000000000000000000000000000000..2422766fa351ee5cf7f0cd5ee5ab61b88e1d0300 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/s3dis-3d-5class.py @@ -0,0 +1,114 @@ +# dataset settings +dataset_type = 'S3DISDataset' +data_root = './data/s3dis/' +class_names = ('table', 'chair', 'sofa', 'bookcase', 'board') +train_area = [1, 2, 3, 4, 6] +test_area = 5 + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict(type='PointSample', num_points=40000), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + # following ScanNet dataset the rotation range is 5 degrees + rot_range=[-0.087266, 0.087266], + scale_ratio_range=[1.0, 1.0], + shift_height=True), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointSample', num_points=40000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + 
]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{i}.pkl', + pipeline=train_pipeline, + filter_empty_gt=False, + classes=class_names, + box_type_3d='Depth') for i in train_area + ], + separate_eval=False)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/s3dis_seg-3d-13class.py b/model_examples/MapTR/projects/configs/_base_/datasets/s3dis_seg-3d-13class.py new file mode 100644 index 0000000000000000000000000000000000000000..39bf5568e01d1a781c1b712e7c20b823e7c90141 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/s3dis_seg-3d-13class.py @@ -0,0 +1,139 @@ +# dataset settings +dataset_type = 'S3DISSegDataset' +data_root = './data/s3dis/' +class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door', + 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter') +num_points = 4096 +train_area = [1, 2, 3, 4, 6] +test_area = 5 
+train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=tuple(range(len(class_names))), + max_cat_id=13), + dict( + type='IndoorPatchPointSample', + num_points=num_points, + block_size=1.0, + ignore_index=len(class_names), + use_normalized_coord=True, + enlarge_size=0.2, + min_unique_num=None), + dict(type='NormalizePointsColor', color_mean=None), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='NormalizePointsColor', color_mean=None), + dict( + # a wrapper in order to successfully call test function + # actually we don't perform test-time-aug + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.0, + flip_ratio_bev_vertical=0.0), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +# we need to load gt seg_mask! 
+eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=tuple(range(len(class_names))), + max_cat_id=13), + dict( + type='DefaultFormatBundle3D', + with_label=False, + class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + # train on area 1, 2, 3, 4, 6 + # test on area 5 + train=dict( + type=dataset_type, + data_root=data_root, + ann_files=[ + data_root + f's3dis_infos_Area_{i}.pkl' for i in train_area + ], + pipeline=train_pipeline, + classes=class_names, + test_mode=False, + ignore_index=len(class_names), + scene_idxs=[ + data_root + f'seg_info/Area_{i}_resampled_scene_idxs.npy' + for i in train_area + ]), + val=dict( + type=dataset_type, + data_root=data_root, + ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names), + scene_idxs=data_root + + f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names))) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/scannet-3d-18class.py b/model_examples/MapTR/projects/configs/_base_/datasets/scannet-3d-18class.py new file mode 100644 index 0000000000000000000000000000000000000000..93da1e5870561363fb3686e8288ccf561ca72cd2 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/scannet-3d-18class.py @@ -0,0 +1,128 @@ +# dataset settings +dataset_type = 
'ScanNetDataset' +data_root = './data/scannet/' +class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', + 'bookshelf', 'picture', 'counter', 'desk', 'curtain', + 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', + 'garbagebin') +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_mask_3d=True, + with_seg_3d=True), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='PointSegClassMapping', + valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, + 36, 39), + max_cat_id=40), + dict(type='PointSample', num_points=40000), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.087266, 0.087266], + scale_ratio_range=[1.0, 1.0], + shift_height=True), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask', + 'pts_instance_mask' + ]) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointSample', num_points=40000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading 
function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_train.pkl', + pipeline=train_pipeline, + filter_empty_gt=False, + classes=class_names, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/scannet_seg-3d-20class.py b/model_examples/MapTR/projects/configs/_base_/datasets/scannet_seg-3d-20class.py new file mode 100644 index 0000000000000000000000000000000000000000..cf73b09c8afa9317fa7077f5f67b1fae3306c1b7 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/scannet_seg-3d-20class.py @@ -0,0 +1,132 @@ +# dataset settings +dataset_type = 'ScanNetSegDataset' +data_root = './data/scannet/' +class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', + 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', + 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', + 'bathtub', 'otherfurniture') +num_points = 8192 
+train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, + 33, 34, 36, 39), + max_cat_id=40), + dict( + type='IndoorPatchPointSample', + num_points=num_points, + block_size=1.5, + ignore_index=len(class_names), + use_normalized_coord=False, + enlarge_size=0.2, + min_unique_num=None), + dict(type='NormalizePointsColor', color_mean=None), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='NormalizePointsColor', color_mean=None), + dict( + # a wrapper in order to successfully call test function + # actually we don't perform test-time-aug + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.0, + flip_ratio_bev_vertical=0.0), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +# we need to load gt seg_mask! 
+eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, + 33, 34, 36, 39), + max_cat_id=40), + dict( + type='DefaultFormatBundle3D', + with_label=False, + class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + test_mode=False, + ignore_index=len(class_names), + scene_idxs=data_root + 'seg_info/train_resampled_scene_idxs.npy'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names)), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names))) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/sunrgbd-3d-10class.py b/model_examples/MapTR/projects/configs/_base_/datasets/sunrgbd-3d-10class.py new file mode 100644 index 0000000000000000000000000000000000000000..7121b75bbf0679c55f706ed07294eb2fa3495cc0 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/sunrgbd-3d-10class.py @@ -0,0 +1,107 @@ +dataset_type = 'SUNRGBDDataset' +data_root = 'data/sunrgbd/' +class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', + 'night_stand', 'bookshelf', 'bathtub') +train_pipeline = [ + dict( + 
type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='LoadAnnotations3D'), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + ), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.523599, 0.523599], + scale_ratio_range=[0.85, 1.15], + shift_height=True), + dict(type='PointSample', num_points=20000), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + ), + dict(type='PointSample', num_points=20000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=16, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + filter_empty_gt=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/waymoD5-3d-3class.py b/model_examples/MapTR/projects/configs/_base_/datasets/waymoD5-3d-3class.py new file mode 100644 index 0000000000000000000000000000000000000000..920ac154d68cb07669642300fafd52d179be5392 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/waymoD5-3d-3class.py @@ -0,0 +1,145 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'LidarWaymoDataset' +data_root = 'data/waymo-full/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +class_names = ['Car', 'Pedestrian', 'Cyclist'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', 
point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/waymoD5-3d-car.py b/model_examples/MapTR/projects/configs/_base_/datasets/waymoD5-3d-car.py new file mode 100644 index 0000000000000000000000000000000000000000..02e262721b29ede7e29d0d0046eba243f2c82249 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/datasets/waymoD5-3d-car.py @@ -0,0 +1,143 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'WaymoDataset' +data_root = 'data/waymo/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +class_names = ['Car'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), + classes=class_names, + sample_groups=dict(Car=15), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + 
class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/model_examples/MapTR/projects/configs/_base_/default_runtime.py b/model_examples/MapTR/projects/configs/_base_/default_runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..4e85b69abed5f51238da4f183163066073664350 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/default_runtime.py @@ -0,0 +1,18 @@ +checkpoint_config = dict(interval=1) +# yapf:disable 
push +# By default we use textlogger hook and tensorboard +# For more loggers see +# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = None +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/model_examples/MapTR/projects/configs/_base_/models/3dssd.py b/model_examples/MapTR/projects/configs/_base_/models/3dssd.py new file mode 100644 index 0000000000000000000000000000000000000000..55344c7ddff660dc0306542d94260efad39f8df7 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/3dssd.py @@ -0,0 +1,77 @@ +model = dict( + type='SSD3DNet', + backbone=dict( + type='PointNet2SAMSG', + in_channels=4, + num_points=(4096, 512, (256, 256)), + radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)), + num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)), + sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)), + ((64, 64, 128), (64, 64, 128), (64, 96, 128)), + ((128, 128, 256), (128, 192, 256), (128, 256, 256))), + aggregation_channels=(64, 128, 256), + fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')), + fps_sample_range_lists=((-1), (-1), (512, -1)), + norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), + sa_cfg=dict( + type='PointSAModuleMSG', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + bbox_head=dict( + type='SSD3DHead', + in_channels=256, + vote_module_cfg=dict( + in_channels=256, + num_points=256, + gt_per_seed=1, + conv_channels=(128, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + with_res_feat=False, + vote_xyz_range=(3.0, 3.0, 2.0)), + vote_aggregation_cfg=dict( + type='PointSAModuleMSG', + num_point=256, + radii=(4.8, 6.4), + sample_nums=(16, 32), + mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)), + norm_cfg=dict(type='BN2d', 
eps=1e-3, momentum=0.1), + use_xyz=True, + normalize_xyz=False, + bias=True), + pred_layer_cfg=dict( + in_channels=1536, + shared_conv_channels=(512, 128), + cls_conv_channels=(128, ), + reg_conv_channels=(128, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + objectness_loss=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + center_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + corner_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + vote_loss=dict(type='SmoothL1Loss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + sample_mod='spec', pos_distance_thr=10.0, expand_dims_length=0.05), + test_cfg=dict( + nms_cfg=dict(type='nms', iou_thr=0.1), + sample_mod='spec', + score_thr=0.0, + per_class_proposal=True, + max_output_num=100)) diff --git a/model_examples/MapTR/projects/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/model_examples/MapTR/projects/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..fb9e0a8f06d3f597e90156efc9f30264c678fe85 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py @@ -0,0 +1,200 @@ +# model settings +model = dict( + type='CascadeRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 
512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + 
loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + 
nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/model_examples/MapTR/projects/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py b/model_examples/MapTR/projects/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py new file mode 100644 index 0000000000000000000000000000000000000000..efdce59c6d59c6564c6558a7a800852fe14314d7 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py @@ -0,0 +1,83 @@ +voxel_size = [0.1, 0.1, 0.2] +model = dict( + type='CenterPoint', + pts_voxel_layer=dict( + max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000)), + pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), + pts_middle_encoder=dict( + type='SparseEncoder', + in_channels=5, + sparse_shape=[41, 1024, 1024], + output_channels=128, + order=('conv', 'norm', 'act'), + encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, + 128)), + encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), + block_type='basicblock'), + pts_backbone=dict( + type='SECOND', + in_channels=256, + out_channels=[128, 256], + layer_nums=[5, 5], + layer_strides=[1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + out_channels=[256, 256], + upsample_strides=[1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + pts_bbox_head=dict( + type='CenterHead', + in_channels=sum([256, 256]), + tasks=[ + dict(num_class=1, class_names=['car']), + dict(num_class=2, class_names=['truck', 'construction_vehicle']), + dict(num_class=2, class_names=['bus', 'trailer']), + dict(num_class=1, class_names=['barrier']), 
+ dict(num_class=2, class_names=['motorcycle', 'bicycle']), + dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), + ], + common_heads=dict( + reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), + share_conv_channel=64, + bbox_coder=dict( + type='CenterPointBBoxCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_num=500, + score_threshold=0.1, + out_size_factor=8, + voxel_size=voxel_size[:2], + code_size=9), + separate_head=dict( + type='SeparateHead', init_bias=-2.19, final_kernel=3), + loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), + loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), + norm_bbox=True), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[1024, 1024, 40], + voxel_size=voxel_size, + out_size_factor=8, + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), + test_cfg=dict( + pts=dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + max_pool_nms=False, + min_radius=[4, 12, 10, 1, 0.85, 0.175], + score_threshold=0.1, + out_size_factor=8, + voxel_size=voxel_size[:2], + nms_type='rotate', + pre_max_size=1000, + post_max_size=83, + nms_thr=0.2))) diff --git a/model_examples/MapTR/projects/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py b/model_examples/MapTR/projects/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py new file mode 100644 index 0000000000000000000000000000000000000000..311d76373bd261ed8827409be68db0e577b38327 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py @@ -0,0 +1,83 @@ +voxel_size = [0.2, 0.2, 8] +model = dict( + type='CenterPoint', + pts_voxel_layer=dict( + max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000)), + pts_voxel_encoder=dict( + type='PillarFeatureNet', + in_channels=5, + feat_channels=[64], 
+ with_distance=False, + voxel_size=(0.2, 0.2, 8), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + legacy=False), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)), + pts_backbone=dict( + type='SECOND', + in_channels=64, + out_channels=[64, 128, 256], + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + out_channels=[128, 128, 128], + upsample_strides=[0.5, 1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + pts_bbox_head=dict( + type='CenterHead', + in_channels=sum([128, 128, 128]), + tasks=[ + dict(num_class=1, class_names=['car']), + dict(num_class=2, class_names=['truck', 'construction_vehicle']), + dict(num_class=2, class_names=['bus', 'trailer']), + dict(num_class=1, class_names=['barrier']), + dict(num_class=2, class_names=['motorcycle', 'bicycle']), + dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), + ], + common_heads=dict( + reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), + share_conv_channel=64, + bbox_coder=dict( + type='CenterPointBBoxCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_num=500, + score_threshold=0.1, + out_size_factor=4, + voxel_size=voxel_size[:2], + code_size=9), + separate_head=dict( + type='SeparateHead', init_bias=-2.19, final_kernel=3), + loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), + loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), + norm_bbox=True), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + out_size_factor=4, + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), + 
test_cfg=dict( + pts=dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + max_pool_nms=False, + min_radius=[4, 12, 10, 1, 0.85, 0.175], + score_threshold=0.1, + pc_range=[-51.2, -51.2], + out_size_factor=4, + voxel_size=voxel_size[:2], + nms_type='rotate', + pre_max_size=1000, + post_max_size=83, + nms_thr=0.2))) diff --git a/model_examples/MapTR/projects/configs/_base_/models/fcos3d.py b/model_examples/MapTR/projects/configs/_base_/models/fcos3d.py new file mode 100644 index 0000000000000000000000000000000000000000..92ea90760519d6205d75af6a39f927503de89aad --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/fcos3d.py @@ -0,0 +1,74 @@ +model = dict( + type='FCOSMono3D', + pretrained='open-mmlab://detectron2/resnet101_caffe', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='FCOSMono3DHead', + num_classes=10, + in_channels=256, + stacked_convs=2, + feat_channels=256, + use_direction_classifier=True, + diff_rad_by_sin=True, + pred_attrs=True, + pred_velo=True, + dir_offset=0.7854, # pi/4 + strides=[8, 16, 32, 64, 128], + group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo + cls_branch=(256, ), + reg_branch=( + (256, ), # offset + (256, ), # depth + (256, ), # size + (256, ), # rot + () # velo + ), + dir_branch=(256, ), + attr_branch=(256, ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_attr=dict( + type='CrossEntropyLoss', 
use_sigmoid=False, loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + norm_on_bbox=True, + centerness_on_reg=True, + center_sampling=True, + conv_bias=True, + dcn_on_last_conv=True), + train_cfg=dict( + allowed_border=0, + code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05], + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.8, + score_thr=0.05, + min_bbox_size=0, + max_per_img=200)) diff --git a/model_examples/MapTR/projects/configs/_base_/models/groupfree3d.py b/model_examples/MapTR/projects/configs/_base_/models/groupfree3d.py new file mode 100644 index 0000000000000000000000000000000000000000..077d049662fe16b91639af4a5923a4e8e540148d --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/groupfree3d.py @@ -0,0 +1,71 @@ +model = dict( + type='GroupFree3DNet', + backbone=dict( + type='PointNet2SASSG', + in_channels=3, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 288)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True)), + bbox_head=dict( + type='GroupFree3DHead', + in_channels=288, + num_decoder_layers=6, + num_proposal=256, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='GroupFree3DMHA', + embed_dims=288, + num_heads=8, + attn_drop=0.1, + dropout_layer=dict(type='Dropout', drop_prob=0.1)), + ffn_cfgs=dict( + embed_dims=288, + feedforward_channels=2048, + ffn_drop=0.1, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', + 'norm')), + pred_layer_cfg=dict( + in_channels=288, shared_conv_channels=(288, 288), bias=True), + sampling_objectness_loss=dict( + type='FocalLoss', + 
use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=8.0), + objectness_loss=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + center_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', beta=1.0, reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(sample_mod='kps'), + test_cfg=dict( + sample_mod='kps', + nms_thr=0.25, + score_thr=0.0, + per_class_proposal=True, + prediction_stages='last')) diff --git a/model_examples/MapTR/projects/configs/_base_/models/h3dnet.py b/model_examples/MapTR/projects/configs/_base_/models/h3dnet.py new file mode 100644 index 0000000000000000000000000000000000000000..760566744f6484cde261f87f0d95a1182786779c --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/h3dnet.py @@ -0,0 +1,341 @@ +primitive_z_cfg = dict( + type='PrimitiveHead', + num_dims=2, + num_classes=18, + primitive_mode='z', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + 
class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +primitive_xy_cfg = dict( + type='PrimitiveHead', + num_dims=1, + num_classes=18, + primitive_mode='xy', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +primitive_line_cfg = dict( + type='PrimitiveHead', + num_dims=0, + num_classes=18, + primitive_mode='line', + upper_thresh=100.0, + 
surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=1.0, + loss_dst_weight=1.0), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=1.0, + loss_dst_weight=1.0), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=2.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +model = dict( + type='H3DNet', + backbone=dict( + type='MultiBackbone', + num_streams=4, + suffixes=['net0', 'net1', 'net2', 'net3'], + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01), + act_cfg=dict(type='ReLU'), + backbones=dict( + type='PointNet2SASSG', + in_channels=4, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 256)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True))), + rpn_head=dict( + type='VoteHead', + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=3, + conv_channels=(256, 256), + 
conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=256, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + pred_layer_cfg=dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + roi_head=dict( + type='H3DRoIHead', + primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg], + bbox_head=dict( + type='H3DBboxHead', + gt_per_seed=3, + num_proposal=256, + suface_matching_cfg=dict( + type='PointSAModule', + num_point=256 * 6, + radius=0.5, + num_sample=32, + mlp_channels=[128 + 6, 128, 64, 32], + use_xyz=True, + normalize_xyz=True), + line_matching_cfg=dict( + type='PointSAModule', + num_point=256 * 12, + radius=0.5, + num_sample=32, + mlp_channels=[128 + 12, 128, 64, 32], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + primitive_refine_channels=[128, 128, 128], + upper_thresh=100.0, + surface_thresh=0.5, + line_thresh=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + 
reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + cues_objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.3, 0.7], + reduction='mean', + loss_weight=5.0), + cues_semantic_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.3, 0.7], + reduction='mean', + loss_weight=5.0), + proposal_objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='none', + loss_weight=5.0), + primitive_center_loss=dict( + type='MSELoss', reduction='none', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), + rpn_proposal=dict(use_nms=False), + rcnn=dict( + pos_distance_thr=0.3, + neg_distance_thr=0.6, + sample_mod='vote', + far_threshold=0.6, + near_threshold=0.3, + mask_surface_threshold=0.3, + label_surface_threshold=0.3, + mask_line_threshold=0.3, + label_line_threshold=0.3)), + test_cfg=dict( + rpn=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True, + use_nms=False), + rcnn=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True))) diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_lyft.py b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_lyft.py new file mode 100644 index 0000000000000000000000000000000000000000..87c7fe0c6145f0cceadafd7f51c98f209538796d --- /dev/null +++ 
b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_lyft.py @@ -0,0 +1,22 @@ +_base_ = './hv_pointpillars_fpn_nus.py' + +# model settings (based on nuScenes model settings) +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. +model = dict( + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-80, -80, -5, 80, 80, 3], + max_voxels=(60000, 60000)), + pts_voxel_encoder=dict( + feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]), + pts_middle_encoder=dict(output_shape=[640, 640]), + pts_bbox_head=dict( + num_classes=9, + anchor_generator=dict( + ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]), + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), + # model training settings (based on nuScenes model settings) + train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_nus.py b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_nus.py new file mode 100644 index 0000000000000000000000000000000000000000..e153f6c6e69171d29f79b627dd6d152a842d0db2 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_nus.py @@ -0,0 +1,96 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. 
+voxel_size = [0.25, 0.25, 8] +model = dict( + type='MVXFasterRCNN', + pts_voxel_layer=dict( + max_num_points=64, + point_cloud_range=[-50, -50, -5, 50, 50, 3], + voxel_size=voxel_size, + max_voxels=(30000, 40000)), + pts_voxel_encoder=dict( + type='HardVFE', + in_channels=4, + feat_channels=[64, 64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=[-50, -50, -5, 50, 50, 3], + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]), + pts_backbone=dict( + type='SECOND', + in_channels=64, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + out_channels=[64, 128, 256]), + pts_neck=dict( + type='FPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + act_cfg=dict(type='ReLU'), + in_channels=[64, 128, 256], + out_channels=256, + start_level=0, + num_outs=3), + pts_bbox_head=dict( + type='Anchor3DHead', + num_classes=10, + in_channels=256, + feat_channels=256, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-50, -50, -1.8, 50, 50, -1.8]], + scales=[1, 2, 4], + sizes=[ + [0.8660, 2.5981, 1.], # 1.5/sqrt(3) + [0.5774, 1.7321, 1.], # 1/sqrt(3) + [1., 1., 1.], + [0.4, 0.4, 1], + ], + custom_values=[0, 0], + rotations=[0, 1.57], + reshape_out=True), + assigner_per_size=False, + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + pts=dict( + assigner=dict( + 
type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + pos_weight=-1, + debug=False)), + test_cfg=dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.2, + score_thr=0.05, + min_bbox_size=0, + max_num=500))) diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py new file mode 100644 index 0000000000000000000000000000000000000000..9cd200f3e4c0dfb7da1823263b22bbcd63d77d63 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py @@ -0,0 +1,22 @@ +_base_ = './hv_pointpillars_fpn_nus.py' + +# model settings (based on nuScenes model settings) +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. 
+model = dict( + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-100, -100, -5, 100, 100, 3], + max_voxels=(60000, 60000)), + pts_voxel_encoder=dict( + feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]), + pts_middle_encoder=dict(output_shape=[800, 800]), + pts_bbox_head=dict( + num_classes=9, + anchor_generator=dict( + ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]), + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), + # model training settings (based on nuScenes model settings) + train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_secfpn_kitti.py b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_secfpn_kitti.py new file mode 100644 index 0000000000000000000000000000000000000000..85076d0798bc49e1564d6eabe177d1ae92be0aef --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_secfpn_kitti.py @@ -0,0 +1,93 @@ +voxel_size = [0.16, 0.16, 4] + +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=32, # max_points_per_voxel + point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1], + voxel_size=voxel_size, + max_voxels=(16000, 40000) # (training, testing) max_voxels + ), + voxel_encoder=dict( + type='PillarFeatureNet', + in_channels=4, + feat_channels=[64], + with_distance=False, + voxel_size=voxel_size, + point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]), + middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]), + backbone=dict( + type='SECOND', + in_channels=64, + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + out_channels=[64, 128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + out_channels=[128, 128, 128]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + 
anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[ + [0, -39.68, -0.6, 70.4, 39.68, -0.6], + [0, -39.68, -0.6, 70.4, 39.68, -0.6], + [0, -39.68, -1.78, 70.4, 39.68, -1.78], + ], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.1, + min_bbox_size=0, + nms_pre=100, + max_num=50)) diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_secfpn_waymo.py b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_secfpn_waymo.py new file mode 100644 index 0000000000000000000000000000000000000000..14873ead474761d96b8487d48765bf2486277bed --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_secfpn_waymo.py @@ -0,0 +1,108 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If 
point cloud range is modified, do remember to change all related +# keys in the config. +voxel_size = [0.32, 0.32, 6] +model = dict( + type='MVXFasterRCNN', + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + voxel_size=voxel_size, + max_voxels=(32000, 32000)), + pts_voxel_encoder=dict( + type='HardVFE', + in_channels=5, + feat_channels=[64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]), + pts_backbone=dict( + type='SECOND', + in_channels=64, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[1, 2, 2], + out_channels=[64, 128, 256]), + pts_neck=dict( + type='SECONDFPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + out_channels=[128, 128, 128]), + pts_bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345], + [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188], + [-74.88, -74.88, 0, 74.88, 74.88, 0]], + sizes=[ + [2.08, 4.73, 1.77], # car + [0.84, 1.81, 1.77], # cyclist + [0.84, 0.91, 1.74] # pedestrian + ], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', 
use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + pts=dict( + assigner=[ + dict( # car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.55, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + dict( # cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + dict( # pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ], + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + pos_weight=-1, + debug=False)), + test_cfg=dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.25, + score_thr=0.1, + min_bbox_size=0, + max_num=500))) diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_second_secfpn_kitti.py b/model_examples/MapTR/projects/configs/_base_/models/hv_second_secfpn_kitti.py new file mode 100644 index 0000000000000000000000000000000000000000..6bf18abe1df08680cc2bb86dfb7b445af4d63ec8 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/hv_second_secfpn_kitti.py @@ -0,0 +1,89 @@ +voxel_size = [0.05, 0.05, 0.1] + +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=5, + point_cloud_range=[0, -40, -3, 70.4, 40, 1], + voxel_size=voxel_size, + max_voxels=(16000, 40000)), + voxel_encoder=dict(type='HardSimpleVFE'), + middle_encoder=dict( + type='SparseEncoder', + in_channels=4, + sparse_shape=[41, 1600, 1408], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, 
+ in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[ + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78], + ], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.1, + min_bbox_size=0, + nms_pre=100, + max_num=50)) diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_second_secfpn_waymo.py b/model_examples/MapTR/projects/configs/_base_/models/hv_second_secfpn_waymo.py new file mode 100644 index 0000000000000000000000000000000000000000..eb9bd3ae5cd6c94e56aa9d88765746853ca58f3e --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/hv_second_secfpn_waymo.py @@ -0,0 +1,100 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is 
changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. +voxel_size = [0.08, 0.08, 0.1] +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=10, + point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4], + voxel_size=voxel_size, + max_voxels=(80000, 90000)), + voxel_encoder=dict(type='HardSimpleVFE', num_features=5), + middle_encoder=dict( + type='SparseEncoder', + in_channels=5, + sparse_shape=[61, 1280, 1920], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=384, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345], + [-76.8, -51.2, 0, 76.8, 51.2, 0], + [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]], + sizes=[ + [2.08, 4.73, 1.77], # car + [0.84, 0.91, 1.74], # pedestrian + [0.84, 1.81, 1.77] # cyclist + ], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.55, + 
neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + dict( # pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + dict( # cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1) + ], + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.25, + score_thr=0.1, + min_bbox_size=0, + max_num=500)) diff --git a/model_examples/MapTR/projects/configs/_base_/models/imvotenet_image.py b/model_examples/MapTR/projects/configs/_base_/models/imvotenet_image.py new file mode 100644 index 0000000000000000000000000000000000000000..981f8bc9be90a3c2d0ff1edfef3cb3ce91d20d41 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/imvotenet_image.py @@ -0,0 +1,108 @@ +model = dict( + type='ImVoteNet', + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + img_neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + img_rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + img_roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, 
sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=10, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + + # model training and testing settings + train_cfg=dict( + img_rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + img_rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + img_rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + img_rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + img_rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/model_examples/MapTR/projects/configs/_base_/models/mask_rcnn_r50_fpn.py b/model_examples/MapTR/projects/configs/_base_/models/mask_rcnn_r50_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..c5d5e32b0427cf29b7240b26c7f506c283ae6c04 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/mask_rcnn_r50_fpn.py 
@@ -0,0 +1,124 @@ +# model settings +model = dict( + type='MaskRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + 
neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/model_examples/MapTR/projects/configs/_base_/models/paconv_cuda_ssg.py b/model_examples/MapTR/projects/configs/_base_/models/paconv_cuda_ssg.py new file mode 100644 index 0000000000000000000000000000000000000000..f513bd4a2f94964f70dba926ef03b427a795e417 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/paconv_cuda_ssg.py @@ -0,0 +1,7 @@ +_base_ = './paconv_ssg.py' + +model = dict( + backbone=dict( + sa_cfg=dict( + type='PAConvCUDASAModule', + scorenet_cfg=dict(mlp_channels=[8, 16, 16])))) diff --git a/model_examples/MapTR/projects/configs/_base_/models/paconv_ssg.py b/model_examples/MapTR/projects/configs/_base_/models/paconv_ssg.py new file mode 100644 index 0000000000000000000000000000000000000000..1d4f1ed39373b40e0871bc97dafaf664ff68594d --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/paconv_ssg.py @@ -0,0 +1,49 @@ +# model settings +model = dict( + type='EncoderDecoder3D', + backbone=dict( + type='PointNet2SASSG', + in_channels=9, # [xyz, rgb, 
normalized_xyz] + num_points=(1024, 256, 64, 16), + radius=(None, None, None, None), # use kNN instead of ball query + num_samples=(32, 32, 32, 32), + sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, + 512)), + fp_channels=(), + norm_cfg=dict(type='BN2d', momentum=0.1), + sa_cfg=dict( + type='PAConvSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=False, + paconv_num_kernels=[16, 16, 16], + paconv_kernel_input='w_neighbor', + scorenet_input='w_neighbor_dist', + scorenet_cfg=dict( + mlp_channels=[16, 16, 16], + score_norm='softmax', + temp_factor=1.0, + last_bn=False))), + decode_head=dict( + type='PAConvHead', + # PAConv model's decoder takes skip connections from backbone + # different from PointNet++, it also concats input features in the last + # level of decoder, leading to `128 + 6` as the channel number + fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), + (128 + 6, 128, 128, 128)), + channels=128, + dropout_ratio=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=None, # should be modified with dataset + loss_weight=1.0)), + # correlation loss to regularize PAConv's kernel weights + loss_regularization=dict( + type='PAConvRegularizationLoss', reduction='sum', loss_weight=10.0), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide')) diff --git a/model_examples/MapTR/projects/configs/_base_/models/parta2.py b/model_examples/MapTR/projects/configs/_base_/models/parta2.py new file mode 100644 index 0000000000000000000000000000000000000000..6c5ae9a66372c404923b21f5ee37dfcacd7347ec --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/parta2.py @@ -0,0 +1,201 @@ +# model settings +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] + +model = dict( + type='PartA2', + voxel_layer=dict( + max_num_points=5, # 
max_points_per_voxel + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(16000, 40000) # (training, testing) max_voxels + ), + voxel_encoder=dict(type='HardSimpleVFE'), + middle_encoder=dict( + type='SparseUNet', + in_channels=4, + sparse_shape=[41, 1600, 1408], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + rpn_head=dict( + type='PartA2RPNHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78]], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + assigner_per_size=True, + assign_per_class=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + roi_head=dict( + type='PartAggregationROIHead', + num_classes=3, + semantic_head=dict( + type='PointwiseSemanticHead', + in_channels=16, + extra_width=0.2, + seg_score_thr=0.3, + num_classes=3, + loss_seg=dict( + type='FocalLoss', + use_sigmoid=True, + reduction='sum', + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_part=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + seg_roi_extractor=dict( + type='Single3DRoIAwareExtractor', + roi_layer=dict( + type='RoIAwarePool3d', + out_size=14, + max_pts_per_voxel=128, + mode='max')), + part_roi_extractor=dict( + type='Single3DRoIAwareExtractor', + 
roi_layer=dict( + type='RoIAwarePool3d', + out_size=14, + max_pts_per_voxel=128, + mode='avg')), + bbox_head=dict( + type='PartA2BboxHead', + num_classes=3, + seg_in_channels=16, + part_in_channels=4, + seg_conv_channels=[64, 64], + part_conv_channels=[64, 64], + merge_conv_channels=[128, 128], + down_conv_channels=[128, 256], + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + shared_fc_channels=[256, 512, 512, 512], + cls_channels=[256, 256], + reg_channels=[256, 256], + dropout_ratio=0.1, + roi_feat_size=14, + with_corner_loss=True, + loss_bbox=dict( + type='SmoothL1Loss', + beta=1.0 / 9.0, + reduction='sum', + loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1) + ], + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=9000, + nms_post=512, + max_num=512, + nms_thr=0.8, + score_thr=0, + use_rotate_nms=False), + rcnn=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1), + dict( # for Car + 
type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1) + ], + sampler=dict( + type='IoUNegPiecewiseSampler', + num=128, + pos_fraction=0.55, + neg_piece_fractions=[0.8, 0.2], + neg_iou_piece_thrs=[0.55, 0.1], + neg_pos_ub=-1, + add_gt_as_proposals=False, + return_iou=True), + cls_pos_thr=0.75, + cls_neg_thr=0.25)), + test_cfg=dict( + rpn=dict( + nms_pre=1024, + nms_post=100, + max_num=100, + nms_thr=0.7, + score_thr=0, + use_rotate_nms=True), + rcnn=dict( + use_rotate_nms=True, + use_raw_score=True, + nms_thr=0.01, + score_thr=0.1))) diff --git a/model_examples/MapTR/projects/configs/_base_/models/pointnet2_msg.py b/model_examples/MapTR/projects/configs/_base_/models/pointnet2_msg.py new file mode 100644 index 0000000000000000000000000000000000000000..222ab885557984125eb52a934f443870e6c6918d --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/pointnet2_msg.py @@ -0,0 +1,28 @@ +_base_ = './pointnet2_ssg.py' + +# model settings +model = dict( + backbone=dict( + _delete_=True, + type='PointNet2SAMSG', + in_channels=6, # [xyz, rgb], should be modified with dataset + num_points=(1024, 256, 64, 16), + radii=((0.05, 0.1), (0.1, 0.2), (0.2, 0.4), (0.4, 0.8)), + num_samples=((16, 32), (16, 32), (16, 32), (16, 32)), + sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96, + 128)), + ((128, 196, 256), (128, 196, 256)), ((256, 256, 512), + (256, 384, 512))), + aggregation_channels=(None, None, None, None), + fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')), + fps_sample_range_lists=((-1), (-1), (-1), (-1)), + dilated_group=(False, False, False, False), + out_indices=(0, 1, 2, 3), + sa_cfg=dict( + type='PointSAModuleMSG', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + decode_head=dict( + fp_channels=((1536, 256, 256), (512, 256, 256), (352, 256, 128), + (128, 128, 128, 128)))) diff --git 
a/model_examples/MapTR/projects/configs/_base_/models/pointnet2_ssg.py b/model_examples/MapTR/projects/configs/_base_/models/pointnet2_ssg.py new file mode 100644 index 0000000000000000000000000000000000000000..58b4c243ded042612abb1c15c9c175f5e932af38 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/pointnet2_ssg.py @@ -0,0 +1,35 @@ +# model settings +model = dict( + type='EncoderDecoder3D', + backbone=dict( + type='PointNet2SASSG', + in_channels=6, # [xyz, rgb], should be modified with dataset + num_points=(1024, 256, 64, 16), + radius=(0.1, 0.2, 0.4, 0.8), + num_samples=(32, 32, 32, 32), + sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, + 512)), + fp_channels=(), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + decode_head=dict( + type='PointNet2Head', + fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), + (128, 128, 128, 128)), + channels=128, + dropout_ratio=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=None, # should be modified with dataset + loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide')) diff --git a/model_examples/MapTR/projects/configs/_base_/models/votenet.py b/model_examples/MapTR/projects/configs/_base_/models/votenet.py new file mode 100644 index 0000000000000000000000000000000000000000..129339dc9eaa3f74c0547a39fa527c14be03743c --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/models/votenet.py @@ -0,0 +1,73 @@ +model = dict( + type='VoteNet', + backbone=dict( + type='PointNet2SASSG', + in_channels=4, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), 
(256, 256)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True)), + bbox_head=dict( + type='VoteHead', + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=3, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=256, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + pred_layer_cfg=dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), + test_cfg=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True)) diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/cosine.py b/model_examples/MapTR/projects/configs/_base_/schedules/cosine.py new file mode 100644 index 0000000000000000000000000000000000000000..69cb7df87d23846ea7b64fb6d882679e315e55cf --- /dev/null +++ 
b/model_examples/MapTR/projects/configs/_base_/schedules/cosine.py @@ -0,0 +1,20 @@ +# This schedule is mainly used by models with dynamic voxelization +# optimizer +lr = 0.003 # max learning rate +optimizer = dict( + type='AdamW', + lr=lr, + betas=(0.95, 0.99), # the momentum is changed during training + weight_decay=0.001) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) + +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 10, + min_lr_ratio=1e-5) + +momentum_config = None + +runner = dict(type='EpochBasedRunner', max_epochs=40) diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/cyclic_20e.py b/model_examples/MapTR/projects/configs/_base_/schedules/cyclic_20e.py new file mode 100644 index 0000000000000000000000000000000000000000..704740ee5676515213fd30839f5e116c0b4ebfc7 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/schedules/cyclic_20e.py @@ -0,0 +1,24 @@ +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained for 24 epochs by default, we set evaluation +# interval to be 20. Please change the interval accordingly if you do not +# use a default schedule. 
+# optimizer +# This schedule is mainly used by models on nuScenes dataset +optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01) +# max_norm=10 is better for SECOND +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='cyclic', + target_ratio=(10, 1e-4), + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=(0.85 / 0.95, 1), + cyclic_times=1, + step_ratio_up=0.4, +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=20) diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/cyclic_40e.py b/model_examples/MapTR/projects/configs/_base_/schedules/cyclic_40e.py new file mode 100644 index 0000000000000000000000000000000000000000..4a711acf4f31cca94ea7a10d035282a45f648c9c --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/schedules/cyclic_40e.py @@ -0,0 +1,31 @@ +# The schedule is usually used by models trained on KITTI dataset + +# The learning rate set in the cyclic schedule is the initial learning rate +# rather than the max learning rate. Since the target_ratio is (10, 1e-4), +# the learning rate will change from 0.0018 to 0.018, then go to 0.0018*1e-4 +lr = 0.0018 +# The optimizer follows the setting in SECOND.Pytorch, but here we use +# the official AdamW optimizer implemented by PyTorch. 
+optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) +# We use cyclic learning rate and momentum schedule following SECOND.Pytorch +# https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69 # noqa +# We implement them in mmcv, for more details, please refer to +# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327 # noqa +# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130 # noqa +lr_config = dict( + policy='cyclic', + target_ratio=(10, 1e-4), + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=(0.85 / 0.95, 1), + cyclic_times=1, + step_ratio_up=0.4, +) +# Although the max_epochs is 40, this schedule is usually used with +# RepeatDataset with repeat ratio N, thus the actual max epoch +# number could be Nx40 +runner = dict(type='EpochBasedRunner', max_epochs=40) diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/mmdet_schedule_1x.py b/model_examples/MapTR/projects/configs/_base_/schedules/mmdet_schedule_1x.py new file mode 100644 index 0000000000000000000000000000000000000000..13b3783cbbe93b6c32bc415dc50f633dffa4aec7 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/schedules/mmdet_schedule_1x.py @@ -0,0 +1,11 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[8, 11]) +runner = dict(type='EpochBasedRunner', max_epochs=12) diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/schedule_2x.py b/model_examples/MapTR/projects/configs/_base_/schedules/schedule_2x.py new 
file mode 100644 index 0000000000000000000000000000000000000000..afde799d9de1e9c03587b54458938b63b1f7de41 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/schedules/schedule_2x.py @@ -0,0 +1,14 @@ +# optimizer +# This schedule is mainly used by models on nuScenes dataset +optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01) +# max_norm=10 is better for SECOND +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[20, 23]) +momentum_config = None +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/schedule_3x.py b/model_examples/MapTR/projects/configs/_base_/schedules/schedule_3x.py new file mode 100644 index 0000000000000000000000000000000000000000..115cd26b760e749b3ccdd50a6f4d201ea38f824e --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/schedules/schedule_3x.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used by models on indoor dataset, +# e.g., VoteNet on SUNRGBD and ScanNet +lr = 0.008 # max learning rate +optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) +lr_config = dict(policy='step', warmup=None, step=[24, 32]) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_150e.py b/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_150e.py new file mode 100644 index 0000000000000000000000000000000000000000..04b44e51de071dc9158e31fe7c51420326f0493c --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_150e.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used on S3DIS dataset in segmentation task +optimizer = dict(type='SGD', lr=0.2, weight_decay=0.0001, momentum=0.9) 
+optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=0.002) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=150) diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_200e.py b/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_200e.py new file mode 100644 index 0000000000000000000000000000000000000000..6a49484c8b37d3c44b7a2979a3173af6a407b967 --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_200e.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used on ScanNet dataset in segmentation task +optimizer = dict(type='Adam', lr=0.001, weight_decay=0.01) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_50e.py b/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_50e.py new file mode 100644 index 0000000000000000000000000000000000000000..975a8f9ff8e5140b0f1707490c282998666c71ef --- /dev/null +++ b/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_50e.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used on S3DIS dataset in segmentation task +optimizer = dict(type='Adam', lr=0.001, weight_decay=0.001) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=50) diff --git a/model_examples/MapTR/projects/configs/bevformer/bevformer_base.py b/model_examples/MapTR/projects/configs/bevformer/bevformer_base.py new file mode 100644 index 0000000000000000000000000000000000000000..78873d446f8e5f957f89126a2426c5508c7b38a7 --- /dev/null +++ 
b/model_examples/MapTR/projects/configs/bevformer/bevformer_base.py @@ -0,0 +1,257 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + + + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 4 +bev_h_ = 200 +bev_w_ = 200 +queue_length = 4 # each sequence contains `queue_length` frames. 
+ +model = dict( + type='BEVFormer', + use_grid_mask=True, + video_test_mode=True, + img_backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN2d', requires_grad=False), + norm_eval=True, + style='caffe', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # original DCNv2 will print log when perform load_state_dict + stage_with_dcn=(False, False, True, True)), + img_neck=dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=4, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + 
feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
+ pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) +load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth' +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +checkpoint_config = dict(interval=1) diff --git a/model_examples/MapTR/projects/configs/bevformer/bevformer_small.py b/model_examples/MapTR/projects/configs/bevformer/bevformer_small.py new file mode 100644 index 0000000000000000000000000000000000000000..6856e7cd46f3c22fbcb2d9e6364ffe1288b8ff3f --- /dev/null +++ b/model_examples/MapTR/projects/configs/bevformer/bevformer_small.py @@ -0,0 +1,268 @@ +# BEVFormer-small consumes at least 10500M GPU memory +# compared to bevformer_base, bevformer_small has +# smaller BEV: 200*200 -> 150*150 +# fewer encoder layers: 6 -> 3 +# smaller input size: 1600*900 -> (1600*900)*0.8 +# multi-scale features -> single scale features (C5) +# with_cp of backbone = True + +_base_ = [ +
'../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +bev_h_ = 150 +bev_w_ = 150 +queue_length = 3 # each sequence contains `queue_length` frames. + +model = dict( + type='BEVFormer', + use_grid_mask=True, + video_test_mode=True, + img_backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN2d', requires_grad=False), + norm_eval=True, + style='caffe', + with_cp=True, # using checkpoint to save GPU memory + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # original DCNv2 will print log when perform load_state_dict + stage_with_dcn=(False, False, True, True)), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=3, + 
pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake 
cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.8]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + # dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.8]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) +load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth' +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +checkpoint_config = dict(interval=1) diff --git a/model_examples/MapTR/projects/configs/bevformer/bevformer_tiny.py b/model_examples/MapTR/projects/configs/bevformer/bevformer_tiny.py new file mode 100644 index 0000000000000000000000000000000000000000..f56d1b2f4dcb6e80440d5308b52047109959ccaa --- /dev/null +++ b/model_examples/MapTR/projects/configs/bevformer/bevformer_tiny.py @@ -0,0 +1,270 @@ +# BEVFormer-tiny consumes at least 6700M GPU memory +# compared to bevformer_base, bevformer_tiny has +# smaller backbone: R101-DCN -> R50 +# smaller BEV: 200*200 -> 50*50 +# fewer encoder layers: 6 -> 3 +# smaller input size: 1600*900 -> 800*450 +# multi-scale features -> single scale features (C5) + + +_base_ = [ +
'../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +bev_h_ = 50 +bev_w_ = 50 +queue_length = 3 # each sequence contains `queue_length` frames. + +model = dict( + type='BEVFormer', + use_grid_mask=True, + video_test_mode=True, + pretrained=dict(img='torchvision://resnet50'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=3, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + 
type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
+ pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +checkpoint_config = dict(interval=1) diff --git a/model_examples/MapTR/projects/configs/bevformer_fp16/bevformer_tiny_fp16.py b/model_examples/MapTR/projects/configs/bevformer_fp16/bevformer_tiny_fp16.py new file mode 100644 index 0000000000000000000000000000000000000000..af0eb269048ba786cddb18f8d01a188681ac09c4 --- /dev/null +++ b/model_examples/MapTR/projects/configs/bevformer_fp16/bevformer_tiny_fp16.py @@ -0,0 +1,272 @@ +# BEVFormer-tiny consumes at least 6700M GPU memory +# compared to bevformer_base, bevformer_tiny has +# smaller backbone: R101-DCN -> R50 +# smaller BEV: 200*200 -> 50*50 +# fewer encoder layers: 6 -> 3 +# smaller input size: 1600*900 -> 800*450 +# multi-scale features -> single scale features (C5) + + +_base_ = [ + '../datasets/custom_nus-3d.py', +
'../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +bev_h_ = 50 +bev_w_ = 50 +queue_length = 3 # each sequence contains `queue_length` frames. + +model = dict( + type='BEVFormer_fp16', + use_grid_mask=True, + video_test_mode=True, + pretrained=dict(img='torchvision://resnet50'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=3, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + 
embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
+ pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=8, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2.8e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner_video', max_epochs=total_epochs) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +fp16 = dict(loss_scale=512.) 
+checkpoint_config = dict(interval=1) +custom_hooks = [dict(type='TransferWeight',priority='LOWEST')] \ No newline at end of file diff --git a/model_examples/MapTR/projects/configs/datasets/custom_lyft-3d.py b/model_examples/MapTR/projects/configs/datasets/custom_lyft-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..5a95d898c91e463b731a08f7c52b8186e99da83a --- /dev/null +++ b/model_examples/MapTR/projects/configs/datasets/custom_lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-80, -80, -5, 80, 80, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'CustomLyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=True) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) \ No newline at end of file diff --git a/model_examples/MapTR/projects/configs/datasets/custom_nus-3d.py b/model_examples/MapTR/projects/configs/datasets/custom_nus-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..af81f9b20d182222d0b69fc26fe32c1e66905a16 --- /dev/null +++ b/model_examples/MapTR/projects/configs/datasets/custom_nus-3d.py @@ -0,0 +1,141 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-50, -50, -5, 50, 50, 3] +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +dataset_type = 'NuScenesDataset_eval_modified' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR')) +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/model_examples/MapTR/projects/configs/datasets/custom_waymo-3d.py b/model_examples/MapTR/projects/configs/datasets/custom_waymo-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..4100e13546badb06e69fd0b1ed20158de8acf893 --- /dev/null +++ b/model_examples/MapTR/projects/configs/datasets/custom_waymo-3d.py @@ -0,0 +1,112 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'CustomWaymoDataset' +data_root = 'data/waymo/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +class_names = ['Car', 'Pedestrian', 'Cyclist'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=False, use_camera=True) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + 
dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1920, 1280), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + + +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=test_pipeline) \ No newline at end of file diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_nano_r18_110e.py b/model_examples/MapTR/projects/configs/maptr/maptr_nano_r18_110e.py new file mode 100644 index 0000000000000000000000000000000000000000..aba45e30a9a2f40de1ca6137e8f845dd87edaa3e --- /dev/null +++ b/model_examples/MapTR/projects/configs/maptr/maptr_nano_r18_110e.py @@ -0,0 +1,312 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] +voxel_size = [0.15, 0.15, 4] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +# map has classes: divider, ped_crossing, boundary +map_classes = ['divider', 'ped_crossing','boundary'] +# fixed_ptsnum_per_line = 20 +# map_classes = ['divider',] +fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 +fixed_ptsnum_per_pred_line = 20 
+eval_use_same_gt_sample_num_flag=True +num_map_classes = len(map_classes) + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +# bev_h_ = 50 +# bev_w_ = 50 +bev_h_ = 80 +bev_w_ = 40 +queue_length = 1 # each sequence contains `queue_length` frames. + +model = dict( + type='MapTR', + use_grid_mask=True, + video_test_mode=False, + pretrained=dict(img='ckpts/resnet18-f37072fd.pth'), + img_backbone=dict( + type='ResNet', + depth=18, + num_stages=4, + out_indices=(3,), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[512], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='MapTRHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_vec=100, + num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox + num_pts_per_gt_vec=fixed_ptsnum_per_gt_line, + dir_interval=1, + query_embed_type='instance_pts', + transform_method='minmax', + gt_shift_pts_pattern='v2', + num_classes=num_map_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + code_size=2, + code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='MapTRPerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=1, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='GeometrySptialCrossAttention', + pc_range=point_cloud_range, + attention=dict( + type='GeometryKernelAttention', + embed_dims=_dim_, + num_heads=4, + 
dilation=1, + kernel_size=(3,5), + num_levels=_num_levels_, + im2col_step=192), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='MapTRDecoder', + num_layers=2, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=4, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1, + im2col_step=192), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='MapTRNMSFreeCoder', + # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35], + pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=num_map_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_pts=dict(type='PtsL1Loss', + loss_weight=5.0), + loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='MapTRAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + # reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head. 
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', + weight=5), + pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesLocalMapDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.2]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.2]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=24, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + queue_length=queue_length, 
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=4e-3, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=50, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 110 +# total_epochs = 50 +# evaluation = dict(interval=1, pipeline=test_pipeline) +evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +fp16 = dict(loss_scale=512.) 
+checkpoint_config = dict(interval=5) diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_fusion_24e.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_fusion_24e.py new file mode 100644 index 0000000000000000000000000000000000000000..b1fdb8c6e20f3b91ce1eaf65edd348a7df16f492 --- /dev/null +++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_fusion_24e.py @@ -0,0 +1,342 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] +lidar_point_cloud_range = [-15.0, -30.0, -5.0, 15.0, 30.0, 3.0] +voxel_size = [0.1, 0.1, 0.2] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +# map has classes: divider, ped_crossing, boundary +map_classes = ['divider', 'ped_crossing','boundary'] +# fixed_ptsnum_per_line = 20 +# map_classes = ['divider',] +fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 +fixed_ptsnum_per_pred_line = 20 +eval_use_same_gt_sample_num_flag=True +num_map_classes = len(map_classes) + +input_modality = dict( + use_lidar=True, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +# bev_h_ = 50 +# bev_w_ = 50 +bev_h_ = 200 +bev_w_ = 100 +queue_length = 1 # each sequence contains `queue_length` frames. 
+ +model = dict( + type='MapTR', + use_grid_mask=True, + video_test_mode=False, + modality='fusion', + lidar_encoder=dict( + voxelize=dict(max_num_points=10,point_cloud_range=lidar_point_cloud_range,voxel_size=voxel_size,max_voxels=[90000, 120000]), + backbone=dict( + type='SparseEncoder', + in_channels=5, + sparse_shape=[300, 600, 41], + output_channels=128, + order=('conv', 'norm', 'act'), + encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, + 128)), + encoder_paddings=([0, 0, 1], [0, 0, 1], [0, 0, [1, 1, 0]], [0, 0]), + block_type='basicblock' + ), + ), + pretrained=dict(img='ckpts/resnet50-19c8e357.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='MapTRHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_vec=50, + num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox + num_pts_per_gt_vec=fixed_ptsnum_per_gt_line, + dir_interval=1, + query_embed_type='instance_pts', + transform_method='minmax', + gt_shift_pts_pattern='v2', + num_classes=num_map_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + code_size=2, + code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='MapTRPerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + modality='fusion', + fuser=dict( + type='ConvFuser', + in_channels=[_dim_, 256], + out_channels=_dim_, + ), + encoder=dict( + type='BEVFormerEncoder', + num_layers=1, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + 
type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='GeometrySptialCrossAttention', + pc_range=point_cloud_range, + attention=dict( + type='GeometryKernelAttention', + embed_dims=_dim_, + num_heads=4, + dilation=1, + kernel_size=(3,5), + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='MapTRDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='MapTRNMSFreeCoder', + # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35], + pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=num_map_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_pts=dict(type='PtsL1Loss', + loss_weight=5.0), + loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='MapTRAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + 
# reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head. + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', + weight=5), + pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesLocalMapDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + +reduce_beams=32 +load_dim=5 +use_dim=5 + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='CustomLoadPointsFromFile', coord_type='LIDAR', load_dim=load_dim, use_dim=use_dim, reduce_beams=reduce_beams), + dict(type='CustomLoadPointsFromMultiSweeps', sweeps_num=9, load_dim=load_dim, use_dim=use_dim, reduce_beams=reduce_beams, pad_empty_sweeps=True, remove_close=True), + dict(type='CustomPointsRangeFilter', point_cloud_range=lidar_point_cloud_range), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'points']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='CustomLoadPointsFromFile', coord_type='LIDAR', load_dim=load_dim, use_dim=use_dim, reduce_beams=reduce_beams), + dict(type='CustomLoadPointsFromMultiSweeps', sweeps_num=9, load_dim=load_dim, use_dim=use_dim, reduce_beams=reduce_beams, pad_empty_sweeps=True, remove_close=True), + dict(type='CustomPointsRangeFilter', 
point_cloud_range=lidar_point_cloud_range), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img', 'points']) + ]) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + queue_length=queue_length, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=6e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +# total_epochs = 50 +# evaluation = dict(interval=1, pipeline=test_pipeline) +evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +fp16 = dict(loss_scale=512.) 
+checkpoint_config = dict(interval=1) +find_unused_parameters=True + diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_110e.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_110e.py new file mode 100644 index 0000000000000000000000000000000000000000..9457c677aca7d378d089f04c3003658ff2af08d6 --- /dev/null +++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_110e.py @@ -0,0 +1,310 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] +voxel_size = [0.15, 0.15, 4] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +# map has classes: divider, ped_crossing, boundary +map_classes = ['divider', 'ped_crossing','boundary'] +# fixed_ptsnum_per_line = 20 +# map_classes = ['divider',] +fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 +fixed_ptsnum_per_pred_line = 20 +eval_use_same_gt_sample_num_flag=True +num_map_classes = len(map_classes) + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +# bev_h_ = 50 +# bev_w_ = 50 +bev_h_ = 200 +bev_w_ = 100 +queue_length = 1 # each sequence contains `queue_length` frames. 
+ +model = dict( + type='MapTR', + use_grid_mask=True, + video_test_mode=False, + pretrained=dict(img='ckpts/resnet50-19c8e357.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='MapTRHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_vec=50, + num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox + num_pts_per_gt_vec=fixed_ptsnum_per_gt_line, + dir_interval=1, + query_embed_type='instance_pts', + transform_method='minmax', + gt_shift_pts_pattern='v2', + num_classes=num_map_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + code_size=2, + code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='MapTRPerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=1, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='GeometrySptialCrossAttention', + pc_range=point_cloud_range, + attention=dict( + type='GeometryKernelAttention', + embed_dims=_dim_, + num_heads=4, + dilation=1, + kernel_size=(3,5), + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='MapTRDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + 
type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='MapTRNMSFreeCoder', + # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35], + pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=num_map_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_pts=dict(type='PtsL1Loss', + loss_weight=5.0), + loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='MapTRAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + # reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head. 
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', + weight=5), + pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesLocalMapDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + queue_length=queue_length, + 
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=6e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 110 +# total_epochs = 50 +# evaluation = dict(interval=1, pipeline=test_pipeline) +evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +fp16 = dict(loss_scale=512.) 
+checkpoint_config = dict(interval=5) diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e.py new file mode 100644 index 0000000000000000000000000000000000000000..5fcded904a2d7b72e7d9e283f6cd6d2e39efa591 --- /dev/null +++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e.py @@ -0,0 +1,310 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] +voxel_size = [0.15, 0.15, 4] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +# map has classes: divider, ped_crossing, boundary +map_classes = ['divider', 'ped_crossing','boundary'] +# fixed_ptsnum_per_line = 20 +# map_classes = ['divider',] +fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 +fixed_ptsnum_per_pred_line = 20 +eval_use_same_gt_sample_num_flag=True +num_map_classes = len(map_classes) + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +# bev_h_ = 50 +# bev_w_ = 50 +bev_h_ = 200 +bev_w_ = 100 +queue_length = 1 # each sequence contains `queue_length` frames. 
+ +model = dict( + type='MapTR', + use_grid_mask=True, + video_test_mode=False, + pretrained=dict(img='ckpts/resnet50-19c8e357.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='MapTRHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_vec=50, + num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox + num_pts_per_gt_vec=fixed_ptsnum_per_gt_line, + dir_interval=1, + query_embed_type='instance_pts', + transform_method='minmax', + gt_shift_pts_pattern='v2', + num_classes=num_map_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + code_size=2, + code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='MapTRPerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=1, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='GeometrySptialCrossAttention', + pc_range=point_cloud_range, + attention=dict( + type='GeometryKernelAttention', + embed_dims=_dim_, + num_heads=4, + dilation=1, + kernel_size=(3,5), + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='MapTRDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + 
type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='MapTRNMSFreeCoder', + # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35], + pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=num_map_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_pts=dict(type='PtsL1Loss', + loss_weight=5.0), + loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='MapTRAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + # reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head. 
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', + weight=5), + pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesLocalMapDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + queue_length=queue_length, + 
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=6e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +# total_epochs = 50 +# evaluation = dict(interval=1, pipeline=test_pipeline) +evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +fp16 = dict(loss_scale=512.) 
+checkpoint_config = dict(interval=1) diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer.py new file mode 100644 index 0000000000000000000000000000000000000000..12976bdb5a2c1118f89fb2a31d22806dc3c1a16a --- /dev/null +++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer.py @@ -0,0 +1,308 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] +voxel_size = [0.15, 0.15, 4] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +# map has classes: divider, ped_crossing, boundary +map_classes = ['divider', 'ped_crossing','boundary'] +# fixed_ptsnum_per_line = 20 +# map_classes = ['divider',] +fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 +fixed_ptsnum_per_pred_line = 20 +eval_use_same_gt_sample_num_flag=True +num_map_classes = len(map_classes) + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +# bev_h_ = 50 +# bev_w_ = 50 +bev_h_ = 200 +bev_w_ = 100 +queue_length = 1 # each sequence contains `queue_length` frames. 
+ +model = dict( + type='MapTR', + use_grid_mask=True, + video_test_mode=False, + pretrained=dict(img='ckpts/resnet50-19c8e357.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='MapTRHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_vec=50, + num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox + num_pts_per_gt_vec=fixed_ptsnum_per_gt_line, + dir_interval=1, + query_embed_type='instance_pts', + transform_method='minmax', + gt_shift_pts_pattern='v5', #3 + num_classes=num_map_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + code_size=2, + code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='MapTRPerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=1, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='MapTRDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + 
embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='MapTRNMSFreeCoder', + # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35], + pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=num_map_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_pts=dict(type='PtsL1Loss', + loss_weight=5.0), + loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='MapTRAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + # reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head. 
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', + weight=5), + pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesLocalMapDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + queue_length=queue_length, + 
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=6e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +# total_epochs = 50 +# evaluation = dict(interval=1, pipeline=test_pipeline) +evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +fp16 = dict(loss_scale=512.) 
+checkpoint_config = dict(interval=1) diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer_t4.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer_t4.py new file mode 100644 index 0000000000000000000000000000000000000000..fa65a4cbba579ac465473169b6e442cdd97b32b5 --- /dev/null +++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer_t4.py @@ -0,0 +1,309 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] +voxel_size = [0.15, 0.15, 4] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +# map has classes: divider, ped_crossing, boundary +map_classes = ['divider', 'ped_crossing','boundary'] +# fixed_ptsnum_per_line = 20 +# map_classes = ['divider',] +fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 +fixed_ptsnum_per_pred_line = 20 +eval_use_same_gt_sample_num_flag=True +num_map_classes = len(map_classes) + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +# bev_h_ = 50 +# bev_w_ = 50 +bev_h_ = 200 +bev_w_ = 100 +queue_length = 4 # each sequence contains `queue_length` frames. 
+ +model = dict( + type='MapTR', + use_grid_mask=True, + video_test_mode=False, + pretrained=dict(img='ckpts/resnet50-19c8e357.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='MapTRHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_vec=50, + num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox + num_pts_per_gt_vec=fixed_ptsnum_per_gt_line, + dir_interval=1, + query_embed_type='instance_pts', + transform_method='minmax', + gt_shift_pts_pattern='v2', + num_classes=num_map_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + code_size=2, + code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='MapTRPerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + len_can_bus=6, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=1, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='MapTRDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + 
type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='MapTRNMSFreeCoder', + # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35], + pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=num_map_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_pts=dict(type='PtsL1Loss', + loss_weight=5.0), + loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='MapTRAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + # reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head. 
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', + weight=5), + pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesLocalMapDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + queue_length=queue_length, + 
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=6e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +# total_epochs = 50 +# evaluation = dict(interval=1, pipeline=test_pipeline) +evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +fp16 = dict(loss_scale=512.) 
+checkpoint_config = dict(interval=1) diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevpool.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevpool.py new file mode 100644 index 0000000000000000000000000000000000000000..189d67f06f7aaddc3c7523324b2f4f4ff8e52017 --- /dev/null +++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevpool.py @@ -0,0 +1,290 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +point_cloud_range = [-15.0, -30.0,-10.0, 15.0, 30.0, 10.0] +voxel_size = [0.15, 0.15, 20.0] +dbound=[1.0, 35.0, 0.5] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +# map has classes: divider, ped_crossing, boundary +map_classes = ['divider', 'ped_crossing','boundary'] +# fixed_ptsnum_per_line = 20 +# map_classes = ['divider',] +fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 +fixed_ptsnum_per_pred_line = 20 +eval_use_same_gt_sample_num_flag=True +num_map_classes = len(map_classes) + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +# bev_h_ = 50 +# bev_w_ = 50 +bev_h_ = 200 +bev_w_ = 100 +queue_length = 1 # each sequence contains `queue_length` frames. 
+ +model = dict( + type='MapTR', + use_grid_mask=True, + video_test_mode=False, + pretrained=dict(img='ckpts/resnet50-19c8e357.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='MapTRHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_vec=50, + num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox + num_pts_per_gt_vec=fixed_ptsnum_per_gt_line, + dir_interval=1, + query_embed_type='instance_pts', + transform_method='minmax', + gt_shift_pts_pattern='v2', + num_classes=num_map_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + code_size=2, + code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='MapTRPerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='LSSTransform', + in_channels=_dim_, + out_channels=_dim_, + feat_down_sample=32, + pc_range=point_cloud_range, + voxel_size=voxel_size, + dbound=dbound, + downsample=2), + decoder=dict( + type='MapTRDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='MapTRNMSFreeCoder', + # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35], + 
pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=num_map_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_pts=dict(type='PtsL1Loss', + loss_weight=5.0), + loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='MapTRAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + # reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head. 
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', + weight=5), + pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesLocalMapDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + queue_length=queue_length, + 
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=6e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +# total_epochs = 50 +# evaluation = dict(interval=1, pipeline=test_pipeline) +evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +fp16 = dict(loss_scale=512.) 
+checkpoint_config = dict(interval=1) diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_t4.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_t4.py new file mode 100644 index 0000000000000000000000000000000000000000..fd92fc86b26bfe4c08a77b69ff66c0ef93b818bc --- /dev/null +++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_t4.py @@ -0,0 +1,308 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] +voxel_size = [0.15, 0.15, 4] + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +# map has classes: divider, ped_crossing, boundary +map_classes = ['divider', 'ped_crossing','boundary'] +# fixed_ptsnum_per_line = 20 +# map_classes = ['divider',] +fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 +fixed_ptsnum_per_pred_line = 20 +eval_use_same_gt_sample_num_flag=True +num_map_classes = len(map_classes) + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +# bev_h_ = 50 +# bev_w_ = 50 +bev_h_ = 200 +bev_w_ = 100 +queue_length = 4 # each sequence contains `queue_length` frames. 
+ +model = dict( + type='MapTR', + use_grid_mask=True, + video_test_mode=False, + pretrained=dict(img='ckpts/resnet50-19c8e357.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='MapTRHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_vec=50, + num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox + num_pts_per_gt_vec=fixed_ptsnum_per_gt_line, + dir_interval=1, + query_embed_type='instance_pts', + transform_method='minmax', + gt_shift_pts_pattern='v2', + num_classes=num_map_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + code_size=2, + code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='MapTRPerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + len_can_bus=6, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=1, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='GeometrySptialCrossAttention', + pc_range=point_cloud_range, + attention=dict( + type='GeometryKernelAttention', + embed_dims=_dim_, + num_heads=4, + dilation=1, + kernel_size=(3,5), + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='MapTRDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ 
+ dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='MapTRNMSFreeCoder', + # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35], + pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=num_map_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_pts=dict(type='PtsL1Loss', + loss_weight=5.0), + loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='MapTRAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + # reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head. 
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', + weight=5), + pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesLocalMapDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + queue_length=queue_length, + 
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=6e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +# total_epochs = 50 +# evaluation = dict(interval=1, pipeline=test_pipeline) +evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# fp16 = dict(loss_scale=512.) 
+checkpoint_config = dict(interval=1) diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_av2_24e.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_av2_24e.py new file mode 100644 index 0000000000000000000000000000000000000000..8bb0958574d19aacd6232a60433cb68197eb9853 --- /dev/null +++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_av2_24e.py @@ -0,0 +1,315 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +point_cloud_range = [-30.0, -15.0, -2.0, 30.0, 15.0, 2.0] +voxel_size = [0.15, 0.15, 4] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +# map has classes: divider, ped_crossing, boundary +map_classes = ['divider', 'ped_crossing','boundary'] +# fixed_ptsnum_per_line = 20 +# map_classes = ['divider',] +fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 +fixed_ptsnum_per_pred_line = 20 +eval_use_same_gt_sample_num_flag=True +num_map_classes = len(map_classes) + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +# bev_h_ = 50 +# bev_w_ = 50 +bev_h_ = 100 +bev_w_ = 200 +queue_length = 1 # each sequence contains `queue_length` frames. 
+ +model = dict( + type='MapTR', + use_grid_mask=True, + video_test_mode=False, + pretrained=dict(img='ckpts/resnet50-19c8e357.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='MapTRHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_vec=50, + num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox + num_pts_per_gt_vec=fixed_ptsnum_per_gt_line, + dir_interval=1, + query_embed_type='instance_pts', + transform_method='minmax', + gt_shift_pts_pattern='v2', + num_classes=num_map_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + code_size=2, + code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='MapTRPerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + num_cams=7, + encoder=dict( + type='BEVFormerEncoder', + num_layers=1, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='GeometrySptialCrossAttention', + pc_range=point_cloud_range, + num_cams=7, + attention=dict( + type='GeometryKernelAttention', + embed_dims=_dim_, + num_heads=4, + dilation=1, + kernel_size=(3,5), + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='MapTRDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + 
attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='MapTRNMSFreeCoder', + # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + post_center_range=[-35, -20, -35, -20, 35, 20, 35, 20], + pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=num_map_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_pts=dict(type='PtsL1Loss', + loss_weight=5.0), + loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='MapTRAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + # reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head. 
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', + weight=5), + pc_range=point_cloud_range)))) + +dataset_type = 'CustomAV2LocalMapDataset' +data_root = 'data/argoverse2/sensor/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True, padding=True), + dict(type='RandomScaleImageMultiViewImage', scales=[0.3]), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['img']) +] + +test_pipeline = [ + dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True, padding=True), + dict(type='RandomScaleImageMultiViewImage', scales=[0.3]), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(2048, 2048), # 2048*0.3, 2048*0.3 + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=8, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'av2_map_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + queue_length=queue_length, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'av2_map_infos_val.pkl', + # ann_file=data_root + 'av2_map_infos_train.pkl', + map_ann_file=data_root + 'av2_map_anns_val.json', + load_interval=4, # av2 uses 10 Hz, set to 5, 2HZ the same as nuscenes, + # load_interval=1, # TODO debug + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'av2_map_infos_val.pkl', + # ann_file=data_root + 'av2_map_infos_train.pkl', + map_ann_file=data_root + 'av2_map_anns_val.json', + load_interval=4, # av2 uses 10 Hz, set to 5, 2HZ the same as nuscenes, + # load_interval=1, # TODO debug + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag, + padding_value=-10000, + map_classes=map_classes, + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=6e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +# total_epochs = 50 +# evaluation = dict(interval=1, pipeline=test_pipeline) +evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + 
+log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +fp16 = dict(loss_scale=512.) +checkpoint_config = dict(interval=1) diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53e32add3335e5f1f7e007457ff740bfcf1c91d5 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/__init__.py @@ -0,0 +1,13 @@ +from .core.bbox.assigners.hungarian_assigner_3d import HungarianAssigner3D +from .core.bbox.coders.nms_free_coder import NMSFreeCoder +from .core.bbox.match_costs import BBox3DL1Cost +from .core.evaluation.eval_hooks import CustomDistEvalHook +from .datasets.pipelines import ( + PhotoMetricDistortionMultiViewImage, PadMultiViewImage, + NormalizeMultiviewImage, CustomCollect3D) +from .models.backbones.vovnet import VoVNet +from .models.utils import * +from .models.opt.adamw import AdamW2 +from .bevformer import * +from .maptr import * +from .models.backbones.efficientnet import EfficientNet \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..27df18f846ee77954d3aa11c4c4613aabeea06d4 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/__init__.py @@ -0,0 +1,6 @@ + +from .dense_heads import * +from .detectors import * +from .modules import * +from .runner import * +from .hooks import * diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..15dff22b7478a0f30151d376d41f3dc46e88ba7d --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/__init__.py @@ 
-0,0 +1,3 @@ +from .train import custom_train_model +from .mmdet_train import custom_train_detector +# from .test import custom_multi_gpu_test \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py new file mode 100644 index 0000000000000000000000000000000000000000..e57bd225dc33d631849a3aef8db2bae217520658 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py @@ -0,0 +1,200 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import random +import warnings + +import numpy as np +import torch +import torch.distributed as dist +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, + Fp16OptimizerHook, OptimizerHook, build_optimizer, + build_runner, get_dist_info) +from mmcv.utils import build_from_cfg + +from mmdet.core import EvalHook + +from mmdet.datasets import (build_dataset, + replace_ImageToTensor) +from mmdet.utils import get_root_logger +import time +import os.path as osp +from projects.mmdet3d_plugin.datasets.builder import build_dataloader +from projects.mmdet3d_plugin.core.evaluation.eval_hooks import CustomDistEvalHook +from projects.mmdet3d_plugin.datasets import custom_build_dataset +def custom_train_detector(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + #assert len(dataset)==1s + if 'imgs_per_gpu' in cfg.data: + logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. 
' + 'Please use "samples_per_gpu" instead') + if 'samples_per_gpu' in cfg.data: + logger.warning( + f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' + f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' + f'={cfg.data.imgs_per_gpu} is used in this experiments') + else: + logger.warning( + 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' + f'{cfg.data.imgs_per_gpu} in this experiments') + cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu + + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # cfg.gpus will be ignored if distributed + len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) for ds in dataset + ] + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + if eval_model is not None: + eval_model = MMDistributedDataParallel( + eval_model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + if eval_model is not None: + eval_model = MMDataParallel( + eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + + if 'runner' not in cfg: + cfg.runner = { + 'type': 'EpochBasedRunner', + 'max_epochs': cfg.total_epochs + } + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + else: + if 
'total_epochs' in cfg: + assert cfg.total_epochs == cfg.runner.max_epochs + if eval_model is not None: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + eval_model=eval_model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + else: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + + # register profiler hook + #trace_config = dict(type='tb_trace', dir_name='work_dir') + #profiler_config = dict(on_trace_ready=trace_config) + #runner.register_profiler_hook(profiler_config) + + if distributed: + if isinstance(runner, EpochBasedRunner): + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + # Support batch_size > 1 in validation + val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) + if val_samples_per_gpu > 1: + assert False + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.val.pipeline = replace_ImageToTensor( + cfg.data.val.pipeline) + val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True)) + + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=val_samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + shuffler_sampler=cfg.data.shuffler_sampler, # 
dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' + eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_')) + eval_hook = CustomDistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + + # user-defined hooks + if cfg.get('custom_hooks', None): + custom_hooks = cfg.custom_hooks + assert isinstance(custom_hooks, list), \ + f'custom_hooks expect list type, but got {type(custom_hooks)}' + for hook_cfg in cfg.custom_hooks: + assert isinstance(hook_cfg, dict), \ + 'Each item in custom_hooks expects dict type, but got ' \ + f'{type(hook_cfg)}' + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = build_from_cfg(hook_cfg, HOOKS) + runner.register_hook(hook, priority=priority) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow) + diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/test.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/test.py new file mode 100644 index 0000000000000000000000000000000000000000..03acb14afe79cbec4f0463cd4865fcc7dd07a50f --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/test.py @@ -0,0 +1,164 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import os.path as osp +import pickle +import shutil +import tempfile +import time + +import mmcv +import torch +import torch.distributed as dist +from mmcv.image import tensor2imgs +from mmcv.runner import get_dist_info + +from mmdet.core import encode_mask_results + + +import mmcv +import numpy as np +import pycocotools.mask as mask_util + +def custom_encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. Semantic Masks only + Args: + mask_results (list | tuple[list]): bitmap mask results. + In mask scoring rcnn, mask_results is a tuple of (segm_results, + segm_cls_score). + Returns: + list | tuple: RLE encoded mask. + """ + cls_segms = mask_results + num_classes = len(cls_segms) + encoded_mask_results = [] + for i in range(len(cls_segms)): + encoded_mask_results.append( + mask_util.encode( + np.array( + cls_segms[i][:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + return [encoded_mask_results] + +def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + Returns: + list: The prediction results. 
+ """ + model.eval() + bbox_results = [] + mask_results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. + have_mask = False + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + # encode mask results + if isinstance(result, dict): + if 'bbox_results' in result.keys(): + bbox_result = result['bbox_results'] + batch_size = len(result['bbox_results']) + bbox_results.extend(bbox_result) + if 'mask_results' in result.keys() and result['mask_results'] is not None: + mask_result = custom_encode_mask_results(result['mask_results']) + mask_results.extend(mask_result) + have_mask = True + else: + batch_size = len(result) + bbox_results.extend(result) + + #if isinstance(result[0], tuple): + # assert False, 'this code is for instance segmentation, which our code will not utilize.' 
+ # result = [(bbox_results, encode_mask_results(mask_results)) + # for bbox_results, mask_results in result] + if rank == 0: + + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + bbox_results = collect_results_gpu(bbox_results, len(dataset)) + if have_mask: + mask_results = collect_results_gpu(mask_results, len(dataset)) + else: + mask_results = None + else: + bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) + tmpdir = tmpdir+'_mask' if tmpdir is not None else None + if have_mask: + mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) + else: + mask_results = None + + if mask_results is None: + return bbox_results + return {'bbox_results': bbox_results, 'mask_results': mask_results} + + +def collect_results_cpu(result_part, size, tmpdir=None): + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + ''' + bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample, + ''' + #for res in zip(*part_list): + 
for res in part_list: + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): # GPU gather is not implemented; delegates to the tmp-dir collector + return collect_results_cpu(result_part, size) \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/train.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/train.py new file mode 100644 index 0000000000000000000000000000000000000000..f9391e606f29961875b48eebe36d3b9d415b6290 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/train.py @@ -0,0 +1,67 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from .mmdet_train import custom_train_detector +from mmseg.apis import train_segmentor +from mmdet.apis import train_detector + +def custom_train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. + """ + if cfg.model.type in ['EncoderDecoder3D']: + assert False + else: + custom_train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + eval_model=eval_model, + meta=meta) + + +def train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. 
+ """ + if cfg.model.type in ['EncoderDecoder3D']: + train_segmentor( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) + else: + train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6823adfb593d67f27af4af2207a515af4cbab6f5 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py @@ -0,0 +1 @@ +from .bevformer_head import BEVFormerHead \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py new file mode 100644 index 0000000000000000000000000000000000000000..15691fd9040c07fad2c04991d398ff586996597c --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py @@ -0,0 +1,523 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Linear, bias_init_with_prob +from mmcv.utils import TORCH_VERSION, digit_version + +from mmdet.core import (multi_apply, multi_apply, reduce_mean) +from mmdet.models.utils.transformer import inverse_sigmoid +from mmdet.models import HEADS +from mmdet.models.dense_heads import DETRHead +from mmdet3d.core.bbox.coders import build_bbox_coder +from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +from mmcv.cnn.bricks.transformer import build_positional_encoding +from mmcv.runner import force_fp32, auto_fp16 +from projects.mmdet3d_plugin.models.utils.bricks import run_time +import numpy as np +import mmcv +import cv2 as cv +from projects.mmdet3d_plugin.models.utils.visual import save_tensor + + +@HEADS.register_module() +class BEVFormerHead(DETRHead): + """Head of Detr3D. + Args: + with_box_refine (bool): Whether to refine the reference points + in the decoder. Defaults to False. + as_two_stage (bool) : Whether to generate the proposal from + the outputs of encoder. + transformer (obj:`ConfigDict`): ConfigDict is used for building + the Encoder and Decoder. + bev_h, bev_w (int): spatial shape of BEV queries. 
+ """ + + def __init__(self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=None, + bev_h=30, + bev_w=30, + **kwargs): + + self.bev_h = bev_h + self.bev_w = bev_w + self.fp16_enabled = False + + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + if 'code_size' in kwargs: + self.code_size = kwargs['code_size'] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - self.pc_range[1] + self.num_cls_fcs = num_cls_fcs - 1 + super(BEVFormerHead, self).__init__( + *args, transformer=transformer, **kwargs) + self.code_weights = nn.Parameter(torch.tensor( + self.code_weights, requires_grad=False), requires_grad=False) + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + cls_branch = [] + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + fc_cls = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. 
+ num_pred = (self.transformer.decoder.num_layers + 1) if \ + self.as_two_stage else self.transformer.decoder.num_layers + + if self.with_box_refine: + self.cls_branches = _get_clones(fc_cls, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + else: + self.cls_branches = nn.ModuleList( + [fc_cls for _ in range(num_pred)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(num_pred)]) + + if not self.as_two_stage: + self.bev_embedding = nn.Embedding( + self.bev_h * self.bev_w, self.embed_dims) + self.query_embedding = nn.Embedding(self.num_query, + self.embed_dims * 2) + + def init_weights(self): + """Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + + @auto_fp16(apply_to=('mlvl_feats')) + def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False): + """Forward function. + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 5D-tensor with shape + (B, N, C, H, W). + prev_bev: previous bev featues + only_bev: only compute BEV features with encoder. + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. 
+ """ + + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + object_query_embeds = self.query_embedding.weight.to(dtype) + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + + if only_bev: # only use encoder to obtain BEV features, TODO: refine the workaround + return self.transformer.get_bev_features( + mlvl_feats, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + img_metas=img_metas, + prev_bev=prev_bev, + ) + else: + outputs = self.transformer( + mlvl_feats, + bev_queries, + object_query_embeds, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None, + img_metas=img_metas, + prev_bev=prev_bev + ) + + bev_embed, hs, init_reference, inter_references = outputs + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + + # TODO: check the shape of reference + assert reference.shape[-1] == 3 + tmp[..., 0:2] += reference[..., 0:2] + tmp[..., 0:2] = tmp[..., 0:2].sigmoid() + tmp[..., 4:5] += reference[..., 2:3] + tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] - + self.pc_range[2]) + self.pc_range[2]) + + # 
TODO: check if using sigmoid + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + + outs = { + 'bev_embed': bev_embed, + 'all_cls_scores': outputs_classes, + 'all_bbox_preds': outputs_coords, + 'enc_cls_scores': None, + 'enc_bbox_preds': None, + } + + return outs + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_labels, + gt_bboxes, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. 
+ """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_c = gt_bboxes.shape[-1] + + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, gt_bboxes_ignore) + + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + return (labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds) + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. 
+ - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_labels_list, gt_bboxes_list, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + + def loss_single(self, + cls_scores, + bbox_preds, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. 
+ Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. + """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.code_weights + + loss_bbox = self.loss_bbox( + bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan, + :10], bbox_weights[isnotnan, :10], + avg_factor=num_total_pos) + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + return loss_cls, 
loss_bbox + + @force_fp32(apply_to=('preds_dicts')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + preds_dicts, + gt_bboxes_ignore=None, + img_metas=None): + """"Loss function. + Args: + + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. + enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' 
+ + all_cls_scores = preds_dicts['all_cls_scores'] + all_bbox_preds = preds_dicts['all_bbox_preds'] + enc_cls_scores = preds_dicts['enc_cls_scores'] + enc_bbox_preds = preds_dicts['enc_bbox_preds'] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + + gt_bboxes_list = [torch.cat( + (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), + dim=1).to(device) for gt_bboxes in gt_bboxes_list] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + losses_cls, losses_bbox = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds, + all_gt_bboxes_list, all_gt_labels_list, + all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + binary_labels_list = [ + torch.zeros_like(gt_labels_list[i]) + for i in range(len(all_gt_labels_list)) + ] + enc_loss_cls, enc_losses_bbox = \ + self.loss_single(enc_cls_scores, enc_bbox_preds, + gt_bboxes_list, binary_labels_list, gt_bboxes_ignore) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], + losses_bbox[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + num_dec_layer += 1 + return loss_dict + + @force_fp32(apply_to=('preds_dicts')) + def get_bboxes(self, preds_dicts, img_metas, rescale=False): + """Generate bboxes from bbox head predictions. + Args: + preds_dicts (tuple[list[dict]]): Prediction results. + img_metas (list[dict]): Point cloud and image's meta info. 
+ Returns: + list[dict]: Decoded bbox, scores and labels after nms. + """ + + preds_dicts = self.bbox_coder.decode(preds_dicts) + + num_samples = len(preds_dicts) + ret_list = [] + for i in range(num_samples): + preds = preds_dicts[i] + bboxes = preds['bboxes'] + + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + + code_size = bboxes.shape[-1] + bboxes = img_metas[i]['box_type_3d'](bboxes, code_size) + scores = preds['scores'] + labels = preds['labels'] + + ret_list.append([bboxes, scores, labels]) + + return ret_list diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..78146f0642d52d604a433ae2eab6866fee23ab3e --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/__init__.py @@ -0,0 +1,2 @@ +from .bevformer import BEVFormer +from .bevformer_fp16 import BEVFormer_fp16 \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ea1e0743dc994529dbf0ef0523e28d616cba6236 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py @@ -0,0 +1,289 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
# ---------------------------------------------
#  Modified by Zhiqi Li
# ---------------------------------------------

try:
    # Unused (looks like an accidental IDE auto-import). Kept so the module
    # still exposes ``NO``, but guarded: tkinter is frequently missing on
    # headless training machines and must not make this module unimportable.
    from tkinter.messagebox import NO
except ImportError:
    NO = 'no'
import torch
from mmcv.runner import force_fp32, auto_fp16
from mmdet.models import DETECTORS
from mmdet3d.core import bbox3d2result
from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask
import time
import copy
import numpy as np
import mmdet3d
from projects.mmdet3d_plugin.models.utils.bricks import run_time


@DETECTORS.register_module()
class BEVFormer(MVXTwoStageDetector):
    """BEVFormer detector.

    Args:
        use_grid_mask (bool): Apply ``GridMask`` to the images before the
            backbone.
        video_test_mode (bool): Decide whether to use temporal information
            (the previous frame's BEV features) during inference.

    The remaining arguments are forwarded unchanged to
    ``MVXTwoStageDetector``.
    """

    def __init__(self,
                 use_grid_mask=False,
                 pts_voxel_layer=None,
                 pts_voxel_encoder=None,
                 pts_middle_encoder=None,
                 pts_fusion_layer=None,
                 img_backbone=None,
                 pts_backbone=None,
                 img_neck=None,
                 pts_neck=None,
                 pts_bbox_head=None,
                 img_roi_head=None,
                 img_rpn_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None,
                 video_test_mode=False
                 ):

        super(BEVFormer,
              self).__init__(pts_voxel_layer, pts_voxel_encoder,
                             pts_middle_encoder, pts_fusion_layer,
                             img_backbone, pts_backbone, img_neck, pts_neck,
                             pts_bbox_head, img_roi_head, img_rpn_head,
                             train_cfg, test_cfg, pretrained)
        self.grid_mask = GridMask(
            True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7)
        self.use_grid_mask = use_grid_mask
        self.fp16_enabled = False

        # Temporal state carried from one test frame to the next.
        self.video_test_mode = video_test_mode
        self.prev_frame_info = {
            'prev_bev': None,
            'scene_token': None,
            'prev_pos': 0,
            'prev_angle': 0,
        }

    def extract_img_feat(self, img, img_metas, len_queue=None):
        """Run backbone (and neck) on the images and restore the batch axes.

        Args:
            img (Tensor | None): Input images. A 5-D tensor is flattened to
                4-D (leading two axes merged) before the backbone.
            img_metas: Not used here.
            len_queue (int, optional): When given, the leading axis of
                ``img`` is interpreted as ``batch * len_queue`` and the
                features are reshaped to
                ``(batch, len_queue, num_views, C, H, W)``.

        Returns:
            list[Tensor] | None: One reshaped feature map per backbone/neck
            level, or ``None`` when ``img`` is ``None``.
        """
        # The None check has to come first: the previous code called
        # ``img.size(0)`` before testing for None.
        if img is None:
            return None

        B = img.size(0)
        if img.dim() == 5:
            if B == 1:
                # Drop only the batch axis, and out of place, so the
                # caller's tensor keeps its shape and a single-view input
                # keeps its view axis.
                img = img.squeeze(0)
            elif B > 1:
                B, N, C, H, W = img.size()
                img = img.reshape(B * N, C, H, W)
        if self.use_grid_mask:
            img = self.grid_mask(img)

        img_feats = self.img_backbone(img)
        if isinstance(img_feats, dict):
            img_feats = list(img_feats.values())
        if self.with_img_neck:
            img_feats = self.img_neck(img_feats)

        img_feats_reshaped = []
        for img_feat in img_feats:
            BN, C, H, W = img_feat.size()
            if len_queue is not None:
                img_feats_reshaped.append(
                    img_feat.view(B // len_queue, len_queue, BN // B,
                                  C, H, W))
            else:
                img_feats_reshaped.append(
                    img_feat.view(B, BN // B, C, H, W))
        return img_feats_reshaped

    @auto_fp16(apply_to=('img',))
    def extract_feat(self, img, img_metas=None, len_queue=None):
        """Extract image features; thin fp16-aware wrapper."""
        img_feats = self.extract_img_feat(img, img_metas, len_queue=len_queue)
        return img_feats

    def forward_pts_train(self,
                          pts_feats,
                          gt_bboxes_3d,
                          gt_labels_3d,
                          img_metas,
                          gt_bboxes_ignore=None,
                          prev_bev=None):
        """Run the bbox head and compute its losses.

        Args:
            pts_feats (list[torch.Tensor]): Features passed to the head.
            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
                boxes for each sample.
            gt_labels_3d (list[torch.Tensor]): Ground truth labels for the
                boxes of each sample.
            img_metas (list[dict]): Meta information of samples.
            gt_bboxes_ignore (list[torch.Tensor], optional): Accepted for
                interface compatibility; not forwarded to the head.
            prev_bev (torch.Tensor, optional): BEV features of the previous
                frame.

        Returns:
            dict: Losses returned by ``pts_bbox_head.loss``.
        """
        outs = self.pts_bbox_head(
            pts_feats, img_metas, prev_bev)
        loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
        losses = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas)
        return losses

    def forward_dummy(self, img):
        """Forward with placeholder metas.

        NOTE(review): ``forward_test`` indexes the meta dict
        (``img_metas[0][0]['scene_token']``), so passing ``None`` here
        cannot work as written; left unchanged pending a decision on what
        a valid dummy meta should contain.
        """
        dummy_metas = None
        return self.forward_test(img=img, img_metas=[[dummy_metas]])

    def forward(self, return_loss=True, **kwargs):
        """Dispatch to ``forward_train`` or ``forward_test``.

        Note this setting changes the expected inputs. When
        ``return_loss=True``, img and img_metas are single-nested (i.e.
        torch.Tensor and list[dict]), and when ``return_loss=False``, img
        and img_metas should be double nested (i.e. list[torch.Tensor],
        list[list[dict]]), with the outer list indicating test time
        augmentations.
        """
        if return_loss:
            return self.forward_train(**kwargs)
        else:
            return self.forward_test(**kwargs)

    def obtain_history_bev(self, imgs_queue, img_metas_list):
        """Compute BEV features of the history frames iteratively.

        Runs in eval mode under ``torch.no_grad()`` to save GPU memory.
        The module's previous train/eval mode is restored afterwards, even
        if an exception is raised.

        Args:
            imgs_queue (Tensor): 6-D tensor unpacked as
                ``(bs, len_queue, num_cams, C, H, W)``.
            img_metas_list (list): Per-sample containers indexed by queue
                position.

        Returns:
            The BEV features after the last history frame, or ``None`` if
            the queue is empty.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                prev_bev = None
                bs, len_queue, num_cams, C, H, W = imgs_queue.shape
                imgs_queue = imgs_queue.reshape(
                    bs * len_queue, num_cams, C, H, W)
                img_feats_list = self.extract_feat(
                    img=imgs_queue, len_queue=len_queue)
                for i in range(len_queue):
                    img_metas = [each[i] for each in img_metas_list]
                    img_feats = [each_scale[:, i]
                                 for each_scale in img_feats_list]
                    prev_bev = self.pts_bbox_head(
                        img_feats, img_metas, prev_bev, only_bev=True)
        finally:
            # Previously this was an unconditional ``self.train()``.
            self.train(was_training)
        return prev_bev

    @auto_fp16(apply_to=('img', 'points'))
    def forward_train(self,
                      points=None,
                      img_metas=None,
                      gt_bboxes_3d=None,
                      gt_labels_3d=None,
                      gt_labels=None,
                      gt_bboxes=None,
                      img=None,
                      proposals=None,
                      gt_bboxes_ignore=None,
                      img_depth=None,
                      img_mask=None,
                      ):
        """Forward training function.

        Args:
            points (list[torch.Tensor], optional): Not used here.
            img_metas (list, optional): Per-sample meta containers indexed
                by queue position.
            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
                Ground truth 3D boxes.
            gt_labels_3d (list[torch.Tensor], optional): Ground truth
                labels of 3D boxes.
            gt_labels, gt_bboxes, proposals, img_depth, img_mask: Not used
                here; accepted for interface compatibility.
            img (torch.Tensor): Image queue; axis 1 is the temporal queue.
                The last entry is the current frame, the earlier ones are
                history frames.
            gt_bboxes_ignore (list[torch.Tensor], optional): Forwarded to
                ``forward_pts_train``.

        Returns:
            dict: Losses of different branches.
        """
        len_queue = img.size(1)
        prev_img = img[:, :-1, ...]
        img = img[:, -1, ...]

        # With a single-frame queue there is no history: skip the history
        # pass instead of pushing an empty tensor through the backbone.
        if len_queue > 1:
            prev_img_metas = copy.deepcopy(img_metas)
            prev_bev = self.obtain_history_bev(prev_img, prev_img_metas)
        else:
            prev_bev = None

        img_metas = [each[len_queue - 1] for each in img_metas]
        img_feats = self.extract_feat(img=img, img_metas=img_metas)
        losses = dict()
        losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d,
                                            gt_labels_3d, img_metas,
                                            gt_bboxes_ignore, prev_bev)

        losses.update(losses_pts)
        return losses

    def forward_test(self, img_metas, img=None, **kwargs):
        """Single-frame inference with temporal state kept across calls.

        Args:
            img_metas (list[list[dict]]): Double-nested metas; only
                ``img_metas[0]`` is used. ``can_bus`` of the first meta is
                modified in place to hold the ego-motion delta.
            img (list[Tensor], optional): Double-nested images; only
                ``img[0]`` is used.

        Returns:
            list[dict]: Detection results from ``simple_test``.

        Raises:
            TypeError: If ``img_metas`` is not a list.
        """
        if not isinstance(img_metas, list):
            raise TypeError('{} must be a list, but got {}'.format(
                'img_metas', type(img_metas)))
        img = [img] if img is None else img

        if img_metas[0][0]['scene_token'] != self.prev_frame_info['scene_token']:
            # The first sample of each scene has no usable history.
            self.prev_frame_info['prev_bev'] = None
        # Remember which scene the cached state belongs to.
        self.prev_frame_info['scene_token'] = img_metas[0][0]['scene_token']

        # Do not use temporal information.
        if not self.video_test_mode:
            self.prev_frame_info['prev_bev'] = None

        # Get the delta of ego position and angle between two timestamps.
        # The absolute values are saved first because ``can_bus`` is
        # overwritten with the delta below.
        tmp_pos = copy.deepcopy(img_metas[0][0]['can_bus'][:3])
        tmp_angle = copy.deepcopy(img_metas[0][0]['can_bus'][-1])
        if self.prev_frame_info['prev_bev'] is not None:
            img_metas[0][0]['can_bus'][:3] -= self.prev_frame_info['prev_pos']
            img_metas[0][0]['can_bus'][-1] -= self.prev_frame_info['prev_angle']
        else:
            img_metas[0][0]['can_bus'][-1] = 0
            img_metas[0][0]['can_bus'][:3] = 0

        new_prev_bev, bbox_results = self.simple_test(
            img_metas[0], img[0], prev_bev=self.prev_frame_info['prev_bev'],
            **kwargs)
        # During inference, we save the BEV features and ego motion of each
        # timestamp.
        self.prev_frame_info['prev_pos'] = tmp_pos
        self.prev_frame_info['prev_angle'] = tmp_angle
        self.prev_frame_info['prev_bev'] = new_prev_bev
        return bbox_results

    def simple_test_pts(self, x, img_metas, prev_bev=None, rescale=False):
        """Run the head on features ``x`` and convert its boxes to results.

        Returns:
            tuple: ``(outs['bev_embed'], bbox_results)``.
        """
        outs = self.pts_bbox_head(x, img_metas, prev_bev=prev_bev)

        bbox_list = self.pts_bbox_head.get_bboxes(
            outs, img_metas, rescale=rescale)
        bbox_results = [
            bbox3d2result(bboxes, scores, labels)
            for bboxes, scores, labels in bbox_list
        ]
        return outs['bev_embed'], bbox_results

    def simple_test(self, img_metas, img=None, prev_bev=None, rescale=False):
        """Test function without augmentation.

        Returns:
            tuple: ``(new_prev_bev, bbox_list)`` where ``bbox_list`` holds
            one ``{'pts_bbox': ...}`` dict per sample.
        """
        img_feats = self.extract_feat(img=img, img_metas=img_metas)

        bbox_list = [dict() for _ in range(len(img_metas))]
        new_prev_bev, bbox_pts = self.simple_test_pts(
            img_feats, img_metas, prev_bev, rescale=rescale)
        for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
            result_dict['pts_bbox'] = pts_bbox
        return new_prev_bev, bbox_list

# diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..6f0628168b4c5f0143d159d0f606d87cc5ee0cf5
# --- /dev/null
# +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py
# @@ -0,0 +1,89 @@
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
#  Modified by Zhiqi Li
# ---------------------------------------------

try:
    # Unused (looks like an accidental IDE auto-import). Kept so the module
    # still exposes ``NO``, but guarded: tkinter is frequently missing on
    # headless training machines and must not make this module unimportable.
    from tkinter.messagebox import NO
except ImportError:
    NO = 'no'
import torch
from mmcv.runner import force_fp32, auto_fp16
from mmdet.models import DETECTORS
from mmdet3d.core import bbox3d2result
from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask
from projects.mmdet3d_plugin.bevformer.detectors.bevformer import BEVFormer
import time
import copy
import numpy as np
import mmdet3d
from projects.mmdet3d_plugin.models.utils.bricks import run_time


@DETECTORS.register_module()
class BEVFormer_fp16(BEVFormer):
    """FP16-capable variant of BEVFormer.

    The default BEVFormer currently cannot support FP16. In this variant
    ``forward_train`` receives ``prev_bev`` as an argument instead of
    computing it, and ``val_step`` is repurposed to produce it.
    """

    @auto_fp16(apply_to=('img', 'prev_bev', 'points'))
    def forward_train(self,
                      points=None,
                      img_metas=None,
                      gt_bboxes_3d=None,
                      gt_labels_3d=None,
                      gt_labels=None,
                      gt_bboxes=None,
                      img=None,
                      proposals=None,
                      gt_bboxes_ignore=None,
                      img_depth=None,
                      img_mask=None,
                      prev_bev=None,
                      ):
        """Forward training function.

        Args:
            points (list[torch.Tensor], optional): Not used here.
            img_metas (list[dict], optional): Meta information of each
                sample.
            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
                Ground truth 3D boxes.
            gt_labels_3d (list[torch.Tensor], optional): Ground truth
                labels of 3D boxes.
            gt_labels, gt_bboxes, proposals, img_depth, img_mask: Not used
                here; accepted for interface compatibility.
            img (torch.Tensor, optional): Images of the current frame.
            gt_bboxes_ignore (list[torch.Tensor], optional): Forwarded to
                ``forward_pts_train``.
            prev_bev (torch.Tensor, optional): BEV features of the previous
                frame, computed by the caller.

        Returns:
            dict: Losses of different branches.
        """
        img_feats = self.extract_feat(img=img, img_metas=img_metas)

        losses = dict()
        losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d,
                                            gt_labels_3d, img_metas,
                                            gt_bboxes_ignore,
                                            prev_bev=prev_bev)
        losses.update(losses_pts)
        return losses

    def val_step(self, data, optimizer):
        """Compute ``prev_bev`` for one frame.

        In BEVFormer_fp16 this ``val_step`` is used to infer the previous
        BEV features; it is not the standard ``val_step`` contract.

        Args:
            data (dict): Must contain ``img`` and ``img_metas``; may
                contain ``prev_bev``.
            optimizer: Not used.

        Returns:
            The head output obtained with ``only_bev=True``.
        """
        img = data['img']
        img_metas = data['img_metas']
        img_feats = self.extract_feat(img=img, img_metas=img_metas)
        prev_bev = data.get('prev_bev', None)
        prev_bev = self.pts_bbox_head(
            img_feats, img_metas, prev_bev=prev_bev, only_bev=True)
        return prev_bev

# \ No newline at end of file
# diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/hooks/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/hooks/__init__.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..aa04ec16df5b0bb9f21cadf22f9172c3cc9a58c1
# --- /dev/null
# +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/hooks/__init__.py
# @@ -0,0 +1 @@
from .custom_hooks import TransferWeight
# \ No newline at end of file
# diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..a98ad1cb8d57c41c7145f93037d30e9fb9444265
# --- /dev/null
# +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py
# @@ -0,0 +1,14 @@
from mmcv.runner.hooks.hook import HOOKS, Hook
from projects.mmdet3d_plugin.models.utils import run_time


@HOOKS.register_module()
class TransferWeight(Hook):
    """Copy the training model's weights into ``runner.eval_model``.

    Args:
        every_n_inters (int): Interval, in inner iterations, between two
            copies. (The parameter name is kept as-is for config
            compatibility.)
    """

    def __init__(self, every_n_inters=1):
        self.every_n_inters = every_n_inters

    def after_train_iter(self, runner):
        """Sync ``runner.eval_model`` with ``runner.model`` when due."""
        if self.every_n_inner_iters(runner, self.every_n_inters):
            runner.eval_model.load_state_dict(runner.model.state_dict())

# diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/__init__.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..09d1e8f30d095ef910cde4a3dff15f1bb8b45252
# --- /dev/null
# +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/__init__.py
# @@ -0,0 +1,7 @@
from .transformer import PerceptionTransformer
from .spatial_cross_attention import SpatialCrossAttention, MSDeformableAttention3D, MSIPM3D
from .temporal_self_attention import TemporalSelfAttention
from .encoder import BEVFormerEncoder, BEVFormerLayer
from .decoder import DetectionTransformerDecoder


# diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..0f6bdeec795e073933fd8834f4bac984764a841d
# --- /dev/null
# +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py
# @@ -0,0 +1,487 @@
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import copy +import warnings + +import torch +import torch.nn as nn + +from mmcv import ConfigDict, deprecated_api_warning +from mmcv.cnn import Linear, build_activation_layer, build_norm_layer +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + +from mmcv.cnn.bricks.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, + TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) + +# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file +try: + from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 + warnings.warn( + ImportWarning( + '``MultiScaleDeformableAttention`` has been moved to ' + '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 + '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 + 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 + )) +except ImportError: + warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' + '``mmcv.ops.multi_scale_deform_attn``, ' + 'You should install ``mmcv-full`` if you need this module. ') +from mmcv.cnn.bricks.transformer import build_feedforward_network, build_attention + + +@TRANSFORMER_LAYER.register_module() +class MyCustomBaseTransformerLayer(BaseModule): + """Base `TransformerLayer` for vision transformer. + It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . 
+ Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + """ + + def __init__(self, + attn_cfgs=None, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=True, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. 
') + ffn_cfgs[new_name] = kwargs[ori_name] + + super(MyCustomBaseTransformerLayer, self).__init__(init_cfg) + + self.batch_first = batch_first + + assert set(operation_order) & set( + ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ + set(operation_order), f'The operation_order of' \ + f' {self.__class__.__name__} should ' \ + f'contains all four operation type ' \ + f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" + + num_attn = operation_order.count('self_attn') + operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' + + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. 
+ attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index])) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + **kwargs contains some specific arguments of attentions. + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. 
+ key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. + """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query + + + +@TRANSFORMER_LAYER.register_module() +class MyCustomBaseTransformerLayerWithoutSelfAttn(BaseModule): + """Base `TransformerLayer` for vision transformer. 
+ It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. 
+ """ + + def __init__(self, + attn_cfgs=None, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=True, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. ') + ffn_cfgs[new_name] = kwargs[ori_name] + + super(MyCustomBaseTransformerLayerWithoutSelfAttn, self).__init__(init_cfg) + + self.batch_first = batch_first + + assert set(operation_order) & set( + ['norm', 'ffn', 'cross_attn']) == \ + set(operation_order), f'The operation_order of' \ + f' {self.__class__.__name__} should ' \ + f'contains all three operation type ' \ + f"{['norm', 'ffn', 'cross_attn']}" + + num_attn = operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' 
+ + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. + attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index])) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + **kwargs contains some specific arguments of attentions. + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. 
+ key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. + """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + 
identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/decoder.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..236d33c92cfd39f2a5c00e88b2dc90e17425b6e4 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/decoder.py @@ -0,0 +1,341 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import mmcv +import cv2 as cv +import copy +import warnings +from matplotlib import pyplot as plt +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +import math +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +from mmcv.utils import ext_loader +# from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ +# MultiScaleDeformableAttnFunction_fp16 + +from mx_driving.fused import npu_multi_scale_deformable_attn_function #3 + +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of 
sigmoid. + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetectionTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(DetectionTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + *args, + reference_points=None, + reg_branches=None, + key_padding_mask=None, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
+ """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + + reference_points_input = reference_points[..., :2].unsqueeze( + 2) # BS NUM_QUERY NUM_LEVEL 2 + output = layer( + output, + *args, + reference_points=reference_points_input, + key_padding_mask=key_padding_mask, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + + assert reference_points.shape[-1] == 3 + + new_reference_points = torch.zeros_like(reference_points) + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points[..., :2]) + new_reference_points[..., 2:3] = tmp[ + ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) + + new_reference_points = new_reference_points.sigmoid() + + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@ATTENTION.register_module() +class CustomMSDeformableAttention(BaseModule): + """An attention module used in Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 8. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. 
+ norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + im2col_step=64, + dropout=0.1, + batch_first=False, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) 
+ thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + flag='decoder', + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. 
+ spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get 
{reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + output = npu_multi_scale_deformable_attn_function(value, spatial_shapes, level_start_index, sampling_locations, attention_weights) #3 + + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + output = self.output_proj(output) + + if not self.batch_first: + # (num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/encoder.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..979e2206d2ba02c6495c74b49385bb93436a6131 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/encoder.py @@ -0,0 +1,404 @@ + +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from .custom_base_transformer_layer import MyCustomBaseTransformerLayer +import copy +import warnings +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +from mmcv.runner import force_fp32, auto_fp16 +import numpy as np +import torch +import cv2 as cv +import mmcv +from mmcv.utils import TORCH_VERSION, digit_version +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class BEVFormerEncoder(TransformerLayerSequence): + + """ + Attention with both self and cross + Implements the decoder in DETR transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, pc_range=None, num_points_in_pillar=4, return_intermediate=False, dataset_type='nuscenes', + **kwargs): + + super(BEVFormerEncoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + + self.num_points_in_pillar = num_points_in_pillar + self.pc_range = pc_range + self.fp16_enabled = False + + @staticmethod + def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d', bs=1, device='cuda', dtype=torch.float): + """Get the reference points used in SCA and TSA. + Args: + H, W: spatial shape of bev. + Z: hight of pillar. + D: sample D points uniformly from each pillar. + device (obj:`device`): The device where + reference_points should be. + Returns: + Tensor: reference points used in decoder, has \ + shape (bs, num_keys, num_levels, 2). 
+ """ + + # reference points in 3D space, used in spatial cross-attention (SCA) + if dim == '3d': + zs = torch.linspace(0.5, Z - 0.5, num_points_in_pillar, dtype=dtype, + device=device).view(-1, 1, 1).expand(num_points_in_pillar, H, W) / Z + xs = torch.linspace(0.5, W - 0.5, W, dtype=dtype, + device=device).view(1, 1, W).expand(num_points_in_pillar, H, W) / W + ys = torch.linspace(0.5, H - 0.5, H, dtype=dtype, + device=device).view(1, H, 1).expand(num_points_in_pillar, H, W) / H + ref_3d = torch.stack((xs, ys, zs), -1) + ref_3d = ref_3d.permute(0, 3, 1, 2).flatten(2).permute(0, 2, 1) + ref_3d = ref_3d[None].repeat(bs, 1, 1, 1) + return ref_3d + + # reference points on 2D bev plane, used in temporal self-attention (TSA). + elif dim == '2d': + ref_y, ref_x = torch.meshgrid( + torch.linspace( + 0.5, H - 0.5, H, dtype=dtype, device=device), + torch.linspace( + 0.5, W - 0.5, W, dtype=dtype, device=device) + ) + ref_y = ref_y.reshape(-1)[None] / H + ref_x = ref_x.reshape(-1)[None] / W + ref_2d = torch.stack((ref_x, ref_y), -1) + ref_2d = ref_2d.repeat(bs, 1, 1).unsqueeze(2) + return ref_2d + + # This function must use fp32!!! 
+ @force_fp32(apply_to=('reference_points', 'img_metas')) + def point_sampling(self, reference_points, pc_range, img_metas): + + lidar2img = [] + for img_meta in img_metas: + lidar2img.append(img_meta['lidar2img']) + lidar2img = np.asarray(lidar2img) + lidar2img = reference_points.new_tensor(lidar2img) # (B, N, 4, 4) + reference_points = reference_points.clone() + + reference_points[..., 0:1] = reference_points[..., 0:1] * \ + (pc_range[3] - pc_range[0]) + pc_range[0] + reference_points[..., 1:2] = reference_points[..., 1:2] * \ + (pc_range[4] - pc_range[1]) + pc_range[1] + reference_points[..., 2:3] = reference_points[..., 2:3] * \ + (pc_range[5] - pc_range[2]) + pc_range[2] + + reference_points = torch.cat( + (reference_points, torch.ones_like(reference_points[..., :1])), -1) + + reference_points = reference_points.permute(1, 0, 2, 3) + D, B, num_query = reference_points.size()[:3] + num_cam = lidar2img.size(1) + + reference_points = reference_points.view( + D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1) + + lidar2img = lidar2img.view( + 1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1) + + reference_points_cam = torch.matmul(lidar2img.to(torch.float32), + reference_points.to(torch.float32)).squeeze(-1) + eps = 1e-5 + + bev_mask = (reference_points_cam[..., 2:3] > eps) + reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum( + reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps) + + reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1] + reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0] + + bev_mask = (bev_mask & (reference_points_cam[..., 1:2] > 0.0) + & (reference_points_cam[..., 1:2] < 1.0) + & (reference_points_cam[..., 0:1] < 1.0) + & (reference_points_cam[..., 0:1] > 0.0)) + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + bev_mask = torch.nan_to_num(bev_mask) + else: + bev_mask = bev_mask.new_tensor( + np.nan_to_num(bev_mask.cpu().numpy())) + + 
reference_points_cam = reference_points_cam.permute(2, 1, 3, 0, 4) + bev_mask = bev_mask.permute(2, 1, 3, 0, 4).squeeze(-1) + + return reference_points_cam, bev_mask + + @auto_fp16() + def forward(self, + bev_query, + key, + value, + *args, + bev_h=None, + bev_w=None, + bev_pos=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + prev_bev=None, + shift=0., + **kwargs): + """Forward function for `TransformerDecoder`. + Args: + bev_query (Tensor): Input BEV query with shape + `(num_query, bs, embed_dims)`. + key & value (Tensor): Input multi-camera features with shape + (num_cam, num_value, bs, embed_dims) + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape (bs, num_query, 2). + valid_ratios (Tensor): The ratios of valid + points on the feature map, has shape + (bs, num_levels, 2) + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + + output = bev_query + intermediate = [] + + ref_3d = self.get_reference_points( + bev_h, bev_w, self.pc_range[5]-self.pc_range[2], self.num_points_in_pillar, dim='3d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + ref_2d = self.get_reference_points( + bev_h, bev_w, dim='2d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + + reference_points_cam, bev_mask = self.point_sampling( + ref_3d, self.pc_range, kwargs['img_metas']) + + # NOTE: the earlier code aliased `ref_2d` here (see the commented-out line below) and kept that as a known bug to reproduce the paper's results; this version clones instead, so the in-place `shift` add does not modify `ref_2d`. 
+ # shift_ref_2d = ref_2d # .clone() + shift_ref_2d = ref_2d.clone() + shift_ref_2d += shift[:, None, None, :] + + # (num_query, bs, embed_dims) -> (bs, num_query, embed_dims) + bev_query = bev_query.permute(1, 0, 2) + bev_pos = bev_pos.permute(1, 0, 2) + bs, len_bev, num_bev_level, _ = ref_2d.shape + if prev_bev is not None: + prev_bev = prev_bev.permute(1, 0, 2) + prev_bev = torch.stack( + [prev_bev, bev_query], 1).reshape(bs*2, len_bev, -1) + hybird_ref_2d = torch.stack([shift_ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + else: + hybird_ref_2d = torch.stack([ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + + for lid, layer in enumerate(self.layers): + output = layer( + bev_query, + key, + value, + *args, + bev_pos=bev_pos, + ref_2d=hybird_ref_2d, + ref_3d=ref_3d, + bev_h=bev_h, + bev_w=bev_w, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + reference_points_cam=reference_points_cam, + bev_mask=bev_mask, + prev_bev=prev_bev, + **kwargs) + + bev_query = output + if self.return_intermediate: + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output + + +@TRANSFORMER_LAYER.register_module() +class BEVFormerLayer(MyCustomBaseTransformerLayer): + """Implements decoder layer in DETR transformer. + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default:None + act_cfg (dict): The activation config for FFNs. 
Default: `dict(type='ReLU', inplace=True)`. + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default: 2. + """ + + def __init__(self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + ffn_num_fcs=2, + **kwargs): + super(BEVFormerLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs) + self.fp16_enabled = False + assert len(operation_order) == 6 + assert set(operation_order) == set( + ['self_attn', 'norm', 'cross_attn', 'ffn']) + + def forward(self, + query, + key=None, + value=None, + bev_pos=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + ref_2d=None, + ref_3d=None, + bev_h=None, + bev_w=None, + reference_points_cam=None, + mask=None, + spatial_shapes=None, + level_start_index=None, + prev_bev=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + + **kwargs contains some specific arguments of attentions. + + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. 
+ query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. + """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + # temporal self attention + if layer == 'self_attn': + + query = self.attentions[attn_index]( + query, + prev_bev, + prev_bev, + identity if self.pre_norm else None, + query_pos=bev_pos, + key_pos=bev_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + reference_points=ref_2d, + spatial_shapes=torch.tensor( + [[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + # spaital cross attention + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + reference_points=ref_3d, + reference_points_cam=reference_points_cam, + mask=mask, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + **kwargs) + attn_index += 1 + identity = query + + 
elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py new file mode 100644 index 0000000000000000000000000000000000000000..59f917343b880994670312d57466c576a8f05d97 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py @@ -0,0 +1,163 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import torch +from torch.cuda.amp import custom_bwd, custom_fwd +from torch.autograd.function import Function, once_differentiable +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +class MultiScaleDeformableAttnFunction_fp16(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float16) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). 
+ attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. + """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None + + +class MultiScaleDeformableAttnFunction_fp32(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float32) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. 
+ + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. 
+ """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..c2b45612f4f348b5e427c20fe1ffd4c68d3bb396 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py @@ -0,0 +1,615 @@ + +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------

from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init, constant_init
from mmcv.cnn.bricks.registry import (ATTENTION,
                                      TRANSFORMER_LAYER,
                                      TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.transformer import build_attention
import math
from mmcv.runner import force_fp32, auto_fp16

from mmcv.runner.base_module import BaseModule, ModuleList, Sequential

from mmcv.utils import ext_loader
# from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \
#     MultiScaleDeformableAttnFunction_fp16
from projects.mmdet3d_plugin.models.utils.bricks import run_time
ext_module = ext_loader.load_ext(
    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])

from mx_driving.fused import npu_multi_scale_deformable_attn_function #3


def _is_power_of_2(n):
    """Return True when ``n`` is a positive power of two.

    Raises:
        ValueError: If ``n`` is not an ``int`` or is negative.
    """
    if (not isinstance(n, int)) or (n < 0):
        raise ValueError(
            'invalid input for _is_power_of_2: {} (type: {})'.format(
                n, type(n)))
    return (n & (n - 1) == 0) and n != 0


def _validate_head_split(embed_dims, num_heads):
    """Check that ``embed_dims`` splits evenly over ``num_heads``.

    Warns (without failing) when the resulting per-head width is not a
    power of two.

    Raises:
        ValueError: If ``embed_dims`` is not divisible by ``num_heads``.
    """
    if embed_dims % num_heads != 0:
        raise ValueError(f'embed_dims must be divisible by num_heads, '
                         f'but got {embed_dims} and {num_heads}')
    if not _is_power_of_2(embed_dims // num_heads):
        # stacklevel=2 attributes the warning to the calling ``__init__``.
        warnings.warn(
            "You'd better set embed_dims in "
            'MultiScaleDeformAttention to make '
            'the dimension of each attention head a power of 2 '
            'which is more efficient in our CUDA implementation.',
            stacklevel=2)


def _radial_offset_grid(num_heads, num_levels, num_points):
    """Build the flattened initial sampling-offset grid.

    Each head gets one direction on the unit square boundary (angles evenly
    spaced over 2*pi, normalised by the max-abs component); the i-th
    sampling point of every level is that direction scaled by ``i + 1``.

    Returns:
        Tensor: float32 tensor with
        ``num_heads * num_levels * num_points * 2`` elements.
    """
    thetas = torch.arange(
        num_heads, dtype=torch.float32) * (2.0 * math.pi / num_heads)
    grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
    grid_init = (grid_init /
                 grid_init.abs().max(-1, keepdim=True)[0]).view(
        num_heads, 1, 1, 2).repeat(1, num_levels, num_points, 1)
    for i in range(num_points):
        grid_init[:, :, i, :] *= i + 1
    return grid_init.view(-1)


def _project_value(value_proj, value, key_padding_mask, num_heads):
    """Project ``value``, zero padded keys and split the channels per head.

    Args:
        value_proj (nn.Module): Projection applied to ``value``.
        value (Tensor): Tensor of shape (bs, num_value, embed_dims).
        key_padding_mask (Tensor | None): Boolean mask broadcast over the
            channel axis; masked positions are filled with 0.
        num_heads (int): Number of attention heads.

    Returns:
        Tensor: shape (bs, num_value, num_heads, embed_dims // num_heads).
    """
    bs, num_value, _ = value.shape
    value = value_proj(value)
    if key_padding_mask is not None:
        value = value.masked_fill(key_padding_mask[..., None], 0.0)
    return value.view(bs, num_value, num_heads, -1)


def _anchor_sampling_locations(reference_points, sampling_offsets,
                               spatial_shapes):
    """Turn per-query offsets into normalised sampling locations.

    Every query owns ``num_Z_anchors`` reference points (the second-to-last
    axis of ``reference_points``). The ``num_all_points`` offsets of each
    (head, level) are split evenly between those reference points, added to
    them, and flattened back to ``num_all_points``.

    Args:
        reference_points (Tensor): shape (bs, num_query, num_Z_anchors, 2).
        sampling_offsets (Tensor): shape
            (bs, num_query, num_heads, num_levels, num_all_points, 2).
        spatial_shapes (Tensor): shape (num_levels, 2), read as (h, w).

    Returns:
        Tensor: shape
        (bs, num_query, num_heads, num_levels, num_all_points, 2).

    Raises:
        NotImplementedError: If the last dim of ``reference_points`` is 4
            (box-style reference points are not supported here).
        ValueError: If the last dim is neither 2 nor 4, or if
            ``num_all_points`` is not a multiple of ``num_Z_anchors``.
    """
    last_dim = reference_points.shape[-1]
    if last_dim == 4:
        # The original code used ``assert False`` here, which disappears
        # under ``python -O``; fail explicitly instead.
        raise NotImplementedError(
            'reference_points with last dim 4 are not supported.')
    if last_dim != 2:
        raise ValueError(
            f'Last dim of reference_points must be'
            f' 2 or 4, but get {reference_points.shape[-1]} instead.')

    # Offsets are expressed in pixels; normalise by (w, h) of each level.
    offset_normalizer = torch.stack(
        [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)

    num_Z_anchors = reference_points.shape[2]
    reference_points = reference_points[:, :, None, None, None, :, :]
    sampling_offsets = sampling_offsets / \
        offset_normalizer[None, None, None, :, None, :]
    bs, num_query, num_heads, num_levels, num_all_points, xy = \
        sampling_offsets.shape
    if num_all_points % num_Z_anchors != 0:
        raise ValueError(
            f'num_points ({num_all_points}) must be a multiple of the '
            f'number of reference points per query ({num_Z_anchors}).')
    sampling_offsets = sampling_offsets.view(
        bs, num_query, num_heads, num_levels,
        num_all_points // num_Z_anchors, num_Z_anchors, xy)
    sampling_locations = reference_points + sampling_offsets
    return sampling_locations.view(
        bs, num_query, num_heads, num_levels, num_all_points, xy)


def _deformable_attention(value, spatial_shapes, level_start_index,
                          sampling_locations, attention_weights):
    """Run multi-scale deformable attention on the fused or PyTorch path.

    The fused ``npu_multi_scale_deformable_attn_function`` is used when
    ``torch.cuda.is_available()`` and ``value.is_cuda`` are both true;
    otherwise the pure PyTorch reference implementation from mmcv is used.
    """
    if torch.cuda.is_available() and value.is_cuda:
        return npu_multi_scale_deformable_attn_function(
            value, spatial_shapes, level_start_index,
            sampling_locations, attention_weights)  # 3
    return multi_scale_deformable_attn_pytorch(
        value, spatial_shapes, sampling_locations, attention_weights)


@ATTENTION.register_module()
class SpatialCrossAttention(BaseModule):
    """An attention module used in BEVFormer.

    Every camera attends only to the BEV queries whose reference points
    fall inside it; the per-camera results are scattered back and averaged
    over the cameras that saw each query.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_cams (int): The number of cameras. Default: 6.
        pc_range (list | None): Stored on the module; not used by this
            class itself. Default: None.
        dropout (float): A Dropout layer applied to the attention output
            before the residual is added. Default: 0.1.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): Stored on the module; ``forward`` always treats
            ``query`` as (bs, num_query, embed_dims). Default: False.
        deformable_attention (dict): The config for the deformable
            attention used in SCA.
    """

    def __init__(self,
                 embed_dims=256,
                 num_cams=6,
                 pc_range=None,
                 dropout=0.1,
                 init_cfg=None,
                 batch_first=False,
                 deformable_attention=dict(
                     type='MSDeformableAttention3D',
                     embed_dims=256,
                     num_levels=4),
                 **kwargs
                 ):
        super(SpatialCrossAttention, self).__init__(init_cfg)

        self.init_cfg = init_cfg
        self.dropout = nn.Dropout(dropout)
        self.pc_range = pc_range
        self.fp16_enabled = False
        self.deformable_attention = build_attention(deformable_attention)
        self.embed_dims = embed_dims
        self.num_cams = num_cams
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.batch_first = batch_first
        self.init_weight()

    def init_weight(self):
        """Default initialization for Parameters of Module."""
        xavier_init(self.output_proj, distribution='uniform', bias=0.)

    @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam'))
    def forward(self,
                query,
                key,
                value,
                residual=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                reference_points_cam=None,
                bev_mask=None,
                level_start_index=None,
                flag='encoder',
                **kwargs):
        """Forward Function of SpatialCrossAttention.

        Args:
            query (Tensor): BEV queries with shape
                (bs, num_query, embed_dims).
            key (Tensor): Image features with shape
                (num_cams, num_key, bs, embed_dims). Falls back to
                ``query`` when None.
            value (Tensor): Same layout as ``key``. Falls back to ``key``
                when None.
            residual (Tensor): The tensor added to the output. Default
                None, in which case ``query`` (before ``query_pos`` is
                added) is used.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_padding_mask (Tensor): Accepted for interface
                compatibility; not used here.
            reference_points (Tensor): Accepted for interface
                compatibility; not used here.
            spatial_shapes (Tensor): Spatial shape of features in
                different level. With shape (num_levels, 2),
                last dimension represent (h, w).
            reference_points_cam (Tensor): Projected reference points with
                shape (num_cams, bs, num_query, D, 2).
            bev_mask (Tensor): Visibility mask with shape
                (num_cams, bs, num_query, D).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape (num_levels) and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
            flag (str): Accepted for interface compatibility; not used.

        Returns:
            Tensor: forwarded results with shape
            (bs, num_query, embed_dims).
        """

        if key is None:
            key = query
        if value is None:
            value = key

        # Bind both unconditionally: the original only bound them when
        # ``residual`` was None, so passing a residual raised
        # UnboundLocalError further down.
        inp_residual = query if residual is None else residual
        slots = torch.zeros_like(query)
        if query_pos is not None:
            query = query + query_pos

        bs, num_query, _ = query.size()

        D = reference_points_cam.size(3)
        # Queries hit by each camera. NOTE: the mask of batch element 0 is
        # used for the whole batch.
        indexes = [
            mask_per_img[0].sum(-1).nonzero().squeeze(-1)
            for mask_per_img in bev_mask
        ]
        max_len = max(len(each) for each in indexes)

        # each camera only interacts with its corresponding BEV queries.
        # This step can greatly save GPU memory.
        queries_rebatch = query.new_zeros(
            [bs, self.num_cams, max_len, self.embed_dims])
        reference_points_rebatch = reference_points_cam.new_zeros(
            [bs, self.num_cams, max_len, D, 2])

        for j in range(bs):
            for i, reference_points_per_img in enumerate(reference_points_cam):
                index_query_per_img = indexes[i]
                num_hit = len(index_query_per_img)
                queries_rebatch[j, i, :num_hit] = \
                    query[j, index_query_per_img]
                reference_points_rebatch[j, i, :num_hit] = \
                    reference_points_per_img[j, index_query_per_img]

        _, num_key, bs, _ = key.shape

        key = key.permute(2, 0, 1, 3).reshape(
            bs * self.num_cams, num_key, self.embed_dims)
        value = value.permute(2, 0, 1, 3).reshape(
            bs * self.num_cams, num_key, self.embed_dims)

        queries = self.deformable_attention(
            query=queries_rebatch.view(
                bs * self.num_cams, max_len, self.embed_dims),
            key=key,
            value=value,
            reference_points=reference_points_rebatch.view(
                bs * self.num_cams, max_len, D, 2),
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
        ).view(bs, self.num_cams, max_len, self.embed_dims)

        # Scatter the per-camera results back onto the full query grid.
        for j in range(bs):
            for i, index_query_per_img in enumerate(indexes):
                slots[j, index_query_per_img] += \
                    queries[j, i, :len(index_query_per_img)]

        # Average over the number of cameras that saw each query
        # (at least 1 to avoid dividing by zero).
        count = bev_mask.sum(-1) > 0
        count = count.permute(1, 2, 0).sum(-1)
        count = torch.clamp(count, min=1.0)
        slots = slots / count[..., None]
        slots = self.output_proj(slots)

        return self.dropout(slots) + inp_residual


@ATTENTION.register_module()
class MSDeformableAttention3D(BaseModule):
    """An attention module used in BEVFormer based on Deformable-Detr.

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.`

    Unlike the 2D variant, each query carries several reference points
    (one per height anchor) and the sampling points are shared out between
    them. The module returns the raw attention output: no output
    projection, dropout or residual is applied here.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
        num_levels (int): The number of feature map used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 8.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): Accepted for config compatibility; not used.
            Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to True.
        norm_cfg (dict): Config dict for normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=8,
                 im2col_step=64,
                 dropout=0.1,
                 batch_first=True,
                 norm_cfg=None,
                 init_cfg=None):
        super().__init__(init_cfg)
        _validate_head_split(embed_dims, num_heads)
        self.norm_cfg = norm_cfg
        self.batch_first = batch_first
        self.output_proj = None
        self.fp16_enabled = False

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points
        self.sampling_offsets = nn.Linear(
            embed_dims, num_heads * num_levels * num_points * 2)
        self.attention_weights = nn.Linear(embed_dims,
                                           num_heads * num_levels * num_points)
        self.value_proj = nn.Linear(embed_dims, embed_dims)

        self.init_weights()

    def init_weights(self):
        """Default initialization for Parameters of Module."""
        constant_init(self.sampling_offsets, 0.)
        self.sampling_offsets.bias.data = _radial_offset_grid(
            self.num_heads, self.num_levels, self.num_points)
        constant_init(self.attention_weights, val=0., bias=0.)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        # ``output_proj`` is None for this module; only initialise it if a
        # subclass installs a real layer.
        if self.output_proj is not None:
            xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True

    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                level_start_index=None,
                **kwargs):
        """Forward Function of MSDeformableAttention3D.

        Args:
            query (Tensor): Query of Transformer with shape
                (bs, num_query, embed_dims), or
                (num_query, bs, embed_dims) when ``batch_first`` is False.
            key (Tensor): Accepted for interface compatibility; not used.
            value (Tensor): The value tensor, same layout convention as
                ``query``. Falls back to ``query`` when None.
            identity (Tensor): Accepted for interface compatibility; this
                module does not add a residual.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_padding_mask (Tensor): Mask with shape [bs, num_key];
                masked value positions are zeroed.
            reference_points (Tensor): The normalized reference
                points with shape (bs, num_query, num_Z_anchors, 2).
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2),
                last dimension represents (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape ``(num_levels, )`` and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
            Tensor: attention output in the same layout as ``query``
            (batch-first iff ``batch_first``).
        """

        if value is None:
            value = query
        if query_pos is not None:
            query = query + query_pos

        if not self.batch_first:
            # change to (bs, num_query ,embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        value = _project_value(
            self.value_proj, value, key_padding_mask, self.num_heads)
        sampling_offsets = self.sampling_offsets(query).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
        # Softmax jointly over all levels and points of a head.
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_levels * self.num_points)
        attention_weights = attention_weights.softmax(-1).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points)

        # sampling_locations: bs, num_query, num_heads, num_levels, num_all_points, 2
        # attention_weights:  bs, num_query, num_heads, num_levels, num_all_points
        sampling_locations = _anchor_sampling_locations(
            reference_points, sampling_offsets, spatial_shapes)

        output = _deformable_attention(
            value, spatial_shapes, level_start_index,
            sampling_locations, attention_weights)
        if not self.batch_first:
            output = output.permute(1, 0, 2)

        return output


@ATTENTION.register_module()
class MSIPM3D(BaseModule):
    """Deformable-attention variant with fixed offsets and uniform weights.

    Shares the structure of ``MSDeformableAttention3D`` but learns neither
    sampling offsets nor attention weights: offsets come from a frozen
    radial grid and every sampling point gets the same weight.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
        num_levels (int): The number of feature map used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 8.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): Accepted for config compatibility; not used.
            Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to True.
        norm_cfg (dict): Config dict for normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=8,
                 im2col_step=64,
                 dropout=0.1,
                 batch_first=True,
                 norm_cfg=None,
                 init_cfg=None):
        super().__init__(init_cfg)
        _validate_head_split(embed_dims, num_heads)
        self.norm_cfg = norm_cfg
        self.batch_first = batch_first
        self.output_proj = None
        self.fp16_enabled = False

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points
        self.value_proj = nn.Linear(embed_dims, embed_dims)

        self.init_weights()

    def init_weights(self):
        """Default initialization for Parameters of Module."""
        # Frozen parameter: moves with the module and is saved in the
        # state dict, but receives no gradient.
        self.fixed_sampling_offsets = nn.Parameter(
            _radial_offset_grid(
                self.num_heads, self.num_levels, self.num_points),
            requires_grad=False)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        # ``output_proj`` is None for this module; only initialise it if a
        # subclass installs a real layer.
        if self.output_proj is not None:
            xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True

    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                level_start_index=None,
                **kwargs):
        """Forward Function of MSIPM3D.

        Args:
            query (Tensor): Query of Transformer with shape
                (bs, num_query, embed_dims), or
                (num_query, bs, embed_dims) when ``batch_first`` is False.
                Only its shape, dtype and device influence the result
                unless ``value`` is None.
            key (Tensor): Accepted for interface compatibility; not used.
            value (Tensor): The value tensor, same layout convention as
                ``query``. Falls back to ``query`` when None.
            identity (Tensor): Accepted for interface compatibility; this
                module does not add a residual.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_padding_mask (Tensor): Mask with shape [bs, num_key];
                masked value positions are zeroed.
            reference_points (Tensor): The normalized reference
                points with shape (bs, num_query, num_Z_anchors, 2).
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2),
                last dimension represents (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape ``(num_levels, )`` and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
            Tensor: attention output in the same layout as ``query``
            (batch-first iff ``batch_first``).
        """

        if value is None:
            value = query
        if query_pos is not None:
            query = query + query_pos

        if not self.batch_first:
            # change to (bs, num_query ,embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        value = _project_value(
            self.value_proj, value, key_padding_mask, self.num_heads)
        sampling_offsets = self.fixed_sampling_offsets.view(
            1, 1, self.num_heads, self.num_levels, self.num_points, 2).repeat(
            bs, num_query, 1, 1, 1, 1)
        # Softmax of all-ones gives a uniform weight per sampling point.
        attention_weights = query.new_ones(
            (bs, num_query, self.num_heads,
             self.num_levels * self.num_points))
        attention_weights = attention_weights.softmax(-1).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points)

        # sampling_locations: bs, num_query, num_heads, num_levels, num_all_points, 2
        # attention_weights:  bs, num_query, num_heads, num_levels, num_all_points
        sampling_locations = _anchor_sampling_locations(
            reference_points, sampling_offsets, spatial_shapes)

        output = _deformable_attention(
            value, spatial_shapes, level_start_index,
            sampling_locations, attention_weights)
        if not self.batch_first:
            output = output.permute(1, 0, 2)

        return output
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------

from projects.mmdet3d_plugin.models.utils.bricks import run_time
# from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32
from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch
import warnings
import torch
import torch.nn as nn
from mmcv.cnn import xavier_init, constant_init
from mmcv.cnn.bricks.registry import ATTENTION
import math
from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning,
                        to_2tuple)

from mmcv.utils import ext_loader
ext_module = ext_loader.load_ext(
    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])

from mx_driving.fused import npu_multi_scale_deformable_attn_function #3


@ATTENTION.register_module()
class TemporalSelfAttention(BaseModule):
    """Temporal self-attention over a queue of two BEV feature maps.

    Built on multi-scale deformable attention
    (`Deformable DETR: Deformable Transformers for End-to-End Object
    Detection`). The current BEV queries attend to every BEV map in the
    queue and the per-map results are averaged.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
        num_levels (int): The number of feature map used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 4.
        num_bev_queue (int): Length of the BEV queue (history + current).
            ``forward`` asserts that it equals 2. Default: 2.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): Dropout applied to the projected output before
            the identity is added. Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to True.
        norm_cfg (dict): Config dict for normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=4,
                 num_bev_queue=2,
                 im2col_step=64,
                 dropout=0.1,
                 batch_first=True,
                 norm_cfg=None,
                 init_cfg=None):

        super().__init__(init_cfg)
        if embed_dims % num_heads != 0:
            raise ValueError(f'embed_dims must be divisible by num_heads, '
                             f'but got {embed_dims} and {num_heads}')
        self.norm_cfg = norm_cfg
        self.dropout = nn.Dropout(dropout)
        self.batch_first = batch_first
        self.fp16_enabled = False

        # A power-of-two head width is more efficient in the fused kernel;
        # anything else only triggers a warning.
        head_width = embed_dims // num_heads
        if not isinstance(head_width, int) or head_width < 0:
            raise ValueError(
                'invalid input for _is_power_of_2: {} (type: {})'.format(
                    head_width, type(head_width)))
        width_is_pow2 = (head_width & (head_width - 1)) == 0 \
            and head_width != 0
        if not width_is_pow2:
            warnings.warn(
                "You'd better set embed_dims in "
                'MultiScaleDeformAttention to make '
                'the dimension of each attention head a power of 2 '
                'which is more efficient in our CUDA implementation.')

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points
        self.num_bev_queue = num_bev_queue

        # Both heads consume the concatenation [history, current] along
        # the channel axis, hence the ``embed_dims * num_bev_queue`` input.
        concat_dims = embed_dims * self.num_bev_queue
        slots_per_query = num_bev_queue * num_heads * num_levels * num_points
        self.sampling_offsets = nn.Linear(concat_dims, slots_per_query * 2)
        self.attention_weights = nn.Linear(concat_dims, slots_per_query)
        self.value_proj = nn.Linear(embed_dims, embed_dims)
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.init_weights()

    def init_weights(self):
        """Default initialization for Parameters of Module."""
        constant_init(self.sampling_offsets, 0.)

        # One direction per head, evenly spaced over the circle and
        # normalised so its largest component has magnitude 1.
        angles = torch.arange(self.num_heads, dtype=torch.float32)
        angles = angles * (2.0 * math.pi / self.num_heads)
        directions = torch.stack([angles.cos(), angles.sin()], -1)
        directions = directions / directions.abs().max(-1, keepdim=True)[0]
        bias_grid = directions.view(self.num_heads, 1, 1, 2).repeat(
            1, self.num_levels * self.num_bev_queue, self.num_points, 1)

        # The i-th sampling point starts i + 1 steps along the direction.
        for point_idx in range(self.num_points):
            bias_grid[:, :, point_idx, :] *= point_idx + 1

        self.sampling_offsets.bias.data = bias_grid.view(-1)
        constant_init(self.attention_weights, val=0., bias=0.)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True

    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                level_start_index=None,
                flag='decoder',

                **kwargs):
        """Forward Function of TemporalSelfAttention.

        Args:
            query (Tensor): Current BEV queries with shape
                (bs, num_query, embed_dims), or
                (num_query, bs, embed_dims) when ``batch_first`` is False.
            key (Tensor): Accepted for interface compatibility; not used.
            value (Tensor): BEV queue with ``bs * num_bev_queue`` entries
                along the batch axis. When None (only allowed with
                ``batch_first``), the query is duplicated to form it.
            identity (Tensor): The tensor used for addition, with the
                same shape as `query`. Default None. If None,
                `query` (before ``query_pos`` is added) will be used.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_padding_mask (Tensor): Mask with shape [bs, num_key];
                masked value positions are zeroed.
            reference_points (Tensor): The normalized reference points;
                last dim 2 for points or 4 for boxes (x, y, w, h). It is
                broadcast against offsets of shape
                (bs * num_bev_queue, num_query, num_heads, num_levels,
                num_points, 2).
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2),
                last dimension represents (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape ``(num_levels, )`` and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
            flag (str): Accepted for interface compatibility; not used.

        Returns:
            Tensor: forwarded results in the same layout as ``query``.

        Raises:
            ValueError: If the last dim of ``reference_points`` is neither
                2 nor 4.
        """

        if value is None:
            assert self.batch_first
            # No history: pair every sample with itself, laid out as
            # [s0, s0, s1, s1, ...] along the batch axis.
            value = torch.stack((query, query), dim=1).flatten(0, 1)

        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos
        if not self.batch_first:
            # change to (bs, num_query ,embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, embed_dims = query.shape
        num_value = value.shape[1]
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
        assert self.num_bev_queue == 2

        n_queue = self.num_bev_queue
        n_heads = self.num_heads
        n_levels = self.num_levels
        n_points = self.num_points

        # NOTE(review): with the interleaved layout built above,
        # ``value[:bs]`` picks the first ``bs`` rows rather than one
        # history entry per sample when bs > 1 -- confirm this is intended
        # before changing it (trained weights may depend on it).
        query = torch.cat([value[:bs], query], -1)

        value = self.value_proj(value)
        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)
        value = value.reshape(bs * n_queue, num_value, n_heads, -1)

        # Softmax runs jointly over levels and points of each
        # (head, queue entry) pair.
        weights = self.attention_weights(query)
        weights = weights.view(
            bs, num_query, n_heads, n_queue, n_levels * n_points)
        weights = weights.softmax(-1)
        weights = weights.view(
            bs, num_query, n_heads, n_queue, n_levels, n_points)
        # Fold the queue axis into the batch axis to match ``value``.
        attention_weights = weights.permute(0, 3, 1, 2, 4, 5).reshape(
            bs * n_queue, num_query, n_heads, n_levels, n_points
        ).contiguous()

        offsets = self.sampling_offsets(query).view(
            bs, num_query, n_heads, n_queue, n_levels, n_points, 2)
        sampling_offsets = offsets.permute(0, 3, 1, 2, 4, 5, 6).reshape(
            bs * n_queue, num_query, n_heads, n_levels, n_points, 2)

        ref_dim = reference_points.shape[-1]
        if ref_dim not in (2, 4):
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
        if ref_dim == 2:
            # Offsets are in pixels; normalise by (w, h) of each level.
            offset_normalizer = torch.stack(
                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                + sampling_offsets \
                / offset_normalizer[None, None, None, :, None, :]
        else:
            # Box reference points: offsets are scaled by half the box size.
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                + sampling_offsets / self.num_points \
                * reference_points[:, :, None, :, None, 2:] \
                * 0.5

        if torch.cuda.is_available() and value.is_cuda:
            output = npu_multi_scale_deformable_attn_function(
                value, spatial_shapes, level_start_index,
                sampling_locations, attention_weights)  # 3
        else:
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights)

        # (bs*num_bev_queue, num_query, embed_dims)
        #   -> (num_query, embed_dims, bs, num_bev_queue)
        # then fuse history and current by averaging over the queue axis
        #   -> (bs, num_query, embed_dims)
        fused = output.permute(1, 2, 0).view(
            num_query, embed_dims, bs, n_queue).mean(-1)
        output = self.output_proj(fused.permute(2, 0, 1))

        if not self.batch_first:
            output = output.permute(1, 0, 2)

        return self.dropout(output) + identity
@TRANSFORMER.register_module()
class PerceptionTransformer(BaseModule):
    """BEV perception transformer: camera features -> BEV embedding -> decoder.

    The module builds an encoder and a decoder from configs. The encoder
    turns multi-level, multi-camera image features into a BEV embedding
    (``get_bev_features``); the decoder then attends to that embedding with
    the object queries (``forward``).

    Args:
        num_feature_levels (int): Number of rows in the learned level
            embedding table; one row is added per feature level.
            Default: 4.
        num_cams (int): Number of rows in the learned camera embedding
            table. Default: 6.
        two_stage_num_proposals (int): Stored on the instance only; it is
            not read anywhere in this class. Default: 300.
        encoder (dict): Config passed to
            ``build_transformer_layer_sequence`` to build the encoder.
        decoder (dict): Config passed to
            ``build_transformer_layer_sequence`` to build the decoder.
        embed_dims (int): Embedding width. Default: 256.
        rotate_prev_bev (bool): If True, ``prev_bev`` is rotated per sample
            by ``can_bus[-1]`` before being handed to the encoder.
        use_shift (bool): Multiplies the computed BEV shift, so False
            zeroes it.
        use_can_bus (bool): Multiplies the CAN-bus embedding added to the
            BEV queries, so False disables it.
        can_bus_norm (bool): If True, a ``LayerNorm`` is appended to the
            CAN-bus MLP.
        use_cams_embeds (bool): If True, the camera embedding is added to
            the image features.
        rotate_center (list[int]): ``center`` argument forwarded to
            ``torchvision.transforms.functional.rotate``.
    """

    def __init__(self,
                 num_feature_levels=4,
                 num_cams=6,
                 two_stage_num_proposals=300,
                 encoder=None,
                 decoder=None,
                 embed_dims=256,
                 rotate_prev_bev=True,
                 use_shift=True,
                 use_can_bus=True,
                 can_bus_norm=True,
                 use_cams_embeds=True,
                 rotate_center=[100, 100],
                 **kwargs):
        super(PerceptionTransformer, self).__init__(**kwargs)
        self.encoder = build_transformer_layer_sequence(encoder)
        self.decoder = build_transformer_layer_sequence(decoder)
        self.embed_dims = embed_dims
        self.num_feature_levels = num_feature_levels
        self.num_cams = num_cams
        self.fp16_enabled = False

        self.rotate_prev_bev = rotate_prev_bev
        self.use_shift = use_shift
        self.use_can_bus = use_can_bus
        self.can_bus_norm = can_bus_norm
        self.use_cams_embeds = use_cams_embeds

        self.two_stage_num_proposals = two_stage_num_proposals
        # init_layers() reads embed_dims / num_cams / num_feature_levels /
        # can_bus_norm, so it must run after those attributes are set.
        self.init_layers()
        self.rotate_center = rotate_center

    def init_layers(self):
        """Create the learned embeddings and the small projection layers."""
        # Uninitialised storage; filled by normal_() in init_weights().
        self.level_embeds = nn.Parameter(torch.Tensor(
            self.num_feature_levels, self.embed_dims))
        self.cams_embeds = nn.Parameter(
            torch.Tensor(self.num_cams, self.embed_dims))
        # Maps a query positional embedding to 3 values (sigmoid-ed in forward).
        self.reference_points = nn.Linear(self.embed_dims, 3)
        # The CAN-bus vector is expected to have 18 entries per sample.
        self.can_bus_mlp = nn.Sequential(
            nn.Linear(18, self.embed_dims // 2),
            nn.ReLU(inplace=True),
            nn.Linear(self.embed_dims // 2, self.embed_dims),
            nn.ReLU(inplace=True),
        )
        if self.can_bus_norm:
            self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims))

    def init_weights(self):
        """Initialize the transformer weights."""
        # Xavier-uniform for every parameter with more than one dimension.
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # The attention modules bring their own initialiser, exposed under
        # one of two names depending on the module.
        # NOTE(review): the except clause also catches an AttributeError
        # raised inside init_weight() itself, not only a missing method.
        for m in self.modules():
            if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \
                    or isinstance(m, CustomMSDeformableAttention):
                try:
                    m.init_weight()
                except AttributeError:
                    m.init_weights()
        normal_(self.level_embeds)
        normal_(self.cams_embeds)
        xavier_init(self.reference_points, distribution='uniform', bias=0.)
        xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.)

    @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos'))
    def get_bev_features(
            self,
            mlvl_feats,
            bev_queries,
            bev_h,
            bev_w,
            grid_length=[0.512, 0.512],
            bev_pos=None,
            prev_bev=None,
            **kwargs):
        """Run the encoder and return the BEV embedding.

        Args:
            mlvl_feats (list[Tensor]): Per-level image features; each is
                unpacked below as (bs, num_cam, c, h, w).
            bev_queries (Tensor): BEV query embeddings; a batch axis is
                inserted at dim 1 and repeated ``bs`` times.
            bev_h (int): BEV grid height.
            bev_w (int): BEV grid width.
            grid_length (list[float]): Cell size as (y, x); used to turn
                the ego translation into a fraction of the BEV grid.
            bev_pos (Tensor): BEV positional encoding. Despite the None
                default it is required: ``flatten`` is called on it
                unconditionally.
            prev_bev (Tensor | None): Previous BEV embedding, optional.
            **kwargs: Must contain ``img_metas``, a per-sample list of
                dicts with a ``can_bus`` entry. All kwargs are forwarded
                to the encoder.

        Returns:
            Whatever ``self.encoder`` returns (the BEV embedding).
        """

        bs = mlvl_feats[0].size(0)
        bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1)
        # Flatten the spatial dims and move them to the front.
        bev_pos = bev_pos.flatten(2).permute(2, 0, 1)

        # obtain rotation angle and shift with ego motion
        delta_x = np.array([each['can_bus'][0]
                           for each in kwargs['img_metas']])
        delta_y = np.array([each['can_bus'][1]
                           for each in kwargs['img_metas']])
        # can_bus[-2] is scaled by 180/pi, i.e. treated as radians here.
        ego_angle = np.array(
            [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']])
        grid_length_y = grid_length[0]
        grid_length_x = grid_length[1]
        translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2)
        translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180
        bev_angle = ego_angle - translation_angle
        # Shift expressed as a fraction of the BEV extent along each axis.
        shift_y = translation_length * \
            np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h
        shift_x = translation_length * \
            np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w
        # Multiplying by the bool flag zeroes the shift when use_shift is False.
        shift_y = shift_y * self.use_shift
        shift_x = shift_x * self.use_shift
        shift = bev_queries.new_tensor(
            [shift_x, shift_y]).permute(1, 0)  # xy, bs -> bs, xy

        if prev_bev is not None:
            # Bring prev_bev to (bev_h*bev_w, bs, C) if it arrives batch-first.
            if prev_bev.shape[1] == bev_h * bev_w:
                prev_bev = prev_bev.permute(1, 0, 2)
            if self.rotate_prev_bev:
                for i in range(bs):
                    # can_bus[-1] is passed straight to torchvision's rotate
                    # -- presumably degrees; TODO confirm against the dataset.
                    rotation_angle = kwargs['img_metas'][i]['can_bus'][-1]
                    tmp_prev_bev = prev_bev[:, i].reshape(
                        bev_h, bev_w, -1).permute(2, 0, 1)
                    tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle,
                                          center=self.rotate_center)
                    tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape(
                        bev_h * bev_w, 1, -1)
                    # Written back in place into prev_bev.
                    prev_bev[:, i] = tmp_prev_bev[:, 0]

        # add can bus signals
        can_bus = bev_queries.new_tensor(
            [each['can_bus'] for each in kwargs['img_metas']])
        can_bus = self.can_bus_mlp(can_bus)[None, :, :]
        bev_queries = bev_queries + can_bus * self.use_can_bus

        feat_flatten = []
        spatial_shapes = []
        for lvl, feat in enumerate(mlvl_feats):
            bs, num_cam, c, h, w = feat.shape
            spatial_shape = (h, w)
            # (bs, num_cam, c, h*w) -> (num_cam, bs, h*w, c)
            feat = feat.flatten(3).permute(1, 0, 3, 2)
            if self.use_cams_embeds:
                feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype)
            feat = feat + self.level_embeds[None,
                                            None, lvl:lvl + 1, :].to(feat.dtype)
            spatial_shapes.append(spatial_shape)
            feat_flatten.append(feat)

        # Concatenate all levels along the flattened spatial axis.
        feat_flatten = torch.cat(feat_flatten, 2)
        spatial_shapes = torch.as_tensor(
            spatial_shapes, dtype=torch.long, device=bev_pos.device)
        # Start offset of each level inside the concatenated spatial axis.
        level_start_index = torch.cat((spatial_shapes.new_zeros(
            (1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))

        feat_flatten = feat_flatten.permute(
            0, 2, 1, 3)  # (num_cam, H*W, bs, embed_dims)

        bev_embed = self.encoder(
            bev_queries,
            feat_flatten,
            feat_flatten,
            bev_h=bev_h,
            bev_w=bev_w,
            bev_pos=bev_pos,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            prev_bev=prev_bev,
            shift=shift,
            **kwargs
        )

        return bev_embed

    @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos'))
    def forward(self,
                mlvl_feats,
                bev_queries,
                object_query_embed,
                bev_h,
                bev_w,
                grid_length=[0.512, 0.512],
                bev_pos=None,
                reg_branches=None,
                cls_branches=None,
                prev_bev=None,
                **kwargs):
        """Encode BEV features, then decode the object queries against them.

        Args:
            mlvl_feats (list[Tensor]): Per-level image features, each
                (bs, num_cams, embed_dims, h, w).
            bev_queries (Tensor): BEV queries, forwarded to
                ``get_bev_features``.
            object_query_embed (Tensor): Decoder query embedding. It is
                split along dim 1 into chunks of ``embed_dims``; the first
                chunk is the positional part, the second the content part.
            bev_h (int): BEV grid height.
            bev_w (int): BEV grid width.
            grid_length (list[float]): Forwarded to ``get_bev_features``.
            bev_pos (Tensor): BEV positional encoding.
            reg_branches: Forwarded to the decoder unchanged.
            cls_branches: Forwarded to the decoder unchanged.
            prev_bev (Tensor | None): Previous BEV embedding, optional.
            **kwargs: Forwarded to both the encoder and the decoder.

        Returns:
            tuple: ``(bev_embed, inter_states, init_reference_out,
            inter_references_out)`` where

            - bev_embed: encoder output after ``permute(1, 0, 2)``.
            - inter_states: first output of the decoder.
            - init_reference_out: sigmoid of the linear projection of the
              query positional embedding; last dimension is 3.
            - inter_references_out: second output of the decoder.
        """

        bev_embed = self.get_bev_features(
            mlvl_feats,
            bev_queries,
            bev_h,
            bev_w,
            grid_length=grid_length,
            bev_pos=bev_pos,
            prev_bev=prev_bev,
            **kwargs)  # bev_embed shape: bs, bev_h*bev_w, embed_dims

        bs = mlvl_feats[0].size(0)
        query_pos, query = torch.split(
            object_query_embed, self.embed_dims, dim=1)
        # Broadcast the shared query embeddings over the batch.
        query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1)
        query = query.unsqueeze(0).expand(bs, -1, -1)
        reference_points = self.reference_points(query_pos)
        reference_points = reference_points.sigmoid()
        init_reference_out = reference_points

        # The decoder is fed sequence-first tensors.
        query = query.permute(1, 0, 2)
        query_pos = query_pos.permute(1, 0, 2)
        bev_embed = bev_embed.permute(1, 0, 2)

        # The BEV embedding is a single feature level of size bev_h x bev_w.
        inter_states, inter_references = self.decoder(
            query=query,
            key=None,
            value=bev_embed,
            query_pos=query_pos,
            reference_points=reference_points,
            reg_branches=reg_branches,
            cls_branches=cls_branches,
            spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device),
            level_start_index=torch.tensor([0], device=query.device),
            **kwargs)

        inter_references_out = inter_references

        return bev_embed, inter_states, init_reference_out, inter_references_out
@RUNNERS.register_module()
class EpochBasedRunner_video(EpochBasedRunner):
    """Epoch-based runner that trains on the last frame of a frame sequence.

    Basic logic::

        input_sequence = [a, b, c]  # given a sequence of samples

        prev_bev = None
        for each in input_sequence[:-1]:
            prev_bev = eval_model(each, prev_bev)  # inference only.

        model(input_sequence[-1], prev_bev)  # train the last sample.

    Args:
        model: Model to train; its ``train_step`` is called on the last
            frame.
        eval_model: Model used (under ``torch.no_grad``) to produce
            ``prev_bev`` from the earlier frames. May be None only when
            every batch holds a single frame.
        batch_processor: Not supported; must stay None.
        optimizer, work_dir, logger, meta, max_iters, max_epochs:
            Forwarded to ``EpochBasedRunner``.
        keys (Sequence[str]): Keys copied from the data batch into each
            per-frame sample. ``'img_metas'`` is always added.
    """

    def __init__(self,
                 model,
                 eval_model=None,
                 batch_processor=None,
                 optimizer=None,
                 work_dir=None,
                 logger=None,
                 meta=None,
                 keys=('gt_bboxes_3d', 'gt_labels_3d', 'img'),
                 max_iters=None,
                 max_epochs=None):
        super().__init__(model,
                         batch_processor,
                         optimizer,
                         work_dir,
                         logger,
                         meta,
                         max_iters,
                         max_epochs)
        # Copy instead of appending in place: the previous code mutated the
        # shared default list (and any list passed by the caller), so
        # 'img_metas' accumulated across instantiations.
        self.keys = list(keys)
        if 'img_metas' not in self.keys:
            self.keys.append('img_metas')
        self.eval_model = eval_model
        # eval_model defaults to None; only switch it to eval mode if given.
        if self.eval_model is not None:
            self.eval_model.eval()

    def run_iter(self, data_batch, train_mode, **kwargs):
        """Run one training iteration on a batch of frame sequences.

        All frames but the last are passed through ``eval_model.val_step``
        without gradients to build ``prev_bev``; the last frame is passed
        to ``model.train_step`` together with that ``prev_bev``.

        Raises:
            AssertionError: If a batch processor is configured, if
                ``train_mode`` is False, or if ``'points'`` is among the
                keys (none of these are supported).
            ValueError: If history frames exist but no ``eval_model`` was
                provided.
            TypeError: If ``train_step`` does not return a dict.
        """
        # Raised explicitly rather than via ``assert False`` so the checks
        # survive ``python -O`` (where asserts are stripped and execution
        # would fall through to an unbound ``outputs``).
        if self.batch_processor is not None:
            raise AssertionError(
                'batch_processor is not supported by EpochBasedRunner_video')
        if not train_mode:
            raise AssertionError(
                'EpochBasedRunner_video only supports train mode')

        # Dim 1 of the image tensor indexes the frames of the sequence.
        num_samples = data_batch['img'].data[0].size(1)
        data_list = []
        prev_bev = None
        for i in range(num_samples):
            data = {}
            for key in self.keys:
                if key == 'img':
                    # Slice out frame i of every sequence in the batch.
                    data['img'] = DataContainer(
                        data=[data_batch['img'].data[0][:, i]],
                        cpu_only=data_batch['img'].cpu_only,
                        stack=True)
                elif key == 'img_metas':
                    data['img_metas'] = DataContainer(
                        data=[[each[i] for each in
                               data_batch['img_metas'].data[0]]],
                        cpu_only=data_batch['img_metas'].cpu_only)
                elif key == 'points':
                    raise AssertionError(
                        "'points' is not supported by EpochBasedRunner_video")
                else:
                    # Everything else is shared unchanged by all frames.
                    data[key] = data_batch[key]
            data_list.append(data)

        if num_samples > 1 and self.eval_model is None:
            raise ValueError(
                'eval_model is required when a sequence has history frames')

        with torch.no_grad():
            for i in range(num_samples - 1):
                if i > 0:
                    data_list[i]['prev_bev'] = DataContainer(
                        data=[prev_bev], cpu_only=False)
                prev_bev = self.eval_model.val_step(
                    data_list[i], self.optimizer, **kwargs)

        data_list[-1]['prev_bev'] = DataContainer(
            data=[prev_bev], cpu_only=False)
        outputs = self.model.train_step(
            data_list[-1], self.optimizer, **kwargs)

        if not isinstance(outputs, dict):
            raise TypeError('"batch_processor()" or "model.train_step()"'
                            'and "model.val_step()" must return a dict')
        if 'log_vars' in outputs:
            self.log_buffer.update(outputs['log_vars'], outputs['num_samples'])
        self.outputs = outputs
@BBOX_ASSIGNERS.register_module()
class HungarianAssigner3D(BaseAssigner):
    """One-to-one Hungarian matching between query predictions and targets.

    The matching cost is the sum of a classification cost and a regression
    cost computed on the first 8 entries of the normalized box code. An IoU
    cost object is built from its config but is not part of the cost.
    Unmatched predictions become background.

    Each prediction ends up with an assigned index:

    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    Args:
        cls_cost (dict): Config of the classification match cost.
        reg_cost (dict): Config of the regression match cost.
        iou_cost (dict): Config of the IoU match cost (built, unused).
        pc_range (list[float] | None): Forwarded to ``normalize_bbox``.
    """

    def __init__(self,
                 cls_cost=dict(type='ClassificationCost', weight=1.),
                 reg_cost=dict(type='BBoxL1Cost', weight=1.0),
                 iou_cost=dict(type='IoUCost', weight=0.0),
                 pc_range=None):
        self.cls_cost = build_match_cost(cls_cost)
        self.reg_cost = build_match_cost(reg_cost)
        self.iou_cost = build_match_cost(iou_cost)
        self.pc_range = pc_range

    def assign(self,
               bbox_pred,
               cls_pred,
               gt_bboxes,
               gt_labels,
               gt_bboxes_ignore=None,
               eps=1e-7):
        """Match every prediction to a ground truth box or to background.

        Steps:

        1. Start with every prediction at -1 (don't care).
        2. Build the cost matrix (classification + regression).
        3. Solve the assignment on CPU with ``linear_sum_assignment``.
        4. Reset all to 0 (background), then give each matched prediction
           the 1-based index and the label of its ground truth.

        Args:
            bbox_pred (Tensor): Predicted box codes, one row per query.
            cls_pred (Tensor): Classification logits, one row per query.
            gt_bboxes (Tensor): Ground truth boxes, one row per target;
                normalized with ``normalize_bbox`` before the regression
                cost is computed.
            gt_labels (Tensor): Labels of ``gt_bboxes``.
            gt_bboxes_ignore: Must be None.
            eps (float): Unused.

        Returns:
            :obj:`AssignResult`: The assignment.

        Raises:
            ImportError: If scipy is not installed and matching is needed.
        """
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
        num_gts = gt_bboxes.size(0)
        num_bboxes = bbox_pred.size(0)

        # 1. everything starts as "don't care"
        gt_inds = bbox_pred.new_full((num_bboxes, ), -1, dtype=torch.long)
        labels = bbox_pred.new_full((num_bboxes, ), -1, dtype=torch.long)

        if num_gts == 0 or num_bboxes == 0:
            # Nothing to match; with no targets all queries are background.
            if num_gts == 0:
                gt_inds.fill_(0)
            return AssignResult(num_gts, gt_inds, None, labels=labels)

        # 2. cost matrix: classification term plus regression term
        cls_term = self.cls_cost(cls_pred, gt_labels)
        gt_encoded = normalize_bbox(gt_bboxes, self.pc_range)
        reg_term = self.reg_cost(bbox_pred[:, :8], gt_encoded[:, :8])
        cost_matrix = (cls_term + reg_term).detach().cpu()

        # 3. Hungarian matching on CPU
        if linear_sum_assignment is None:
            raise ImportError('Please run "pip install scipy" '
                              'to install scipy first.')
        rows, cols = linear_sum_assignment(cost_matrix)
        target_device = bbox_pred.device
        rows = torch.from_numpy(rows).to(target_device)
        cols = torch.from_numpy(cols).to(target_device)

        # 4. background everywhere, then overwrite the matched queries
        gt_inds.fill_(0)
        gt_inds[rows] = cols + 1
        labels[rows] = gt_labels[cols]
        return AssignResult(num_gts, gt_inds, None, labels=labels)
def normalize_2d_pts(pts, pc_range):
    """Map 2D points from ``pc_range`` coordinates to the unit square.

    Args:
        pts (Tensor): Points whose last dimension holds (x, y).
        pc_range (Sequence[float]): Range as
            (x_min, y_min, z_min, x_max, y_max, z_max); only the x and y
            entries are read.

    Returns:
        Tensor: New tensor (the input is not modified) with x and y
        shifted by the range minimum and divided by the range extent.
    """
    x_min, y_min = pc_range[0], pc_range[1]
    extent = pts.new_tensor([pc_range[3] - x_min, pc_range[4] - y_min])
    shifted = pts.clone()
    shifted[..., 0:1] = pts[..., 0:1] - x_min
    shifted[..., 1:2] = pts[..., 1:2] - y_min
    return shifted / extent
+ bbox_preds (Tensor): Outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [num_query, 9]. + Returns: + list[dict]: Decoded boxes. + """ + max_num = self.max_num + + cls_scores = cls_scores.sigmoid() + scores, indexs = cls_scores.view(-1).topk(max_num) + labels = indexs % self.num_classes + bbox_index = indexs // self.num_classes + bbox_preds = bbox_preds[bbox_index] + + final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) + final_scores = scores + final_preds = labels + + # use score threshold + if self.score_threshold is not None: + thresh_mask = final_scores > self.score_threshold + tmp_score = self.score_threshold + while thresh_mask.sum() == 0: + tmp_score *= 0.9 + if tmp_score < 0.01: + thresh_mask = final_scores > -1 + break + thresh_mask = final_scores >= tmp_score + + if self.post_center_range is not None: + self.post_center_range = torch.tensor( + self.post_center_range, device=scores.device) + mask = (final_box_preds[..., :3] >= + self.post_center_range[:3]).all(1) + mask &= (final_box_preds[..., :3] <= + self.post_center_range[3:]).all(1) + + if self.score_threshold: + mask &= thresh_mask + + boxes3d = final_box_preds[mask] + scores = final_scores[mask] + + labels = final_preds[mask] + predictions_dict = { + 'bboxes': boxes3d, + 'scores': scores, + 'labels': labels + } + + else: + raise NotImplementedError( + 'Need to reorganize output as a batch, only ' + 'support post_center_range is not None for now!') + return predictions_dict + + def decode(self, preds_dicts): + """Decode bboxes. + Args: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. 
@BBOX_CODERS.register_module()
class MapTRNMSFreeCoder(BaseBBoxCoder):
    """NMS-free coder that decodes boxes and point sets for map elements.

    Args:
        pc_range (list[float]): Range used to de-normalize predictions.
        voxel_size: Stored only; not read by this class.
        post_center_range (list[float]): Lower bounds (first 4 values) and
            upper bounds (last 4 values) applied to the first 4 entries of
            each decoded box. Required for decoding.
        max_num (int): Max number of predictions to keep. Default: 100.
        score_threshold (float | None): Score threshold. Default: None.
        num_classes (int): Number of classes; used to split a flat score
            index into (query index, label). Default: 10.
    """

    def __init__(self,
                 pc_range,
                 voxel_size=None,
                 post_center_range=None,
                 max_num=100,
                 score_threshold=None,
                 num_classes=10):
        self.pc_range = pc_range
        self.voxel_size = voxel_size
        self.post_center_range = post_center_range
        self.max_num = max_num
        self.score_threshold = score_threshold
        self.num_classes = num_classes

    def encode(self):
        """Not implemented; the coder is decode-only."""
        pass

    def decode_single(self, cls_scores, bbox_preds, pts_preds):
        """Decode the predictions of one sample.

        Args:
            cls_scores (Tensor): Classification logits,
                shape [num_query, cls_out_channels].
            bbox_preds (Tensor): Normalized boxes, one row per query;
                de-normalized with ``denormalize_2d_bbox``.
            pts_preds (Tensor): Normalized points,
                shape [num_query, fixed_num_pts, 2].

        Returns:
            dict: ``bboxes``, ``scores``, ``labels`` and ``pts`` of the
            kept predictions.

        Raises:
            NotImplementedError: If ``post_center_range`` is None.
        """
        cls_scores = cls_scores.sigmoid()
        flat_scores = cls_scores.view(-1)
        # topk raises if asked for more entries than exist.
        max_num = min(self.max_num, flat_scores.numel())
        scores, indexs = flat_scores.topk(max_num)
        # A flat index encodes (query, class) as query * num_classes + class.
        labels = indexs % self.num_classes
        bbox_index = indexs // self.num_classes
        bbox_preds = bbox_preds[bbox_index]
        pts_preds = pts_preds[bbox_index]

        final_box_preds = denormalize_2d_bbox(bbox_preds, self.pc_range)
        final_pts_preds = denormalize_2d_pts(
            pts_preds, self.pc_range)  # num_q, num_p, 2
        final_scores = scores
        final_preds = labels

        # use score threshold
        if self.score_threshold is not None:
            thresh_mask = final_scores > self.score_threshold
            tmp_score = self.score_threshold
            # Relax the threshold by 10% per step until something passes;
            # below 0.01 give up and keep everything.
            while thresh_mask.sum() == 0:
                tmp_score *= 0.9
                if tmp_score < 0.01:
                    thresh_mask = final_scores > -1
                    break
                thresh_mask = final_scores >= tmp_score

        if self.post_center_range is None:
            raise NotImplementedError(
                'Need to reorganize output as a batch, only '
                'support post_center_range is not None for now!')

        # Keep the configured range untouched: build the tensor locally on
        # the device of the current scores instead of overwriting
        # self.post_center_range with a device-bound tensor.
        post_center_range = torch.as_tensor(
            self.post_center_range, device=scores.device)
        mask = (final_box_preds[..., :4] >=
                post_center_range[:4]).all(1)
        mask &= (final_box_preds[..., :4] <=
                 post_center_range[4:]).all(1)

        # Same condition as above, so a threshold of 0.0 is honoured too.
        if self.score_threshold is not None:
            mask &= thresh_mask

        predictions_dict = {
            'bboxes': final_box_preds[mask],
            'scores': final_scores[mask],
            'labels': final_preds[mask],
            'pts': final_pts_preds[mask],
        }
        return predictions_dict

    def decode(self, preds_dicts):
        """Decode the last decoder layer's predictions for a whole batch.

        Args:
            preds_dicts (dict): Must contain ``all_cls_scores``,
                ``all_bbox_preds`` and ``all_pts_preds``; the last entry
                along the first axis of each is decoded, and its first
                axis is taken as the batch.

        Returns:
            list[dict]: One ``decode_single`` result per sample.
        """
        all_cls_scores = preds_dicts['all_cls_scores'][-1]
        all_bbox_preds = preds_dicts['all_bbox_preds'][-1]
        all_pts_preds = preds_dicts['all_pts_preds'][-1]
        batch_size = all_cls_scores.size()[0]
        return [
            self.decode_single(
                all_cls_scores[i], all_bbox_preds[i], all_pts_preds[i])
            for i in range(batch_size)
        ]
def denormalize_bbox(normalized_bboxes, pc_range):
    """Convert normalized box codes back to (cx, cy, cz, w, l, h, rot[, vx, vy]).

    The normalized layout along the last dimension is
    (cx, cy, log w, log l, cz, log h, sin(rot), cos(rot)[, vx, vy]).

    Args:
        normalized_bboxes (Tensor): Box codes; the last dimension holds
            the code, any number of leading dimensions is accepted.
        pc_range: Unused; kept for interface compatibility.

    Returns:
        Tensor: De-normalized boxes with last dimension 9 when velocity is
        present (code longer than 8), otherwise 7.
    """
    # rotation
    rot_sine = normalized_bboxes[..., 6:7]
    rot_cosine = normalized_bboxes[..., 7:8]
    rot = torch.atan2(rot_sine, rot_cosine)

    # center in the bev
    cx = normalized_bboxes[..., 0:1]
    cy = normalized_bboxes[..., 1:2]
    cz = normalized_bboxes[..., 4:5]

    # size is stored in log space
    w = normalized_bboxes[..., 2:3].exp()
    l = normalized_bboxes[..., 3:4].exp()
    h = normalized_bboxes[..., 5:6].exp()

    parts = [cx, cy, cz, w, l, h, rot]
    if normalized_bboxes.size(-1) > 8:
        # velocity: index the LAST dimension like every other field, so
        # inputs with extra leading dimensions decode correctly.
        parts.append(normalized_bboxes[..., 8:9])
        parts.append(normalized_bboxes[..., 9:10])
    return torch.cat(parts, dim=-1)
def _calc_dynamic_intervals(start_interval, dynamic_interval_list):
    """Split ``(milestone, interval)`` pairs into two parallel lists.

    Args:
        start_interval: Interval placed first in the returned interval list,
            paired with the leading milestone ``0``.
        dynamic_interval_list (list[tuple]): Pairs whose first item is a
            milestone and whose second item is the interval for it.

    Returns:
        tuple[list, list]: ``(milestones, intervals)``; both lists start
            with the entry for milestone ``0`` and have equal length.
    """
    assert mmcv.is_list_of(dynamic_interval_list, tuple)

    milestones = [0] + [pair[0] for pair in dynamic_interval_list]
    intervals = [start_interval] + [pair[1] for pair in dynamic_interval_list]
    return milestones, intervals
self.dynamic_intervals = \ + _calc_dynamic_intervals(self.interval, dynamic_intervals) + + def _decide_interval(self, runner): + if self.use_dynamic_intervals: + progress = runner.epoch if self.by_epoch else runner.iter + step = bisect.bisect(self.dynamic_milestones, (progress + 1)) + # Dynamically modify the evaluation interval + self.interval = self.dynamic_intervals[step - 1] + + def before_train_epoch(self, runner): + """Evaluate the model only at the start of training by epoch.""" + self._decide_interval(runner) + super().before_train_epoch(runner) + + def before_train_iter(self, runner): + self._decide_interval(runner) + super().before_train_iter(runner) + + def _do_evaluate(self, runner): + """perform evaluation and save ckpt.""" + # Synchronization of BatchNorm's buffer (running_mean + # and running_var) is not supported in the DDP of pytorch, + # which may cause the inconsistent performance of models in + # different ranks, so we broadcast BatchNorm's buffers + # of rank 0 to other ranks to avoid this. 
class KITTI2Waymo(object):
    """Converter that turns KITTI-format predictions into Waymo protos.

    Args:
        kitti_result_files (list[dict]): Predictions in KITTI format.
        waymo_tfrecords_dir (str): Directory to load waymo raw data.
        waymo_results_save_dir (str): Directory to save converted predictions
            in waymo format (.bin files).
        waymo_results_final_path (str): Path to save combined
            predictions in waymo format (.bin file), like 'a/b/c.bin'.
        prefix (str): Prefix of filename. In general, 0 for training, 1 for
            validation and 2 for testing.
        workers (int): Number of parallel processes; passed through
            ``int()`` before being stored.
    """

    def __init__(self,
                 kitti_result_files,
                 waymo_tfrecords_dir,
                 waymo_results_save_dir,
                 waymo_results_final_path,
                 prefix,
                 workers=64):
        self.kitti_result_files = kitti_result_files
        self.waymo_tfrecords_dir = waymo_tfrecords_dir
        self.waymo_results_save_dir = waymo_results_save_dir
        self.waymo_results_final_path = waymo_results_final_path
        self.prefix = prefix
        self.workers = int(workers)

        # First sample index (as a string) of every non-empty result -> its
        # position in ``kitti_result_files``.
        self.name2idx = {
            str(result['sample_idx'][0]): position
            for position, result in enumerate(kitti_result_files)
            if len(result['sample_idx']) > 0
        }

        # turn on eager execution for older tensorflow versions
        tf_major = int(tf.__version__.split('.')[0])
        if tf_major < 2:
            tf.enable_eager_execution()

        self.k2w_cls_map = {
            'Car': label_pb2.Label.TYPE_VEHICLE,
            'Pedestrian': label_pb2.Label.TYPE_PEDESTRIAN,
            'Sign': label_pb2.Label.TYPE_SIGN,
            'Cyclist': label_pb2.Label.TYPE_CYCLIST,
        }

        self.T_ref_to_front_cam = np.array([
            [0.0, 0.0, 1.0, 0.0],
            [-1.0, 0.0, 0.0, 0.0],
            [0.0, -1.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 1.0],
        ])

        self.get_file_names()
        self.create_folder()

    def get_file_names(self):
        """Collect the sorted ``*.tfrecord`` paths of the waymo raw data."""
        pattern = join(self.waymo_tfrecords_dir, '*.tfrecord')
        self.waymo_tfrecord_pathnames = sorted(glob(pattern))
        print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.')

    def create_folder(self):
        """Make sure the directory for converted results exists."""
        mmcv.mkdir_or_exist(self.waymo_results_save_dir)

    def parse_objects(self, kitti_result, T_k2w, context_name,
                      frame_timestamp_micros):
        """Convert every instance of one KITTI prediction to an `Object`.

        Args:
            kitti_result (dict): Predictions in kitti format. The keys
                ``name``, ``dimensions``, ``location``, ``rotation_y`` and
                ``score`` are read; ``dimensions`` columns 0/1/2 are used as
                length/height/width.
            T_k2w (np.ndarray): Transformation matrix from kitti to waymo.
            context_name (str): Context name of the frame.
            frame_timestamp_micros (int): Frame timestamp.

        Returns:
            :obj:`Objects`: Predictions in waymo dataset Objects proto.
        """
        names = kitti_result['name']
        dimensions = kitti_result['dimensions']
        locations = kitti_result['location']
        rotations = kitti_result['rotation_y']
        scores = kitti_result['score']

        def build_object(instance_idx):
            """Build the `Object` proto for the instance at this index."""
            cls = names[instance_idx]
            length = round(dimensions[instance_idx, 0], 4)
            height = round(dimensions[instance_idx, 1], 4)
            width = round(dimensions[instance_idx, 2], 4)
            x = round(locations[instance_idx, 0], 4)
            y = round(locations[instance_idx, 1], 4)
            z = round(locations[instance_idx, 2], 4)
            rotation_y = round(rotations[instance_idx], 4)
            score = round(scores[instance_idx], 4)

            # y: downwards; move box origin from bottom center (kitti) to
            # true center (waymo)
            y -= height / 2
            # frame transformation: kitti -> waymo
            x, y, z = self.transform(T_k2w, x, y, z)

            # different conventions; afterwards wrap into [-pi, pi]
            heading = -(rotation_y + np.pi / 2)
            while heading < -np.pi:
                heading += 2 * np.pi
            while heading > np.pi:
                heading -= 2 * np.pi

            box = label_pb2.Label.Box()
            box.center_x = x
            box.center_y = y
            box.center_z = z
            box.length = length
            box.width = width
            box.height = height
            box.heading = heading

            converted = metrics_pb2.Object()
            converted.object.box.CopyFrom(box)
            converted.object.type = self.k2w_cls_map[cls]
            converted.score = score

            converted.context_name = context_name
            converted.frame_timestamp_micros = frame_timestamp_micros
            return converted

        objects = metrics_pb2.Objects()
        for instance_idx in range(len(names)):
            objects.objects.append(build_object(instance_idx))
        return objects

    def convert_one(self, file_idx):
        """Convert every frame of one tfrecord file and write one .bin each.

        Args:
            file_idx (int): Index of the file to be converted.
        """
        file_pathname = self.waymo_tfrecord_pathnames[file_idx]
        file_data = tf.data.TFRecordDataset(file_pathname, compression_type='')

        for frame_num, frame_data in enumerate(file_data):
            frame = open_dataset.Frame()
            frame.ParseFromString(bytearray(frame_data.numpy()))
            filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}'

            for camera in frame.context.camera_calibrations:
                # FRONT = 1, see dataset.proto for details
                if camera.name == 1:
                    T_front_cam_to_vehicle = np.array(
                        camera.extrinsic.transform).reshape(4, 4)

            T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam

            context_name = frame.context.name
            frame_timestamp_micros = frame.timestamp_micros

            if filename not in self.name2idx:
                # No prediction for this frame: write an empty proto.
                print(filename, 'not found.(bevformer)')
                objects = metrics_pb2.Objects()
            else:
                kitti_result = \
                    self.kitti_result_files[self.name2idx[filename]]
                objects = self.parse_objects(kitti_result, T_k2w,
                                             context_name,
                                             frame_timestamp_micros)

            out_path = join(self.waymo_results_save_dir, f'{filename}.bin')
            with open(out_path, 'wb') as f:
                f.write(objects.SerializeToString())

    def convert(self):
        """Convert all files in parallel, then merge them into one .bin."""
        print('Start converting ...')
        mmcv.track_parallel_progress(self.convert_one, range(len(self)),
                                     self.workers)
        print('\nFinished ...')

        # combine all files into one .bin
        pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin')))
        combined = self.combine(pathnames)

        with open(self.waymo_results_final_path, 'wb') as f:
            f.write(combined.SerializeToString())

    def __len__(self):
        """Number of tfrecord files found by ``get_file_names``."""
        return len(self.waymo_tfrecord_pathnames)

    def transform(self, T, x, y, z):
        """Apply the homogeneous transform ``T`` to the point (x, y, z).

        Args:
            T (np.ndarray): Transformation matrix.
            x(float): Coordinate in x axis.
            y(float): Coordinate in y axis.
            z(float): Coordinate in z axis.

        Returns:
            list: Coordinates after transformation.
        """
        homogeneous = np.array([x, y, z, 1.0]).reshape(4, 1)
        moved = np.matmul(T, homogeneous)
        return moved[:3].flatten().tolist()

    def combine(self, pathnames):
        """Merge the per-sample prediction files into a single proto.

        Args:
            pathnames (list[str]): Paths of the saved per-sample .bin files.

        Returns:
            :obj:`Objects`: Combined predictions in Objects proto.
        """
        combined = metrics_pb2.Objects()

        for pathname in pathnames:
            per_sample = metrics_pb2.Objects()
            with open(pathname, 'rb') as f:
                per_sample.ParseFromString(f.read())
            for entry in per_sample.objects:
                combined.objects.append(entry)

        return combined
class LiDARInstanceLines(object):
    """Line instances in LiDAR coordinates with tensor conversions.

    Every public property turns the stored ``LineString`` instances into a
    ``float32`` tensor whose x / y coordinates are clamped to the patch:
    x to ``[-patch_size[1] / 2, patch_size[1] / 2]`` and y to
    ``[-patch_size[0] / 2, patch_size[0] / 2]``.

    Args:
        instance_line_list (list): Line instances; when non-empty the first
            element must be a ``LineString``.
        sample_dist: Stored as ``self.sample_dist``; not read by this class.
        num_samples: Stored as ``self.num_samples``; not read by this class.
        padding: Stored as ``self.padding``; not read by this class.
        fixed_num (int): Number of points sampled along every instance.
        padding_value: Fill value for padded shift slots.
        patch_size: Indexable pair; index 0 bounds y and index 1 bounds x.
            Must not be ``None``.
    """

    def __init__(self,
                 instance_line_list,
                 sample_dist=1,
                 num_samples=250,
                 padding=False,
                 fixed_num=-1,
                 padding_value=-10000,
                 patch_size=None):
        assert isinstance(instance_line_list, list)
        assert patch_size is not None
        if len(instance_line_list) != 0:
            assert isinstance(instance_line_list[0], LineString)
        self.patch_size = patch_size
        self.max_x = self.patch_size[1] / 2
        self.max_y = self.patch_size[0] / 2
        self.sample_dist = sample_dist
        self.num_samples = num_samples
        self.padding = padding
        self.fixed_num = fixed_num
        self.padding_value = padding_value

        self.instance_list = instance_line_list

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _clamp_xy(self, tensor, x_dims=(0,), y_dims=(1,)):
        """Clamp the given last-dimension columns to the patch, in place."""
        for dim in x_dims:
            tensor[..., dim] = torch.clamp(
                tensor[..., dim], min=-self.max_x, max=self.max_x)
        for dim in y_dims:
            tensor[..., dim] = torch.clamp(
                tensor[..., dim], min=-self.max_y, max=self.max_y)
        return tensor

    @staticmethod
    def _sample_line(line, distances, coords_num):
        """Interpolate ``line`` at ``distances`` -> (len, coords_num) array."""
        return np.array(
            [list(line.interpolate(distance).coords) for distance in distances]
        ).reshape(-1, coords_num)

    def _pad_shifts(self, shift_pts, target_num):
        """Append ``padding_value`` slots until dim 0 equals ``target_num``."""
        padding = torch.full(
            [target_num - shift_pts.shape[0], shift_pts.shape[1],
             shift_pts.shape[-1]],
            self.padding_value)
        return torch.cat([shift_pts, padding], dim=0)

    def _roll_shifts(self, sampled_points):
        """Shift variants built by rolling the full sampled point sequence.

        Closed instances (first point equals last) get one roll per point;
        open ones get (forward, reversed) and are padded to the same count.
        """
        instances_list = []
        for fixed_num_pts in sampled_points:
            # [fixed_num, coords]
            is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
            fixed_num = fixed_num_pts.shape[0]
            if is_poly:
                shift_pts_list = [fixed_num_pts.roll(shift_right_i, 0)
                                  for shift_right_i in range(fixed_num)]
            else:
                shift_pts_list = [fixed_num_pts, fixed_num_pts.flip(0)]
            shift_pts = self._clamp_xy(torch.stack(shift_pts_list, dim=0))

            if not is_poly:
                shift_pts = self._pad_shifts(shift_pts, fixed_num)
            instances_list.append(shift_pts)
        return torch.stack(instances_list, dim=0).to(dtype=torch.float32)

    def _closed_roll_shifts(self, with_flip):
        """Shift variants that roll the open ring and then re-close it.

        For closed instances the duplicated end point is dropped, the
        remaining points are rolled (and, with ``with_flip``, also rolled in
        reversed order), and the first point is appended again. Open
        instances get (forward, reversed) plus padding.
        """
        directions = 2 if with_flip else 1
        instances_list = []
        for fixed_num_pts in self.fixed_num_sampled_points:
            # [fixed_num, coords]
            is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
            pts_num = fixed_num_pts.shape[0]
            shift_num = pts_num - 1
            total_shifts = shift_num * directions
            if is_poly:
                pts_to_shift = fixed_num_pts[:-1, :]
                shift_pts_list = [pts_to_shift.roll(shift_right_i, 0)
                                  for shift_right_i in range(shift_num)]
                if with_flip:
                    flip_pts_to_shift = pts_to_shift.flip(0)
                    shift_pts_list.extend(
                        flip_pts_to_shift.roll(shift_right_i, 0)
                        for shift_right_i in range(shift_num))
            else:
                shift_pts_list = [fixed_num_pts, fixed_num_pts.flip(0)]
            shift_pts = torch.stack(shift_pts_list, dim=0)

            if is_poly:
                # Re-close every rolled ring by repeating its first point.
                _, _, num_coords = shift_pts.shape
                tmp_shift_pts = shift_pts.new_zeros(
                    (total_shifts, pts_num, num_coords))
                tmp_shift_pts[:, :-1, :] = shift_pts
                tmp_shift_pts[:, -1, :] = shift_pts[:, 0, :]
                shift_pts = tmp_shift_pts

            shift_pts = self._clamp_xy(shift_pts)

            if not is_poly:
                shift_pts = self._pad_shifts(shift_pts, total_shifts)
            instances_list.append(shift_pts)
        return torch.stack(instances_list, dim=0).to(dtype=torch.float32)

    def _resampled_shifts(self, with_flip):
        """Shift variants that roll the ORIGINAL vertices, then resample.

        Closed instances are rolled vertex by vertex (optionally also in
        reversed order), re-closed, rebuilt as ``LineString`` and sampled at
        ``fixed_num`` distances. When that yields more variants than the
        target count, a random subset is kept via ``np.random.choice``;
        fewer variants are padded with ``padding_value``.
        """
        assert len(self.instance_list) != 0
        final_shift_num = self.fixed_num - 1
        target_num = 2 * final_shift_num if with_flip else final_shift_num
        instances_list = []
        for instance in self.instance_list:
            distances = np.linspace(0, instance.length, self.fixed_num)
            poly_pts = np.array(list(instance.coords))
            is_poly = np.equal(poly_pts[0], poly_pts[-1]).all()
            pts_num, coords_num = poly_pts.shape
            shift_num = pts_num - 1
            shift_pts_list = []
            if is_poly:
                pts_to_shift = poly_pts[:-1, :]
                rings = [pts_to_shift]
                if with_flip:
                    rings.append(np.flip(pts_to_shift, axis=0))
                for ring in rings:
                    for shift_right_i in range(shift_num):
                        shift_pts = np.roll(ring, shift_right_i, axis=0)
                        # repeat the first vertex to close the ring again
                        shift_pts = np.concatenate(
                            (shift_pts, shift_pts[:1]), axis=0)
                        shift_pts_list.append(self._sample_line(
                            LineString(shift_pts), distances, coords_num))
            else:
                sampled_points = self._sample_line(
                    instance, distances, coords_num)
                shift_pts_list.append(sampled_points)
                shift_pts_list.append(np.flip(sampled_points, axis=0))

            multi_shifts_pts = np.stack(shift_pts_list, axis=0)
            shifts_num = multi_shifts_pts.shape[0]

            if with_flip:
                if shifts_num > 2 * final_shift_num:
                    # keep the same rolls for both directions
                    index = np.random.choice(
                        shift_num, final_shift_num, replace=False)
                    multi_shifts_pts = np.concatenate(
                        (multi_shifts_pts[index],
                         multi_shifts_pts[index + shift_num]), axis=0)
            elif shifts_num > final_shift_num:
                index = np.random.choice(
                    shifts_num, final_shift_num, replace=False)
                multi_shifts_pts = multi_shifts_pts[index]

            multi_shifts_pts_tensor = to_tensor(multi_shifts_pts).to(
                dtype=torch.float32)
            multi_shifts_pts_tensor = self._clamp_xy(multi_shifts_pts_tensor)

            if multi_shifts_pts_tensor.shape[0] < target_num:
                multi_shifts_pts_tensor = self._pad_shifts(
                    multi_shifts_pts_tensor, target_num)
            instances_list.append(multi_shifts_pts_tensor)
        return torch.stack(instances_list, dim=0).to(dtype=torch.float32)

    # ------------------------------------------------------------------
    # public properties
    # ------------------------------------------------------------------
    @property
    def start_end_points(self):
        """
        return torch.Tensor([N,4]), in xstart, ystart, xend, yend form
        (columns 0/2 are clamped as x and 1/3 as y, which presumes 2D
        coordinates -- NOTE(review): confirm for instances with z)
        """
        assert len(self.instance_list) != 0
        instance_se_points_list = [
            list(instance.coords[0]) + list(instance.coords[-1])
            for instance in self.instance_list
        ]
        instance_se_points_tensor = to_tensor(
            np.array(instance_se_points_list)).to(dtype=torch.float32)
        return self._clamp_xy(
            instance_se_points_tensor, x_dims=(0, 2), y_dims=(1, 3))

    @property
    def bbox(self):
        """
        return torch.Tensor([N,4]), in xmin, ymin, xmax, ymax form
        """
        assert len(self.instance_list) != 0
        # bounds is bbox: [xmin, ymin, xmax, ymax]
        instance_bbox_list = [
            instance.bounds for instance in self.instance_list]
        instance_bbox_tensor = to_tensor(
            np.array(instance_bbox_list)).to(dtype=torch.float32)
        return self._clamp_xy(
            instance_bbox_tensor, x_dims=(0, 2), y_dims=(1, 3))

    @property
    def fixed_num_sampled_points(self):
        """
        return torch.Tensor([N,fixed_num,C]) of points sampled at evenly
        spaced distances along every instance; C is 3 for instances with
        ``has_z`` and 2 otherwise. N means the num of instances
        """
        assert len(self.instance_list) != 0
        instance_points_list = []
        for instance in self.instance_list:
            distances = np.linspace(0, instance.length, self.fixed_num)
            coords_num = 3 if instance.has_z else 2
            instance_points_list.append(
                self._sample_line(instance, distances, coords_num))
        instance_points_tensor = to_tensor(
            np.array(instance_points_list)).to(dtype=torch.float32)
        return self._clamp_xy(instance_points_tensor)

    @property
    def fixed_num_sampled_points_ambiguity(self):
        """
        return torch.Tensor([N,1,fixed_num,C]): the same points as
        ``fixed_num_sampled_points`` with a singleton dimension inserted at
        position 1. N means the num of instances
        """
        # The previous implementation sliced with an undefined ``is_3d``
        # name and raised NameError on every access. The sampled points
        # already carry z only when the instance has it, so no extra slicing
        # is needed.
        return self.fixed_num_sampled_points.unsqueeze(1)

    @property
    def fixed_num_sampled_points_torch(self):
        """
        return torch.Tensor([N,fixed_num,C]) obtained by linearly
        interpolating each instance's vertex sequence with
        ``torch.nn.functional.interpolate``. N means the num of instances
        """
        assert len(self.instance_list) != 0
        instance_points_list = []
        for instance in self.instance_list:
            poly_pts = to_tensor(np.array(list(instance.coords)))
            # interpolate expects (batch, channels, length)
            poly_pts = poly_pts.unsqueeze(0).permute(0, 2, 1)
            sampled_pts = torch.nn.functional.interpolate(
                poly_pts, size=(self.fixed_num), mode='linear',
                align_corners=True)
            sampled_pts = sampled_pts.permute(0, 2, 1).squeeze(0)
            instance_points_list.append(sampled_pts)
        instance_points_tensor = torch.stack(
            instance_points_list, dim=0).to(dtype=torch.float32)
        return self._clamp_xy(instance_points_tensor)

    @property
    def shift_fixed_num_sampled_points(self):
        """
        return [instances_num, num_shifts, fixed_num, C]
        """
        return self._roll_shifts(self.fixed_num_sampled_points)

    @property
    def shift_fixed_num_sampled_points_v1(self):
        """
        return [instances_num, num_shifts, fixed_num, C]
        """
        return self._closed_roll_shifts(with_flip=False)

    @property
    def shift_fixed_num_sampled_points_v2(self):
        """
        return [instances_num, num_shifts, fixed_num, C]
        """
        return self._resampled_shifts(with_flip=False)

    @property
    def shift_fixed_num_sampled_points_v3(self):
        """
        return [instances_num, num_shifts, fixed_num, C]
        """
        return self._resampled_shifts(with_flip=True)

    @property
    def shift_fixed_num_sampled_points_v4(self):
        """
        return [instances_num, num_shifts, fixed_num, C]
        """
        return self._closed_roll_shifts(with_flip=True)

    @property
    def shift_fixed_num_sampled_points_torch(self):
        """
        return [instances_num, num_shifts, fixed_num, C]
        """
        return self._roll_shifts(self.fixed_num_sampled_points_torch)
city_SE2_ego = SE3(lidar2global_rotation, lidar2global_translation) + ego_SE3_city = city_SE2_ego.inverse() + for vec_class in self.vec_classes: + if vec_class == 'divider': + line_geom = self.get_map_divider_geom(patch_box, patch_angle, map_elements[vec_class], ego_SE3_city) + line_instances_list = self.line_geoms_to_instances(line_geom) + for divider in line_instances_list: + vectors.append((divider, self.CLASS2LABEL.get('divider', -1))) + elif vec_class == 'ped_crossing': + ped_geom = self.get_map_ped_geom(patch_box, patch_angle, map_elements[vec_class], ego_SE3_city) + ped_instance_list = self.ped_poly_geoms_to_instances(ped_geom) + for instance in ped_instance_list: + vectors.append((instance, self.CLASS2LABEL.get('ped_crossing', -1))) + elif vec_class == 'boundary': + polygon_geom = self.get_map_boundary_geom(patch_box, patch_angle, map_elements[vec_class], ego_SE3_city) + poly_bound_list = self.bound_poly_geoms_to_instances(polygon_geom) + for bound in poly_bound_list: + vectors.append((bound, self.CLASS2LABEL.get('boundary', -1))) + else: + raise ValueError(f'WRONG vec_class: {vec_class}') + + # filter out -1 + filtered_vectors = [] + gt_pts_loc_3d = [] + gt_pts_num_3d = [] + gt_labels = [] + gt_instance = [] + for instance, type in vectors: + if type != -1: + gt_instance.append(instance) + gt_labels.append(type) + # import pdb;pdb.set_trace() + gt_instance = LiDARInstanceLines(gt_instance,self.sample_dist, + self.num_samples, self.padding, self.fixed_num,self.padding_value, patch_size=self.patch_size) + + anns_results = dict( + gt_vecs_pts_loc=gt_instance, + gt_vecs_label=gt_labels, + + ) + # import pdb;pdb.set_trace() + return anns_results + def proc_polygon(self, polygon, ego_SE3_city): + # import pdb;pdb.set_trace() + interiors = [] + exterior_cityframe = np.array(list(polygon.exterior.coords)) + exterior_egoframe = ego_SE3_city.transform_point_cloud(exterior_cityframe) + for inter in polygon.interiors: + inter_cityframe = np.array(list(inter.coords)) + 
inter_egoframe = ego_SE3_city.transform_point_cloud(inter_cityframe) + interiors.append(inter_egoframe[:,:2]) + + new_polygon = Polygon(exterior_egoframe[:,:2], interiors) + return new_polygon + + def get_map_boundary_geom(self, patch_box, patch_angle, avm, ego_SE3_city): + map_boundary_geom = [] + patch_x = patch_box[0] + patch_y = patch_box[1] + patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle) + # import pdb;pdb.set_trace() + polygon_list = [] + for da in avm: + exterior_coords = da + # import pdb;pdb.set_trace() + interiors = [] + # import pdb;pdb.set_trace() + is_polygon = np.array_equal(exterior_coords[0],exterior_coords[-1]) + if is_polygon: + polygon = Polygon(exterior_coords, interiors) + else: + polygon = LineString(exterior_coords) + raise ValueError(f'WRONG type: line in boundary') + if is_polygon: + if polygon.is_valid: + new_polygon = polygon.intersection(patch) + if not new_polygon.is_empty: + # import pdb;pdb.set_trace() + if new_polygon.geom_type == 'Polygon': + if not new_polygon.is_valid: + continue + new_polygon = self.proc_polygon(new_polygon,ego_SE3_city) + if not new_polygon.is_valid: + continue + elif new_polygon.geom_type == 'MultiPolygon': + polygons = [] + for single_polygon in new_polygon.geoms: + if not single_polygon.is_valid or single_polygon.is_empty: + continue + new_single_polygon = self.proc_polygon(single_polygon,ego_SE3_city) + if not new_single_polygon.is_valid: + continue + polygons.append(new_single_polygon) + if len(polygons) == 0: + continue + new_polygon = MultiPolygon(polygons) + if not new_polygon.is_valid: + continue + else: + raise ValueError('{} is not valid'.format(new_polygon.geom_type)) + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + polygon_list.append(new_polygon) + else: + raise ValueError(f'WRONG type: line in boundary') + map_boundary_geom.append(('boundary',polygon_list)) + return map_boundary_geom + + def get_map_ped_geom(self, patch_box, patch_angle, 
avm, ego_SE3_city): + map_ped_geom = [] + patch_x = patch_box[0] + patch_y = patch_box[1] + patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle) + # import pdb;pdb.set_trace() + polygon_list = [] + for pc in avm: + exterior_coords = pc + interiors = [] + polygon = Polygon(exterior_coords, interiors) + if polygon.is_valid: + new_polygon = polygon.intersection(patch) + if not new_polygon.is_empty: + if new_polygon.geom_type == 'Polygon': + if not new_polygon.is_valid: + continue + new_polygon = self.proc_polygon(new_polygon,ego_SE3_city) + if not new_polygon.is_valid: + continue + elif new_polygon.geom_type == 'MultiPolygon': + polygons = [] + for single_polygon in new_polygon.geoms: + if not single_polygon.is_valid or single_polygon.is_empty: + continue + new_single_polygon = self.proc_polygon(single_polygon,ego_SE3_city) + if not new_single_polygon.is_valid: + continue + polygons.append(new_single_polygon) + if len(polygons) == 0: + continue + new_polygon = MultiPolygon(polygons) + if not new_polygon.is_valid: + continue + else: + raise ValueError('{} is not valid'.format(new_polygon.geom_type)) + + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + polygon_list.append(new_polygon) + map_ped_geom.append(('ped_crossing',polygon_list)) + return map_ped_geom + + def proc_line(self, line,ego_SE3_city): + # import pdb;pdb.set_trace() + new_line_pts_cityframe = np.array(list(line.coords)) + new_line_pts_egoframe = ego_SE3_city.transform_point_cloud(new_line_pts_cityframe) + line = LineString(new_line_pts_egoframe[:,:2]) #TODO + return line + + def get_map_divider_geom(self, patch_box, patch_angle, avm, ego_SE3_city): + map_divider_geom = [] + patch_x = patch_box[0] + patch_y = patch_box[1] + patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle) + line_list = [] + for ls in avm: + line = LineString(ls) + if line.is_empty: # Skip lines without nodes. 
+ continue + new_line = line.intersection(patch) + if not new_line.is_empty: + # import pdb;pdb.set_trace() + if new_line.geom_type == 'MultiLineString': + for single_line in new_line.geoms: + if single_line.is_empty: + continue + + single_line = self.proc_line(single_line,ego_SE3_city) + line_list.append(single_line) + else: + new_line = self.proc_line(new_line, ego_SE3_city) + line_list.append(new_line) + map_divider_geom.append(('divider',line_list)) + return map_divider_geom + + + def _one_type_line_geom_to_instances(self, line_geom): + line_instances = [] + + for line in line_geom: + if not line.is_empty: + if line.geom_type == 'MultiLineString': + for single_line in line.geoms: + line_instances.append(single_line) + elif line.geom_type == 'LineString': + line_instances.append(line) + else: + raise NotImplementedError + return line_instances + + + def ped_poly_geoms_to_instances(self, ped_geom): + ped = ped_geom[0][1] + # union_segments = ops.unary_union(ped) + # union_segments = MultiPolygon(ped) + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + # local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + local_patch = box(-max_x - 0.2, -max_y - 0.2, max_x + 0.2, max_y + 0.2) + exteriors = [] + interiors = [] + # if union_segments.geom_type != 'MultiPolygon': + # union_segments = MultiPolygon([union_segments]) + for segments in ped: + if segments.geom_type != 'MultiPolygon': + segments = MultiPolygon([segments]) + for poly in segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, 
MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_instances(results) + + def bound_poly_geoms_to_instances(self, polygon_geom): + # roads = polygon_geom[0][1] + # lanes = polygon_geom[1][1] + # union_roads = ops.unary_union(roads) + # union_lanes = ops.unary_union(lanes) + # union_segments = ops.unary_union([union_roads, union_lanes]) + # import pdb;pdb.set_trace() + bounds = polygon_geom[0][1] + + union_segments = ops.unary_union(bounds) + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_instances(results) + + def line_geoms_to_instances(self, line_geom): + lines = line_geom[0][1] + multiline = MultiLineString(lines) + union_lines = ops.unary_union(multiline) + if union_lines.geom_type == 'LineString': + return self._one_type_line_geom_to_instances([union_lines]) + before_num = len(union_lines.geoms) + # import pdb;pdb.set_trace() + merged_lines = ops.linemerge(union_lines) + if merged_lines.geom_type == 'LineString': + return self._one_type_line_geom_to_instances([merged_lines]) + after_num = len(merged_lines.geoms) + # import pdb;pdb.set_trace() + while after_num != 
before_num: + before_num = len(merged_lines.geoms) + merged_lines = ops.unary_union(merged_lines) + if merged_lines.geom_type == 'LineString': + break + merged_lines = ops.linemerge(merged_lines) + if merged_lines.geom_type == 'LineString': + break + after_num = len(merged_lines.geoms) + + return self._one_type_line_geom_to_instances([merged_lines]) + + + + def sample_pts_from_line(self, line): + if self.fixed_num < 0: + distances = np.arange(0, line.length, self.sample_dist) + sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + else: + # fixed number of points, so distance is line.length / self.fixed_num + distances = np.linspace(0, line.length, self.fixed_num) + sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + + + num_valid = len(sampled_points) + + if not self.padding or self.fixed_num > 0: + # fixed num sample can return now! + return sampled_points, num_valid + + # fixed distance sampling need padding! + num_valid = len(sampled_points) + + if self.fixed_num < 0: + if num_valid < self.num_samples: + padding = np.zeros((self.num_samples - len(sampled_points), 2)) + sampled_points = np.concatenate([sampled_points, padding], axis=0) + else: + sampled_points = sampled_points[:self.num_samples, :] + num_valid = self.num_samples + + + return sampled_points, num_valid + + +@DATASETS.register_module() +class CustomAV2LocalMapDataset(CustomNuScenesDataset): + r"""NuScenes Dataset. 
+ + This datset add static map elements + """ + MAPCLASSES = ('divider',) + def __init__(self, + map_ann_file=None, + queue_length=4, + code_size=2, + bev_size=(200, 200), + pc_range=(-51.2, -51.2, -5.0, 51.2, 51.2, 3.0), + overlap_test=False, + fixed_ptsnum_per_line=-1, + eval_use_same_gt_sample_num_flag=False, + padding_value=-10000, + map_classes=None, + *args, + **kwargs): + super().__init__(*args, **kwargs) + self.map_ann_file = map_ann_file + + self.code_size = code_size + self.queue_length = queue_length + self.overlap_test = overlap_test + self.bev_size = bev_size + + self.MAPCLASSES = self.get_map_classes(map_classes) + self.NUM_MAPCLASSES = len(self.MAPCLASSES) + self.pc_range = pc_range + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + self.patch_size = (patch_h, patch_w) + self.padding_value = padding_value + self.fixed_num = fixed_ptsnum_per_line + self.eval_use_same_gt_sample_num_flag = eval_use_same_gt_sample_num_flag + self.vector_map = VectorizedAV2LocalMap(kwargs['data_root'], + patch_size=self.patch_size, test_mode=self.test_mode, + map_classes=self.MAPCLASSES, + fixed_ptsnum_per_line=fixed_ptsnum_per_line, + padding_value=self.padding_value) + self.is_vis_on_test = False + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations sorted by timestamps. + """ + # import pdb;pdb.set_trace() + data = mmcv.load(ann_file) + # import pdb;pdb.set_trace() + data_infos = list(sorted(data['samples'], key=lambda e: e['timestamp'])) + data_infos = data_infos[::self.load_interval] + # data_infos = [ data_info.update(dict(token= str(data_info['timestamp']+data_info['log_id']))) for data_info in data_infos] + self.id2map = data['id2map'] + self.metadata = None + self.version = None + return data_infos + + @classmethod + def get_map_classes(cls, map_classes=None): + """Get class names of current dataset. 
+ + Args: + classes (Sequence[str] | str | None): If classes is None, use + default CLASSES defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the CLASSES defined by the dataset. + + Return: + list[str]: A list of class names. + """ + if map_classes is None: + return cls.MAPCLASSES + + if isinstance(map_classes, str): + # take it as a file path + class_names = mmcv.list_from_file(map_classes) + elif isinstance(map_classes, (tuple, list)): + class_names = map_classes + else: + raise ValueError(f'Unsupported type {type(map_classes)} of map classes.') + + return class_names + def vectormap_pipeline(self, example, input_dict): + ''' + `example` type: + keys: 'img_metas', 'gt_bboxes_3d', 'gt_labels_3d', 'img'; + all keys type is 'DataContainer'; + 'img_metas' cpu_only=True, type is dict, others are false; + 'gt_labels_3d' shape torch.size([num_samples]), stack=False, + padding_value=0, cpu_only=False + 'gt_bboxes_3d': stack=False, cpu_only=True + ''' + # import pdb;pdb.set_trace() + location = input_dict['log_id'] + e2g_translation = input_dict['e2g_translation'] + e2g_rotation = input_dict['e2g_rotation'] + map_elements = self.id2map[location] + anns_results = self.vector_map.gen_vectorized_samples(location, map_elements, e2g_translation, e2g_rotation) + + ''' + anns_results, type: dict + 'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates + 'gt_vecs_pts_num': list[num_vecs], vec with num_points + 'gt_vecs_label': list[num_vecs], vec with cls index + ''' + gt_vecs_label = to_tensor(anns_results['gt_vecs_label']) + if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines): + gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc'] + else: + gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc']) + try: + gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32) + except Exception as e: + 
# empty tensor, will be passed in train, + # but we preserve it for test + + # import pdb;pdb.set_trace() + gt_vecs_pts_loc = gt_vecs_pts_loc + # import ipdb;ipdb.set_trace() + example['gt_labels_3d'] = DC(gt_vecs_label, cpu_only=False) + example['gt_bboxes_3d'] = DC(gt_vecs_pts_loc, cpu_only=True) + # import pdb;pdb.set_trace() + # if self.is_vis_on_test: + # lidar2global_translation = to_tensor(lidar2global_translation) + # example['lidar2global_translation'] = DC(lidar2global_translation, cpu_only=True) + # else: + # example['img_metas'].data['lidar2global_translation'] = lidar2global_translation + return example + + def prepare_train_data(self, index): + """ + Training data preparation. + Args: + index (int): Index for accessing the target data. + Returns: + dict: Training data dict of the corresponding index. + """ + data_queue = [] + + # temporal aug + prev_indexs_list = list(range(index-self.queue_length, index)) + random.shuffle(prev_indexs_list) + prev_indexs_list = sorted(prev_indexs_list[1:], reverse=True) + ## + + input_dict = self.get_data_info(index) + if input_dict is None: + return None + frame_idx = input_dict['timestamp'] + scene_token = input_dict['log_id'] + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + # import pdb;pdb.set_trace() + example = self.vectormap_pipeline(example,input_dict) + if self.filter_empty_gt and \ + (example is None or ~(example['gt_labels_3d']._data != -1).any()): + return None + data_queue.insert(0, example) + return self.union2one(data_queue) + + def union2one(self, queue): + """ + convert sample queue into one single sample. 
+ """ + imgs_list = [each['img'].data for each in queue] + metas_map = {} + prev_pos = None + prev_angle = None + for i, each in enumerate(queue): + metas_map[i] = each['img_metas'].data + if i == 0: + metas_map[i]['prev_bev'] = False + prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] = 0 + metas_map[i]['can_bus'][-1] = 0 + else: + metas_map[i]['prev_bev'] = True + tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] -= prev_pos + metas_map[i]['can_bus'][-1] -= prev_angle + prev_pos = copy.deepcopy(tmp_pos) + prev_angle = copy.deepcopy(tmp_angle) + + queue[-1]['img'] = DC(torch.stack(imgs_list), + cpu_only=False, stack=True) + queue[-1]['img_metas'] = DC(metas_map, cpu_only=True) + queue = queue[-1] + return queue + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - sweeps (list[dict]): Infos of sweeps. + - timestamp (float): Sample timestamp. + - img_filename (str, optional): Image filename. + - lidar2img (list[np.ndarray], optional): Transformations \ + from lidar to different cameras. + - ann_info (dict): Annotation info. 
+ """ + info = self.data_infos[index] + # standard protocal modified from SECOND.Pytorch + input_dict = dict( + timestamp=info['timestamp'], + e2g_translation=info['e2g_translation'], + e2g_rotation=info['e2g_rotation'], + log_id=info['log_id'], + scene_token=info['log_id'], + ) + if self.modality['use_camera']: + image_paths = [] + cam_intrinsics = [] + lidar2img_rts = [] + lidar2cam_rts = [] + cam_types = [] + for cam_type, cam_info in info['cams'].items(): + image_paths.append(cam_info['img_fpath']) + # camera intrinsics + camera_intrinsics = np.eye(4).astype(np.float32) + camera_intrinsics[:3, :3] = cam_info["intrinsics"] + # input_dict["camera_intrinsics"].append(camera_intrinsics) + + # ego2img, ego = lidar + lidar2cam_rt = cam_info['extrinsics'] + intrinsic = cam_info['intrinsics'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt) + lidar2img_rts.append(lidar2img_rt) + lidar2cam_rts.append(lidar2cam_rt) + cam_intrinsics.append(viewpad) + cam_types.append(cam_type) + + + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + cam_intrinsic=cam_intrinsics, + lidar2cam=lidar2cam_rts, + cam_type=cam_types, + )) + + if not self.test_mode: + # annos = self.get_ann_info(index) + input_dict['ann_info'] = dict() + + translation = input_dict['e2g_translation'] + can_bus = np.ones(18) + # can_bus.extend(translation.tolist()) + can_bus[:3] = translation + rotation = Quaternion._from_matrix(input_dict['e2g_rotation']) + can_bus[3:7] = rotation + patch_angle = quaternion_yaw(rotation) / np.pi * 180 + if patch_angle < 0: + patch_angle += 360 + can_bus[-2] = patch_angle / 180 * np.pi + can_bus[-1] = patch_angle + input_dict['can_bus'] = can_bus + # import pdb;pdb.set_trace() + return input_dict + + def prepare_test_data(self, index): + """Prepare data for testing. + + Args: + index (int): Index for accessing the target data. 
+ + Returns: + dict: Testing data dict of the corresponding index. + """ + input_dict = self.get_data_info(index) + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + if self.is_vis_on_test: + example = self.vectormap_pipeline(example, input_dict) + return example + + def __getitem__(self, idx): + """Get item from infos according to the given index. + Returns: + dict: Data dictionary of the corresponding index. + """ + if self.test_mode: + return self.prepare_test_data(idx) + while True: + + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + def _format_gt(self): + gt_annos = [] + # import pdb;pdb.set_trace() + print('Start to convert gt map format...') + assert self.map_ann_file is not None + if (not os.path.exists(self.map_ann_file)) : + dataset_length = len(self) + prog_bar = mmcv.ProgressBar(dataset_length) + mapped_class_names = self.MAPCLASSES + for sample_id in range(dataset_length): + sample_token = self.data_infos[sample_id]['token'] + gt_anno = {} + gt_anno['sample_token'] = sample_token + # gt_sample_annos = [] + gt_sample_dict = {} + gt_sample_dict = self.vectormap_pipeline(gt_sample_dict, self.data_infos[sample_id]) + gt_labels = gt_sample_dict['gt_labels_3d'].data.numpy() + gt_vecs = gt_sample_dict['gt_bboxes_3d'].data.instance_list + # import pdb;pdb.set_trace() + gt_vec_list = [] + for i, (gt_label, gt_vec) in enumerate(zip(gt_labels, gt_vecs)): + name = mapped_class_names[gt_label] + anno = dict( + pts=np.array(list(gt_vec.coords))[:,:self.code_size], + pts_num=len(list(gt_vec.coords)), + cls_name=name, + type=gt_label, + ) + gt_vec_list.append(anno) + gt_anno['vectors']=gt_vec_list + gt_annos.append(gt_anno) + + prog_bar.update() + nusc_submissions = { + 'GTs': gt_annos + } + print('\n GT anns writes to', self.map_ann_file) + mmcv.dump(nusc_submissions, self.map_ann_file) + else: + print(f'{self.map_ann_file} exist, not update') + + def _format_bbox(self, results, 
jsonfile_prefix=None): + """Convert the results to the standard format. + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + + Returns: + str: Path of the output json file. + """ + assert self.map_ann_file is not None + pred_annos = [] + mapped_class_names = self.MAPCLASSES + # import pdb;pdb.set_trace() + print('Start to convert map detection format...') + for sample_id, det in enumerate(mmcv.track_iter_progress(results)): + pred_anno = {} + vecs = output_to_vecs(det) + sample_token = self.data_infos[sample_id]['token'] + pred_anno['sample_token'] = sample_token + pred_vec_list=[] + for i, vec in enumerate(vecs): + name = mapped_class_names[vec['label']] + anno = dict( + # sample_token=sample_token, + pts=vec['pts'], + pts_num=len(vec['pts']), + cls_name=name, + type=vec['label'], + confidence_level=vec['score']) + pred_vec_list.append(anno) + # annos.append(nusc_anno) + # nusc_annos[sample_token] = annos + pred_anno['vectors'] = pred_vec_list + pred_annos.append(pred_anno) + + + if not os.path.exists(self.map_ann_file): + self._format_gt() + else: + print(f'{self.map_ann_file} exist, not update') + # with open(self.map_ann_file,'r') as f: + # GT_anns = json.load(f) + # gt_annos = GT_anns['GTs'] + nusc_submissions = { + 'meta': self.modality, + 'results': pred_annos, + # 'GTs': gt_annos + } + + mmcv.mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'nuscmap_results.json') + print('Results writes to', res_path) + mmcv.dump(nusc_submissions, res_path) + return res_path + + def to_gt_vectors(self, + gt_dict): + # import pdb;pdb.set_trace() + gt_labels = gt_dict['gt_labels_3d'].data + gt_instances = gt_dict['gt_bboxes_3d'].data.instance_list + + gt_vectors = [] + + for gt_instance, gt_label in zip(gt_instances, gt_labels): + pts, pts_num = sample_pts_from_line(gt_instance, 
patch_size=self.patch_size) + gt_vectors.append({ + 'pts': pts, + 'pts_num': pts_num, + 'type': int(gt_label) + }) + vector_num_list = {} + for i in range(self.NUM_MAPCLASSES): + vector_num_list[i] = [] + for vec in gt_vectors: + if vector['pts_num'] >= 2: + vector_num_list[vector['type']].append((LineString(vector['pts'][:vector['pts_num']]), vector.get('confidence_level', 1))) + return gt_vectors + + def _evaluate_single(self, + result_path, + logger=None, + metric='chamfer', + result_name='pts_bbox'): + """Evaluation for a single model in nuScenes protocol. + + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + metric (str): Metric name used for evaluation. Default: 'bbox'. + result_name (str): Result name in the metric prefix. + Default: 'pts_bbox'. + + Returns: + dict: Dictionary of evaluation details. + """ + from projects.mmdet3d_plugin.datasets.map_utils.mean_ap import eval_map + from projects.mmdet3d_plugin.datasets.map_utils.mean_ap import format_res_gt_by_classes + result_path = osp.abspath(result_path) + # import pdb;pdb.set_trace() + detail = dict() + + print('Formating results & gts by classes') + with open(result_path,'r') as f: + pred_results = json.load(f) + gen_results = pred_results['results'] + with open(self.map_ann_file,'r') as ann_f: + gt_anns = json.load(ann_f) + annotations = gt_anns['GTs'] + cls_gens, cls_gts = format_res_gt_by_classes(result_path, + gen_results, + annotations, + cls_names=self.MAPCLASSES, + num_pred_pts_per_instance=self.fixed_num, + eval_use_same_gt_sample_num_flag=self.eval_use_same_gt_sample_num_flag, + pc_range=self.pc_range) + + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['chamfer', 'iou'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + for metric in metrics: + print('-*'*10+f'use 
metric:{metric}'+'-*'*10) + + if metric == 'chamfer': + thresholds = [0.5,1.0,1.5] + elif metric == 'iou': + thresholds= np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + cls_aps = np.zeros((len(thresholds),self.NUM_MAPCLASSES)) + + for i, thr in enumerate(thresholds): + print('-*'*10+f'threshhold:{thr}'+'-*'*10) + mAP, cls_ap = eval_map( + gen_results, + annotations, + cls_gens, + cls_gts, + threshold=thr, + cls_names=self.MAPCLASSES, + logger=logger, + num_pred_pts_per_instance=self.fixed_num, + pc_range=self.pc_range, + metric=metric) + for j in range(self.NUM_MAPCLASSES): + cls_aps[i, j] = cls_ap[j]['ap'] + + for i, name in enumerate(self.MAPCLASSES): + print('{}: {}'.format(name, cls_aps.mean(0)[i])) + detail['AV2Map_{}/{}_AP'.format(metric,name)] = cls_aps.mean(0)[i] + print('map: {}'.format(cls_aps.mean(0).mean())) + detail['AV2Map_{}/mAP'.format(metric)] = cls_aps.mean(0).mean() + + for i, name in enumerate(self.MAPCLASSES): + for j, thr in enumerate(thresholds): + if metric == 'chamfer': + detail['AV2Map_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] + elif metric == 'iou': + if thr == 0.5 or thr == 0.75: + detail['AV2Map_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] + + return detail + + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + result_names=('pts_bbox'), + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + show (bool): Whether to visualize. + Default: False. 
+ out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Results of each evaluation metric. + """ + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print('Evaluating bboxes of {}'.format(name)) + ret_dict = self._evaluate_single(result_files[name], metric=metric) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files, metric=metric) + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show: + self.show(results, out_dir, pipeline=pipeline) + return results_dict + + +def output_to_vecs(detection): + box3d = detection['boxes_3d'].numpy() + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + pts = detection['pts_3d'].numpy() + + vec_list = [] + # import pdb;pdb.set_trace() + for i in range(box3d.shape[0]): + vec = dict( + bbox = box3d[i], # xyxy + label=labels[i], + score=scores[i], + pts=pts[i], + ) + vec_list.append(vec) + return vec_list + +def sample_pts_from_line(line, + fixed_num=-1, + sample_dist=1, + normalize=False, + patch_size=None, + padding=False, + num_samples=250,): + if fixed_num < 0: + distances = np.arange(0, line.length, sample_dist) + if line.has_z: + sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 3) + else: + sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + else: + # fixed number of points, so distance is line.length / fixed_num + distances = np.linspace(0, line.length, fixed_num) + if line.has_z: + sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 3) + else: + sampled_points = np.array([list(line.interpolate(distance).coords) for 
distance in distances]).reshape(-1, 2) + + if normalize: + sampled_points[:,:2] = sampled_points[:,:2] / np.array([patch_size[1], patch_size[0]]) + + num_valid = len(sampled_points) + + if not padding or fixed_num > 0: + # fixed num sample can return now! + return sampled_points, num_valid + + # fixed distance sampling need padding! + num_valid = len(sampled_points) + + if fixed_num < 0: + if num_valid < num_samples: + padding = np.zeros((num_samples - len(sampled_points), sampled_points.shape[-1])) + sampled_points = np.concatenate([sampled_points, padding], axis=0) + else: + sampled_points = sampled_points[:num_samples, :] + num_valid = num_samples + + if normalize: + sampled_points[:,:2] = sampled_points[:,:2] / np.array([patch_size[1], patch_size[0]]) + num_valid = len(sampled_points) + + return sampled_points[:,:2], num_valid \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/builder.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..ba09708be83bca72fcbffa6ad057f7d1bbc59f3b --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/builder.py @@ -0,0 +1,146 @@ + +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import platform +import random +from functools import partial + +import numpy as np +from mmcv.parallel import collate +from mmcv.runner import get_dist_info +from mmcv.utils import Registry, build_from_cfg +from torch.utils.data import DataLoader + +from mmdet.datasets.samplers import GroupSampler +from projects.mmdet3d_plugin.datasets.samplers.group_sampler import DistributedGroupSampler +from projects.mmdet3d_plugin.datasets.samplers.distributed_sampler import DistributedSampler +from projects.mmdet3d_plugin.datasets.samplers.sampler import build_sampler + +def build_dataloader(dataset, + samples_per_gpu, + workers_per_gpu, + num_gpus=1, + dist=True, + shuffle=True, + seed=None, + shuffler_sampler=None, + nonshuffler_sampler=None, + **kwargs): + """Build PyTorch DataLoader. + In distributed training, each GPU/process has a dataloader. + In non-distributed training, there is only one dataloader for all GPUs. + Args: + dataset (Dataset): A PyTorch dataset. + samples_per_gpu (int): Number of training samples on each GPU, i.e., + batch size of each GPU. + workers_per_gpu (int): How many subprocesses to use for data loading + for each GPU. + num_gpus (int): Number of GPUs. Only used in non-distributed training. + dist (bool): Distributed training/test or not. Default: True. + shuffle (bool): Whether to shuffle the data at every epoch. + Default: True. + kwargs: any keyword argument to be used to initialize DataLoader + Returns: + DataLoader: A PyTorch dataloader. 
+ """ + rank, world_size = get_dist_info() + if dist: + # DistributedGroupSampler will definitely shuffle the data to satisfy + # that images on each GPU are in the same group + if shuffle: + sampler = build_sampler(shuffler_sampler if shuffler_sampler is not None else dict(type='DistributedGroupSampler'), + dict( + dataset=dataset, + samples_per_gpu=samples_per_gpu, + num_replicas=world_size, + rank=rank, + seed=seed) + ) + + else: + sampler = build_sampler(nonshuffler_sampler if nonshuffler_sampler is not None else dict(type='DistributedSampler'), + dict( + dataset=dataset, + num_replicas=world_size, + rank=rank, + shuffle=shuffle, + seed=seed) + ) + + batch_size = samples_per_gpu + num_workers = workers_per_gpu + else: + # assert False, 'not support in bevformer' + print('WARNING!!!!, Only can be used for obtain inference speed!!!!') + sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None + batch_size = num_gpus * samples_per_gpu + num_workers = num_gpus * workers_per_gpu + + init_fn = partial( + worker_init_fn, num_workers=num_workers, rank=rank, + seed=seed) if seed is not None else None + + data_loader = DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + pin_memory=True, #3 + worker_init_fn=init_fn, + **kwargs) + + return data_loader + + +def worker_init_fn(worker_id, num_workers, rank, seed): + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) + + +# Copyright (c) OpenMMLab. All rights reserved. 
+import platform +from mmcv.utils import Registry, build_from_cfg + +from mmdet.datasets import DATASETS +from mmdet.datasets.builder import _concat_dataset + +if platform.system() != 'Windows': + # https://github.com/pytorch/pytorch/issues/973 + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + base_soft_limit = rlimit[0] + hard_limit = rlimit[1] + soft_limit = min(max(4096, base_soft_limit), hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + +OBJECTSAMPLERS = Registry('Object sampler') + + +def custom_build_dataset(cfg, default_args=None): + from mmdet3d.datasets.dataset_wrappers import CBGSDataset + from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset, + ConcatDataset, RepeatDataset) + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([custom_build_dataset(c, default_args) for c in cfg]) + elif cfg['type'] == 'ConcatDataset': + dataset = ConcatDataset( + [custom_build_dataset(c, default_args) for c in cfg['datasets']], + cfg.get('separate_eval', True)) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + custom_build_dataset(cfg['dataset'], default_args), cfg['times']) + elif cfg['type'] == 'ClassBalancedDataset': + dataset = ClassBalancedDataset( + custom_build_dataset(cfg['dataset'], default_args), cfg['oversample_thr']) + elif cfg['type'] == 'CBGSDataset': + dataset = CBGSDataset(custom_build_dataset(cfg['dataset'], default_args)) + elif isinstance(cfg.get('ann_file'), (list, tuple)): + dataset = _concat_dataset(cfg, default_args) + else: + dataset = build_from_cfg(cfg, DATASETS, default_args) + + return dataset diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..82db209d7a4017ee5cc9740180931416a8724f85 --- /dev/null +++ 
b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/__init__.py @@ -0,0 +1 @@ +# from .CD_loss import MyChamferDistance \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/mean_ap.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/mean_ap.py new file mode 100644 index 0000000000000000000000000000000000000000..c868be978539aa4b0c676b9bc3fddf98766b6083 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/mean_ap.py @@ -0,0 +1,350 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from multiprocessing import Pool +from shapely.geometry import LineString, Polygon +import mmcv +import numpy as np +from mmcv.utils import print_log +from terminaltables import AsciiTable +import json +from os import path as osp +import os +from functools import partial +from .tpfp import custom_tpfp_gen + +def average_precision(recalls, precisions, mode='area'): + """Calculate average precision (for single or multiple scales). 
+ + Args: + recalls (ndarray): shape (num_scales, num_dets) or (num_dets, ) + precisions (ndarray): shape (num_scales, num_dets) or (num_dets, ) + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or ndarray: calculated average precision + """ + no_scale = False + if recalls.ndim == 1: + no_scale = True + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + assert recalls.shape == precisions.shape and recalls.ndim == 2 + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + if no_scale: + ap = ap[0] + return ap + +def get_cls_results(gen_results, + annotations, + num_sample=100, + num_pred_pts_per_instance=30, + eval_use_same_gt_sample_num_flag=False, + class_id=0, + fix_interval=False): + """Get det results and gt information of a certain class. + + Args: + gen_results (list[list]): Same as `eval_map()`. + annotations (list[dict]): Same as `eval_map()`. + class_id (int): ID of a specific class. 
+ + Returns: + tuple[list[np.ndarray]]: detected bboxes, gt bboxes + """ + # if len(gen_results) == 0 or + + cls_gens, cls_scores = [], [] + for res in gen_results['vectors']: + if res['type'] == class_id: + if len(res['pts']) < 2: + continue + if not eval_use_same_gt_sample_num_flag: + sampled_points = np.array(res['pts']) + else: + line = res['pts'] + line = LineString(line) + + if fix_interval: + distances = list(np.arange(1., line.length, 1.)) + distances = [0,] + distances + [line.length,] + sampled_points = np.array([list(line.interpolate(distance).coords) + for distance in distances]).reshape(-1, 2) + else: + distances = np.linspace(0, line.length, num_sample) + sampled_points = np.array([list(line.interpolate(distance).coords) + for distance in distances]).reshape(-1, 2) + + cls_gens.append(sampled_points) + cls_scores.append(res['confidence_level']) + num_res = len(cls_gens) + if num_res > 0: + cls_gens = np.stack(cls_gens).reshape(num_res,-1) + cls_scores = np.array(cls_scores)[:,np.newaxis] + cls_gens = np.concatenate([cls_gens,cls_scores],axis=-1) + # print(f'for class {i}, cls_gens has shape {cls_gens.shape}') + else: + if not eval_use_same_gt_sample_num_flag: + cls_gens = np.zeros((0,num_pred_pts_per_instance*2+1)) + else: + cls_gens = np.zeros((0,num_sample*2+1)) + # print(f'for class {i}, cls_gens has shape {cls_gens.shape}') + + cls_gts = [] + for ann in annotations['vectors']: + if ann['type'] == class_id: + # line = ann['pts'] + np.array((1,1)) # for hdmapnet + line = ann['pts'] + # line = ann['pts'].cumsum(0) + line = LineString(line) + distances = np.linspace(0, line.length, num_sample) + sampled_points = np.array([list(line.interpolate(distance).coords) + for distance in distances]).reshape(-1, 2) + + cls_gts.append(sampled_points) + num_gts = len(cls_gts) + if num_gts > 0: + cls_gts = np.stack(cls_gts).reshape(num_gts,-1) + else: + cls_gts = np.zeros((0,num_sample*2)) + return cls_gens, cls_gts + # ones = np.ones((num_gts,1)) + # tmp_cls_gens 
= np.concatenate([cls_gts,ones],axis=-1) + # return tmp_cls_gens, cls_gts + +def format_res_gt_by_classes(result_path, + gen_results, + annotations, + cls_names=None, + num_pred_pts_per_instance=30, + eval_use_same_gt_sample_num_flag=False, + pc_range=(-15.0, -30.0, -5.0, 15.0, 30.0, 3.0), + nproc=24): + assert cls_names is not None + timer = mmcv.Timer() + num_fixed_sample_pts = 100 + fix_interval = False + print('results path: {}'.format(result_path)) + + output_dir = osp.join(*osp.split(result_path)[:-1]) + assert len(gen_results) == len(annotations) + + pool = Pool(nproc) + cls_gens, cls_gts = {}, {} + print('Formatting ...') + formatting_file = 'cls_formatted.pkl' + formatting_file = osp.join(output_dir,formatting_file) + + for i, clsname in enumerate(cls_names): + + gengts = pool.starmap( + partial(get_cls_results, num_sample=num_fixed_sample_pts, + num_pred_pts_per_instance=num_pred_pts_per_instance, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,class_id=i,fix_interval=fix_interval), + zip(gen_results, annotations)) + # gengts = map(partial(get_cls_results, num_sample=num_fixed_sample_pts, class_id=i,fix_interval=fix_interval), + # zip(gen_results, annotations)) + # import pdb;pdb.set_trace() + gens, gts = tuple(zip(*gengts)) + cls_gens[clsname] = gens + cls_gts[clsname] = gts + + mmcv.dump([cls_gens, cls_gts],formatting_file) + print('Cls data formatting done in {:2f}s!! 
with {}'.format(float(timer.since_start()),formatting_file)) + pool.close() + return cls_gens, cls_gts + +def eval_map(gen_results, + annotations, + cls_gens, + cls_gts, + threshold=0.5, + cls_names=None, + logger=None, + tpfp_fn=None, + pc_range=(-15.0, -30.0, -5.0, 15.0, 30.0, 3.0), + metric=None, + num_pred_pts_per_instance=30, + nproc=24): + timer = mmcv.Timer() + pool = Pool(nproc) + + eval_results = [] + + for i, clsname in enumerate(cls_names): + + # get gt and det bboxes of this class + cls_gen = cls_gens[clsname] + cls_gt = cls_gts[clsname] + # choose proper function according to datasets to compute tp and fp + # XXX + # func_name = cls2func[clsname] + # tpfp_fn = tpfp_fn_dict[tpfp_fn_name] + tpfp_fn = custom_tpfp_gen + # Trick for serialized + # only top-level function can be serized + # somehow use partitial the return function is defined + # at the top level. + + # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold, metric=metric) + # import pdb; pdb.set_trace() + # TODO this is a hack + tpfp_fn = partial(tpfp_fn, threshold=threshold, metric=metric) + args = [] + # compute tp and fp for each image with multiple processes + tpfp = pool.starmap( + tpfp_fn, + zip(cls_gen, cls_gt, *args)) + # import pdb;pdb.set_trace() + tp, fp = tuple(zip(*tpfp)) + + + + # map_results = map( + # tpfp_fn, + # cls_gen, cls_gt) + # tp, fp = tuple(map(list, zip(*map_results))) + + + # debug and testing + # for i in range(len(cls_gen)): + # # print(i) + # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold) + # print(i) + # tpfp = (tpfp,) + # print(tpfp) + # i = 0 + # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold) + # import pdb; pdb.set_trace() + + # XXX + + num_gts = 0 + for j, bbox in enumerate(cls_gt): + num_gts += bbox.shape[0] + + # sort all det bboxes by score, also sort tp and fp + # import pdb;pdb.set_trace() + cls_gen = np.vstack(cls_gen) + num_dets = cls_gen.shape[0] + sort_inds = np.argsort(-cls_gen[:, -1]) #descending, high score front + tp = 
np.hstack(tp)[sort_inds] + fp = np.hstack(fp)[sort_inds] + + # calculate recall and precision with tp and fp + # num_det*num_res + tp = np.cumsum(tp, axis=0) + fp = np.cumsum(fp, axis=0) + eps = np.finfo(np.float32).eps + recalls = tp / np.maximum(num_gts, eps) + precisions = tp / np.maximum((tp + fp), eps) + + # calculate AP + # if dataset != 'voc07' else '11points' + mode = 'area' + ap = average_precision(recalls, precisions, mode) + eval_results.append({ + 'num_gts': num_gts, + 'num_dets': num_dets, + 'recall': recalls, + 'precision': precisions, + 'ap': ap + }) + print('cls:{} done in {:2f}s!!'.format(clsname,float(timer.since_last_check()))) + pool.close() + aps = [] + for cls_result in eval_results: + if cls_result['num_gts'] > 0: + aps.append(cls_result['ap']) + mean_ap = np.array(aps).mean().item() if len(aps) else 0.0 + + print_map_summary( + mean_ap, eval_results, class_name=cls_names, logger=logger) + + return mean_ap, eval_results + + + +def print_map_summary(mean_ap, + results, + class_name=None, + scale_ranges=None, + logger=None): + """Print mAP and results of each class. + + A table will be printed to show the gts/dets/recall/AP of each class and + the mAP. + + Args: + mean_ap (float): Calculated from `eval_map()`. + results (list[dict]): Calculated from `eval_map()`. + dataset (list[str] | str | None): Dataset name or dataset classes. + scale_ranges (list[tuple] | None): Range of scales to be evaluated. + logger (logging.Logger | str | None): The way to print the mAP + summary. See `mmcv.utils.print_log()` for details. Default: None. 
+ """ + + if logger == 'silent': + return + + if isinstance(results[0]['ap'], np.ndarray): + num_scales = len(results[0]['ap']) + else: + num_scales = 1 + + if scale_ranges is not None: + assert len(scale_ranges) == num_scales + + num_classes = len(results) + + recalls = np.zeros((num_scales, num_classes), dtype=np.float32) + aps = np.zeros((num_scales, num_classes), dtype=np.float32) + num_gts = np.zeros((num_scales, num_classes), dtype=int) + for i, cls_result in enumerate(results): + if cls_result['recall'].size > 0: + recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1] + aps[:, i] = cls_result['ap'] + num_gts[:, i] = cls_result['num_gts'] + + label_names = class_name + + if not isinstance(mean_ap, list): + mean_ap = [mean_ap] + + header = ['class', 'gts', 'dets', 'recall', 'ap'] + for i in range(num_scales): + if scale_ranges is not None: + print_log(f'Scale range {scale_ranges[i]}', logger=logger) + table_data = [header] + for j in range(num_classes): + row_data = [ + label_names[j], num_gts[i, j], results[j]['num_dets'], + f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}' + ] + table_data.append(row_data) + table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}']) + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/tpfp.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/tpfp.py new file mode 100644 index 0000000000000000000000000000000000000000..00ed7ab05c7579a861eb9c8e45eb4a44db9ac71d --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/tpfp.py @@ -0,0 +1,82 @@ +import mmcv +import numpy as np + +from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps +from .tpfp_chamfer import custom_polyline_score +from shapely.geometry import LineString, Polygon + + +def custom_tpfp_gen(gen_lines, + gt_lines, + threshold=0.5, + metric='chamfer'): + """Check if detected 
bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + if metric == 'chamfer': + if threshold >0: + threshold= -threshold + # else: + # raise NotImplementedError + + # import pdb;pdb.set_trace() + num_gens = gen_lines.shape[0] + num_gts = gt_lines.shape[0] + + # tp and fp + tp = np.zeros((num_gens), dtype=np.float32) + fp = np.zeros((num_gens), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if num_gts == 0: + fp[...] 
= 1 + return tp, fp + + if num_gens == 0: + return tp, fp + + gen_scores = gen_lines[:,-1] # n + # distance matrix: n x m + + matrix = custom_polyline_score( + gen_lines[:,:-1].reshape(num_gens,-1,2), + gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric) + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-gen_scores) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/tpfp_chamfer.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/tpfp_chamfer.py new file mode 100644 index 0000000000000000000000000000000000000000..16ebf40633a88a321978de4907e9305639a64cfd --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/tpfp_chamfer.py @@ -0,0 +1,127 @@ +# from ..chamfer_dist import ChamferDistance +import numpy as np +from shapely.geometry import LineString, Polygon +from shapely.strtree import STRtree +from shapely.geometry import CAP_STYLE, JOIN_STYLE +from scipy.spatial import distance + + +def custom_polyline_score(pred_lines, gt_lines, linewidth=1., metric='chamfer'): + ''' + each line with 1 meter width + pred_lines: num_preds, List [npts, 2] + gt_lines: num_gts, npts, 2 + gt_mask: num_gts, npts, 2 + ''' + if metric == 'iou': + linewidth = 1.0 + positive_threshold = 1. 
+ num_preds = len(pred_lines) + num_gts = len(gt_lines) + line_length = pred_lines.shape[1] + + # gt_lines = gt_lines + np.array((1.,1.)) + + pred_lines_shapely = \ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in pred_lines] + gt_lines_shapely =\ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in gt_lines] + + # construct tree + tree = STRtree(pred_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) + + + if metric=='chamfer': + chamfer_matrix = np.full((num_preds, num_gts), -100.) + elif metric=='iou': + chamfer_matrix = np.zeros((num_preds, num_gts),dtype=np.float64) + else: + raise NotImplementedError + + for i, gt_line in enumerate(gt_lines_shapely): + + for o in tree.query(gt_line): + if o.intersects(gt_line): + pred_id = index_by_id[id(o)] + + if metric=='chamfer': + dist_mat = distance.cdist( + pred_lines[pred_id], gt_lines[i], 'euclidean') + # import pdb;pdb.set_trace() + valid_ab = dist_mat.min(-1).mean() + valid_ba = dist_mat.min(-2).mean() + + chamfer_matrix[pred_id, i] = -(valid_ba+valid_ab)/2 + elif metric=='iou': + inter = o.intersection(gt_line).area + union = o.union(gt_line).area + chamfer_matrix[pred_id, i] = inter / union + + return chamfer_matrix + +if __name__ == '__main__': + import torch + + line1 = torch.tensor([ + [1, 5], [3, 5], [5, 5] + ]) + + line0 = torch.tensor([ + [3, 6], [4, 8], [5, 6] + ]) + + line2 = torch.tensor([ + [1, 4], [3, 4], [5, 4] + ]) + + line3 = torch.tensor([ + [4, 4], [3, 3], [5, 3] + ]) + + gt = torch.stack((line2, line3), dim=0).type(torch.float32) + pred = torch.stack((line0, line1), dim=0).type(torch.float32) + + # import ipdb; ipdb.set_trace() + import mmcv + # with mmcv.Timer(): + # gt = upsampler(gt, pts=10) + # pred = upsampler(pred, pts=10) + + import matplotlib.pyplot as plt + from shapely.geometry import LineString + from descartes import PolygonPatch + + 
iou_matrix = vec_iou(pred,gt) + print(iou_matrix) + # import pdb;pdb.set_trace() + score_matrix = custom_polyline_score(pred, gt, linewidth=1., metric='chamfer') + print(score_matrix) + fig, ax = plt.subplots() + for i in gt: + i = i.numpy() + plt.plot(i[:, 0], i[:, 1], 'o', color='red') + plt.plot(i[:, 0], i[:, 1], '-', color='red') + + dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round) + patch1 = PolygonPatch(dilated, fc='red', ec='red', alpha=0.5, zorder=-1) + ax.add_patch(patch1) + + for i in pred: + i = i.numpy() + plt.plot(i[:, 0], i[:, 1], 'o', color='blue') + plt.plot(i[:, 0], i[:, 1], '-', color='blue') + + dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + patch1 = PolygonPatch(dilated, fc='blue', ec='blue', alpha=0.5, zorder=-1) + ax.add_patch(patch1) + + + ax.axis('equal') + + + plt.savefig('test3.png') \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscenes_dataset.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscenes_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..02913ab15d03e8762d6eb4eec3b72b2890dd7432 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscenes_dataset.py @@ -0,0 +1,258 @@ +import copy + +import numpy as np +from mmdet.datasets import DATASETS +from mmdet3d.datasets import NuScenesDataset +import mmcv +from os import path as osp +from mmdet.datasets import DATASETS +import torch +import numpy as np +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from .nuscnes_eval import NuScenesEval_custom +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from mmcv.parallel import DataContainer as DC +import random + + +@DATASETS.register_module() +class CustomNuScenesDataset(NuScenesDataset): + r"""NuScenes Dataset. + + This datset only add camera intrinsics and extrinsics to the results. 
+ """ + + def __init__(self, queue_length=4, bev_size=(200, 200), overlap_test=False, *args, **kwargs): + super().__init__(*args, **kwargs) + self.queue_length = queue_length + self.overlap_test = overlap_test + self.bev_size = bev_size + + def prepare_train_data(self, index): + """ + Training data preparation. + Args: + index (int): Index for accessing the target data. + Returns: + dict: Training data dict of the corresponding index. + """ + data_queue = [] + + # temporal aug + prev_indexs_list = list(range(index-self.queue_length, index)) + random.shuffle(prev_indexs_list) + prev_indexs_list = sorted(prev_indexs_list[1:], reverse=True) + ## + + input_dict = self.get_data_info(index) + if input_dict is None: + return None + frame_idx = input_dict['frame_idx'] + scene_token = input_dict['scene_token'] + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + if self.filter_empty_gt and \ + (example is None or ~(example['gt_labels_3d']._data != -1).any()): + return None + data_queue.insert(0, example) + for i in prev_indexs_list: + i = max(0, i) + input_dict = self.get_data_info(i) + if input_dict is None: + return None + if input_dict['frame_idx'] < frame_idx and input_dict['scene_token'] == scene_token: + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + if self.filter_empty_gt and \ + (example is None or ~(example['gt_labels_3d']._data != -1).any()): + return None + frame_idx = input_dict['frame_idx'] + data_queue.insert(0, copy.deepcopy(example)) + return self.union2one(data_queue) + + def union2one(self, queue): + """ + convert sample queue into one single sample. 
+ """ + imgs_list = [each['img'].data for each in queue] + metas_map = {} + prev_pos = None + prev_angle = None + for i, each in enumerate(queue): + metas_map[i] = each['img_metas'].data + if i == 0: + metas_map[i]['prev_bev'] = False + prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] = 0 + metas_map[i]['can_bus'][-1] = 0 + else: + metas_map[i]['prev_bev'] = True + tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] -= prev_pos + metas_map[i]['can_bus'][-1] -= prev_angle + prev_pos = copy.deepcopy(tmp_pos) + prev_angle = copy.deepcopy(tmp_angle) + + queue[-1]['img'] = DC(torch.stack(imgs_list), + cpu_only=False, stack=True) + queue[-1]['img_metas'] = DC(metas_map, cpu_only=True) + queue = queue[-1] + return queue + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - sweeps (list[dict]): Infos of sweeps. + - timestamp (float): Sample timestamp. + - img_filename (str, optional): Image filename. + - lidar2img (list[np.ndarray], optional): Transformations \ + from lidar to different cameras. + - ann_info (dict): Annotation info. 
+ """ + info = self.data_infos[index] + # standard protocal modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info['token'], + pts_filename=info['lidar_path'], + sweeps=info['sweeps'], + ego2global_translation=info['ego2global_translation'], + ego2global_rotation=info['ego2global_rotation'], + prev_idx=info['prev'], + next_idx=info['next'], + scene_token=info['scene_token'], + can_bus=info['can_bus'], + frame_idx=info['frame_idx'], + timestamp=info['timestamp'] / 1e6, + ) + + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + lidar2cam_rts = [] + cam_intrinsics = [] + for cam_type, cam_info in info['cams'].items(): + image_paths.append(cam_info['data_path']) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + lidar2img_rts.append(lidar2img_rt) + + cam_intrinsics.append(viewpad) + lidar2cam_rts.append(lidar2cam_rt.T) + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + cam_intrinsic=cam_intrinsics, + lidar2cam=lidar2cam_rts, + )) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + + rotation = Quaternion(input_dict['ego2global_rotation']) + translation = input_dict['ego2global_translation'] + can_bus = input_dict['can_bus'] + can_bus[:3] = translation + can_bus[3:7] = rotation + patch_angle = quaternion_yaw(rotation) / np.pi * 180 + if patch_angle < 0: + patch_angle += 360 + can_bus[-2] = patch_angle / 180 * np.pi + can_bus[-1] = patch_angle + + return input_dict + + def __getitem__(self, idx): + """Get item from infos according to the given index. 
def _evaluate_single(self,
                     result_path,
                     logger=None,
                     metric='bbox',
                     result_name='pts_bbox'):
    """Run the custom nuScenes evaluator on a single result file.

    Args:
        result_path (str): Path of the result file.
        logger (logging.Logger | str | None): Accepted for interface
            compatibility; not read inside this method. Default: None.
        metric (str): Accepted for interface compatibility; not read
            inside this method. Default: 'bbox'.
        result_name (str): Prefix of every key in the returned dict.
            Default: 'pts_bbox'.

    Returns:
        dict: Flat mapping from metric name to metric value.

    Raises:
        KeyError: If ``self.version`` is neither 'v1.0-mini' nor
            'v1.0-trainval'.
    """
    from nuscenes import NuScenes

    split_by_version = {
        'v1.0-mini': 'mini_val',
        'v1.0-trainval': 'val',
    }

    self.nusc = NuScenes(version=self.version, dataroot=self.data_root,
                         verbose=True)

    # Directory part of the result path; the metrics summary is read back
    # from this same directory after the evaluator has run.
    output_dir = osp.dirname(result_path)

    self.nusc_eval = NuScenesEval_custom(
        self.nusc,
        config=self.eval_detection_configs,
        result_path=result_path,
        eval_set=split_by_version[self.version],
        output_dir=output_dir,
        verbose=True,
        overlap_test=self.overlap_test,
        data_infos=self.data_infos
    )
    self.nusc_eval.main(plot_examples=0, render_curves=False)

    summary = mmcv.load(osp.join(output_dir, 'metrics_summary.json'))
    metric_prefix = f'{result_name}_NuScenes'

    def _four_decimals(value):
        # Round-trip through text so values are cut to four decimals.
        return float('{:.4f}'.format(value))

    detail = dict()
    for name in self.CLASSES:
        for dist, ap in summary['label_aps'][name].items():
            key = '{}/{}_AP_dist_{}'.format(metric_prefix, name, dist)
            detail[key] = _four_decimals(ap)
        for err_name, err in summary['label_tp_errors'][name].items():
            key = '{}/{}_{}'.format(metric_prefix, name, err_name)
            detail[key] = _four_decimals(err)
        # Class-independent errors; rewritten with the same values on
        # every pass of the outer loop.
        for err_name, err in summary['tp_errors'].items():
            key = '{}/{}'.format(metric_prefix,
                                 self.ErrNameMapping[err_name])
            detail[key] = _four_decimals(err)

    detail['{}/NDS'.format(metric_prefix)] = summary['nd_score']
    detail['{}/mAP'.format(metric_prefix)] = summary['mean_ap']
    return detail
def add_translation_noise(extrinsics, std=0.01, mean=0.0):
    """Add random noise to the translation part of an extrinsic matrix.

    Three values are drawn with ``torch.normal(mean, std)`` and added to
    the first three rows of the last column of ``extrinsics``.

    Args:
        extrinsics (np.ndarray): Matrix modified in place.
        std (float): Standard deviation of the noise. Default: 0.01.
        mean (float): Mean of the noise. Default: 0.0.

    Returns:
        np.ndarray: The same ``extrinsics`` object that was passed in.
    """
    jitter = torch.normal(mean, std=std, size=(3,)).numpy()
    extrinsics[:3, -1] += jitter
    return extrinsics
@property
def bbox(self):
    """Axis-aligned bounds of every instance, clamped to the patch.

    Returns:
        torch.Tensor: float32 tensor with one row per instance holding
        ``instance.bounds`` (xmin, ymin, xmax, ymax). x values are
        clamped to [-max_x, max_x] and y values to [-max_y, max_y].
    """
    assert len(self.instance_list) != 0
    bounds = np.array([geom.bounds for geom in self.instance_list])
    boxes = to_tensor(bounds).to(dtype=torch.float32)
    # Columns alternate x, y, x, y, so the limits alternate as well.
    limits = (self.max_x, self.max_y, self.max_x, self.max_y)
    for col, limit in enumerate(limits):
        boxes[:, col] = torch.clamp(boxes[:, col], min=-limit, max=limit)
    return boxes
@property
def fixed_num_sampled_points_ambiguity(self):
    """Resample each instance to ``fixed_num`` points, with an extra axis.

    Every instance is sampled at ``fixed_num`` evenly spaced distances
    along its length, the coordinates are clamped to the patch limits,
    and a singleton dimension is inserted at position 1.

    Returns:
        torch.Tensor: float32 tensor; for 2-D geometries the shape is
        [N, 1, fixed_num, 2], where N is the number of instances.
    """
    assert len(self.instance_list) != 0

    def _resample(geom):
        stations = np.linspace(0, geom.length, self.fixed_num)
        coords = [list(geom.interpolate(station).coords)
                  for station in stations]
        return np.array(coords).reshape(-1, 2)

    stacked = np.array([_resample(geom) for geom in self.instance_list])
    points = to_tensor(stacked).to(dtype=torch.float32)
    points[:, :, 0] = torch.clamp(points[:, :, 0],
                                  min=-self.max_x, max=self.max_x)
    points[:, :, 1] = torch.clamp(points[:, :, 1],
                                  min=-self.max_y, max=self.max_y)
    return points.unsqueeze(1)
to_tensor(np.array(list(instance.coords))) + poly_pts = poly_pts.unsqueeze(0).permute(0,2,1) + sampled_pts = torch.nn.functional.interpolate(poly_pts,size=(self.fixed_num),mode='linear',align_corners=True) + sampled_pts = sampled_pts.permute(0,2,1).squeeze(0) + instance_points_list.append(sampled_pts) + # instance_points_array = np.array(instance_points_list) + # instance_points_tensor = to_tensor(instance_points_array) + instance_points_tensor = torch.stack(instance_points_list,dim=0) + instance_points_tensor = instance_points_tensor.to( + dtype=torch.float32) + instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) + instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) + return instance_points_tensor + + @property + def shift_fixed_num_sampled_points(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + fixed_num = fixed_num_pts.shape[0] + shift_pts_list = [] + if is_poly: + # import pdb;pdb.set_trace() + for shift_right_i in range(fixed_num): + shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = 
@property
def shift_fixed_num_sampled_points_v1(self):
    """Enumerate equivalent point orderings of every sampled instance.

    An instance whose first and last points are equal is treated as a
    closed ring: the duplicate end point is dropped, the remaining points
    are rolled by 0 .. pts_num - 2 positions, and each rolled copy is
    closed again by repeating its first point. Any other instance yields
    its points in forward and reversed order, followed by rows filled
    with ``self.padding_value`` so that it also has pts_num - 1 rows.

    Returns:
        torch.Tensor: float32 tensor of shape
        [instances_num, pts_num - 1, pts_num, 2].
    """
    per_instance = []
    for pts in self.fixed_num_sampled_points:
        pts_num = pts.shape[0]
        shift_num = pts_num - 1
        closed = pts[0].equal(pts[-1])

        if closed:
            ring = pts[:-1, :]
            rolled = torch.stack(
                [ring.roll(step, 0) for step in range(shift_num)], dim=0)
            variants = rolled.new_zeros(
                (shift_num, pts_num, rolled.shape[2]))
            variants[:, :-1, :] = rolled
            # Close each rolled ring with its own first point.
            variants[:, -1, :] = rolled[:, 0, :]
        else:
            variants = torch.stack([pts, pts.flip(0)], dim=0)

        variants[:, :, 0] = torch.clamp(variants[:, :, 0],
                                        min=-self.max_x, max=self.max_x)
        variants[:, :, 1] = torch.clamp(variants[:, :, 1],
                                        min=-self.max_y, max=self.max_y)

        if not closed:
            filler = torch.full(
                [shift_num - variants.shape[0], pts_num, 2],
                self.padding_value)
            variants = torch.cat([variants, filler], dim=0)
        per_instance.append(variants)

    return torch.stack(per_instance, dim=0).to(dtype=torch.float32)
num_shifts, fixed_num, 2] + """ + assert len(self.instance_list) != 0 + instances_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + poly_pts = np.array(list(instance.coords)) + start_pts = poly_pts[0] + end_pts = poly_pts[-1] + is_poly = np.equal(start_pts, end_pts) + is_poly = is_poly.all() + shift_pts_list = [] + pts_num, coords_num = poly_pts.shape + shift_num = pts_num - 1 + final_shift_num = self.fixed_num - 1 + if is_poly: + pts_to_shift = poly_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + # import pdb;pdb.set_trace() + else: + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + flip_sampled_points = np.flip(sampled_points, axis=0) + shift_pts_list.append(sampled_points) + shift_pts_list.append(flip_sampled_points) + + multi_shifts_pts = np.stack(shift_pts_list,axis=0) + shifts_num,_,_ = multi_shifts_pts.shape + + if shifts_num > final_shift_num: + index = np.random.choice(multi_shifts_pts.shape[0], final_shift_num, replace=False) + multi_shifts_pts = multi_shifts_pts[index] + + multi_shifts_pts_tensor = to_tensor(multi_shifts_pts) + multi_shifts_pts_tensor = multi_shifts_pts_tensor.to( + dtype=torch.float32) + + multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x) + multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y) + # if not is_poly: + if multi_shifts_pts_tensor.shape[0] < final_shift_num: + padding 
= torch.full([final_shift_num-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value) + multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0) + instances_list.append(multi_shifts_pts_tensor) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v3(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + assert len(self.instance_list) != 0 + instances_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + poly_pts = np.array(list(instance.coords)) + start_pts = poly_pts[0] + end_pts = poly_pts[-1] + is_poly = np.equal(start_pts, end_pts) + is_poly = is_poly.all() + shift_pts_list = [] + pts_num, coords_num = poly_pts.shape + shift_num = pts_num - 1 + final_shift_num = self.fixed_num - 1 + if is_poly: + pts_to_shift = poly_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + flip_pts_to_shift = np.flip(pts_to_shift, axis=0) + for shift_right_i in range(shift_num): + shift_pts = np.roll(flip_pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + else: + 
@property
def shift_fixed_num_sampled_points_v4(self):
    """Enumerate rolled and reversed orderings of every sampled instance.

    An instance whose first and last points are equal is treated as a
    closed ring: the duplicate end point is dropped, the remaining points
    are rolled by 0 .. pts_num - 2 positions in the original direction
    and then again in the reversed direction, and each copy is closed by
    repeating its first point. Any other instance yields its points in
    forward and reversed order, followed by rows filled with
    ``self.padding_value`` up to 2 * (pts_num - 1) rows.

    Returns:
        torch.Tensor: float32 tensor of shape
        [instances_num, 2 * (pts_num - 1), pts_num, 2].
    """
    per_instance = []
    for pts in self.fixed_num_sampled_points:
        pts_num = pts.shape[0]
        shift_num = pts_num - 1
        closed = pts[0].equal(pts[-1])

        if closed:
            ring = pts[:-1, :]
            mirrored = ring.flip(0)
            orderings = [ring.roll(step, 0) for step in range(shift_num)]
            orderings += [mirrored.roll(step, 0)
                          for step in range(shift_num)]
            rolled = torch.stack(orderings, dim=0)
            variants = rolled.new_zeros(
                (shift_num * 2, pts_num, rolled.shape[2]))
            variants[:, :-1, :] = rolled
            # Close each ring with its own first point.
            variants[:, -1, :] = rolled[:, 0, :]
        else:
            variants = torch.stack([pts, pts.flip(0)], dim=0)

        variants[:, :, 0] = torch.clamp(variants[:, :, 0],
                                        min=-self.max_x, max=self.max_x)
        variants[:, :, 1] = torch.clamp(variants[:, :, 1],
                                        min=-self.max_y, max=self.max_y)

        if not closed:
            filler = torch.full(
                [shift_num * 2 - variants.shape[0], pts_num, 2],
                self.padding_value)
            variants = torch.cat([variants, filler], dim=0)
        per_instance.append(variants)

    return torch.stack(per_instance, dim=0).to(dtype=torch.float32)
axis=0)) + + multi_shifts_pts = np.stack(shift_pts_list,axis=0) + multi_shifts_pts_tensor = to_tensor(multi_shifts_pts) + multi_shifts_pts_tensor = multi_shifts_pts_tensor.to( + dtype=torch.float32) + + multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x) + multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y) + # if not is_poly: + if multi_shifts_pts_tensor.shape[0] < self.fixed_num: + padding = torch.full([self.fixed_num - multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value) + multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0) + instances_list.append(multi_shifts_pts_tensor) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_torch(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points_torch + instances_list = [] + is_poly = False + + for fixed_num_pts in fixed_num_sampled_points: + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + fixed_num = fixed_num_pts.shape[0] + shift_pts_list = [] + if is_poly: + for shift_right_i in range(fixed_num): + shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( 
def __init__(self,
             dataroot,
             patch_size,
             map_classes=('divider', 'ped_crossing', 'boundary'),
             line_classes=('road_divider', 'lane_divider'),
             ped_crossing_classes=('ped_crossing',),
             contour_classes=('road_segment', 'lane'),
             sample_dist=1,
             num_samples=250,
             padding=False,
             fixed_ptsnum_per_line=-1,
             padding_value=-10000,):
    """Load the nuScenes maps and store the vectorisation settings.

    Args:
        dataroot (str): Passed to ``NuScenesMap`` as ``dataroot``.
        patch_size (tuple): Stored as ``self.patch_size``.
        map_classes (Sequence[str] | str): Stored as ``self.vec_classes``.
        line_classes (Sequence[str] | str): Map layers handled as
            divider lines.
        ped_crossing_classes (Sequence[str] | str): Map layers handled
            as pedestrian crossings.
        contour_classes (Sequence[str] | str): Map layers handled as
            polygons; stored as ``self.polygon_classes``.
        sample_dist (float): Stored as ``self.sample_dist``.
        num_samples (int): Stored as ``self.num_samples``.
        padding (bool): Stored as ``self.padding``.
        fixed_ptsnum_per_line (int): Stored as ``self.fixed_num``;
            -1 means no fixed number of points.
        padding_value (float): Stored as ``self.padding_value``.
    """
    super().__init__()

    def _as_sequence(names):
        # A bare string such as ``('ped_crossing')`` (no trailing comma)
        # is not a tuple. Iterating it yields single characters and
        # ``in`` becomes a substring test, so wrap it in a 1-tuple.
        return (names,) if isinstance(names, str) else names

    self.data_root = dataroot
    self.MAPS = ['boston-seaport', 'singapore-hollandvillage',
                 'singapore-onenorth', 'singapore-queenstown']
    self.vec_classes = _as_sequence(map_classes)
    self.line_classes = _as_sequence(line_classes)
    self.ped_crossing_classes = _as_sequence(ped_crossing_classes)
    self.polygon_classes = _as_sequence(contour_classes)

    # One map object and one explorer per location in ``self.MAPS``.
    self.nusc_maps = {}
    self.map_explorer = {}
    for loc in self.MAPS:
        self.nusc_maps[loc] = NuScenesMap(dataroot=self.data_root,
                                          map_name=loc)
        self.map_explorer[loc] = NuScenesMapExplorer(self.nusc_maps[loc])

    self.patch_size = patch_size
    self.sample_dist = sample_dist
    self.num_samples = num_samples
    self.padding = padding
    self.fixed_num = fixed_ptsnum_per_line
    self.padding_value = padding_value
def get_map_geom(self, patch_box, patch_angle, layer_names, location):
    """Collect the geometries of the requested map layers.

    Each layer name is dispatched to ``get_divider_line``,
    ``get_contour_line`` or ``get_ped_crossing_line`` depending on which
    of ``self.line_classes``, ``self.polygon_classes`` or
    ``self.ped_crossing_classes`` contains it. Names found in none of
    them are skipped.

    Args:
        patch_box: Forwarded unchanged to the layer getters.
        patch_angle: Forwarded unchanged to the layer getters.
        layer_names (Sequence[str] | str): Layers to extract. A single
            string is treated as one layer name.
        location: Forwarded unchanged to the layer getters.

    Returns:
        list[tuple]: ``(layer_name, geoms)`` pairs in request order.
    """
    def _names(value):
        # A bare string would be iterated character by character, and
        # ``in`` on a string is a substring test. For 'ped_crossing'
        # that extracted the same layer once per character.
        return (value,) if isinstance(value, str) else tuple(value)

    line_classes = _names(self.line_classes)
    polygon_classes = _names(self.polygon_classes)
    ped_crossing_classes = _names(self.ped_crossing_classes)

    map_geom = []
    for layer_name in _names(layer_names):
        if layer_name in line_classes:
            geoms = self.get_divider_line(
                patch_box, patch_angle, layer_name, location)
        elif layer_name in polygon_classes:
            geoms = self.get_contour_line(
                patch_box, patch_angle, layer_name, location)
        elif layer_name in ped_crossing_classes:
            geoms = self.get_ped_crossing_line(
                patch_box, patch_angle, location)
        else:
            continue
        map_geom.append((layer_name, geoms))
    return map_geom
+ if line.geom_type == 'MultiLineString': + for single_line in line.geoms: + line_vectors.append(self.sample_pts_from_line(single_line)) + elif line.geom_type == 'LineString': + line_vectors.append(self.sample_pts_from_line(line)) + else: + raise NotImplementedError + return line_vectors + + def _one_type_line_geom_to_instances(self, line_geom): + line_instances = [] + + for line in line_geom: + if not line.is_empty: + if line.geom_type == 'MultiLineString': + for single_line in line.geoms: + line_instances.append(single_line) + elif line.geom_type == 'LineString': + line_instances.append(line) + else: + raise NotImplementedError + return line_instances + + def poly_geoms_to_vectors(self, polygon_geom): + roads = polygon_geom[0][1] + lanes = polygon_geom[1][1] + union_roads = ops.unary_union(roads) + union_lanes = ops.unary_union(lanes) + union_segments = ops.unary_union([union_roads, union_lanes]) + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_vectors(results) + + def ped_poly_geoms_to_instances(self, ped_geom): + ped = ped_geom[0][1] + union_segments = ops.unary_union(ped) + max_x = self.patch_size[1] / 2 + max_y = 
self.patch_size[0] / 2 + local_patch = box(-max_x - 0.2, -max_y - 0.2, max_x + 0.2, max_y + 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_instances(results) + + + def poly_geoms_to_instances(self, polygon_geom): + roads = polygon_geom[0][1] + lanes = polygon_geom[1][1] + union_roads = ops.unary_union(roads) + union_lanes = ops.unary_union(lanes) + union_segments = ops.unary_union([union_roads, union_lanes]) + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + 
def ped_geoms_to_vectors(self, ped_geom):
    """Turn pedestrian-crossing polygons into sampled boundary vectors.

    The polygons in ``ped_geom[0][1]`` are merged with ``unary_union``.
    The exterior ring of each resulting polygon is clipped to the local
    patch (shrunk by 0.2 on every side) and the clipped lines are passed
    to ``_one_type_line_geom_to_vectors``.

    Args:
        ped_geom (list[tuple]): ``(layer_name, geoms)`` pairs; only the
            geometries of the first pair are used.

    Returns:
        list: Whatever ``_one_type_line_geom_to_vectors`` returns for the
        clipped exterior lines.
    """
    union_ped = ops.unary_union(ped_geom[0][1])
    if union_ped.geom_type != 'MultiPolygon':
        union_ped = MultiPolygon([union_ped])

    max_x = self.patch_size[1] / 2
    max_y = self.patch_size[0] / 2
    local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2)

    results = []
    # Go through ``.geoms`` like the sibling *_to_instances methods do.
    # Iterating a multi-part geometry directly is deprecated in Shapely
    # 1.8 and no longer supported in 2.0.
    for ped_poly in union_ped.geoms:
        ext = ped_poly.exterior
        if not ext.is_ccw:
            # Build a reversed ring rather than assigning to
            # ``ext.coords``: Shapely 2.0 geometries are immutable.
            ext = type(ext)(list(ext.coords)[::-1])
        results.append(ext.intersection(local_patch))

    return self._one_type_line_geom_to_vectors(results)
def get_divider_line(self, patch_box, patch_angle, layer_name, location):
    """Collect the lines of one map layer, clipped to a patch, in patch coordinates.

    Args:
        patch_box: Patch description; elements 0 and 1 are used as the patch
            centre (x, y) and the whole value is forwarded to
            ``get_patch_coord``.
        patch_angle: Patch rotation in degrees (``use_radians=False``).
        layer_name: Name of the line layer to read from the map API.
        location: Key into ``self.map_explorer``.

    Returns:
        ``None`` for the ``'traffic_light'`` layer, otherwise a list of
        clipped geometries rotated by ``-patch_angle`` about the patch centre
        and translated so that the centre becomes the origin.

    Raises:
        ValueError: If ``layer_name`` is not in the map API's
            ``non_geometric_line_layers``.
    """
    explorer = self.map_explorer[location]
    map_api = explorer.map_api

    if layer_name not in map_api.non_geometric_line_layers:
        raise ValueError("{} is not a line layer".format(layer_name))
    if layer_name == 'traffic_light':
        return None

    center_x, center_y = patch_box[0], patch_box[1]
    patch = explorer.get_patch_coord(patch_box, patch_angle)
    # Pure translation that moves the patch centre to the origin.
    to_local = [1.0, 0.0, 0.0, 1.0, -center_x, -center_y]

    clipped_lines = []
    for record in getattr(map_api, layer_name):
        line = map_api.extract_line(record['line_token'])
        if line.is_empty:
            # Skip lines without nodes.
            continue

        clipped = line.intersection(patch)
        if clipped.is_empty:
            continue

        rotated = affinity.rotate(clipped, -patch_angle,
                                  origin=(center_x, center_y),
                                  use_radians=False)
        clipped_lines.append(affinity.affine_transform(rotated, to_local))

    return clipped_lines
@classmethod
def get_map_classes(cls, map_classes=None):
    """Resolve the map class names used by this dataset.

    Args:
        map_classes (Sequence[str] | str | None): ``None`` selects the
            class-level ``MAPCLASSES``. A tuple or list is returned as
            given. A string is treated as a file path and read with
            ``mmcv.list_from_file``.

    Returns:
        The resolved class names.

    Raises:
        ValueError: If ``map_classes`` is none of the supported types.
    """
    if map_classes is None:
        return cls.MAPCLASSES
    if isinstance(map_classes, (tuple, list)):
        return map_classes
    if isinstance(map_classes, str):
        # Take it as a file path.
        return mmcv.list_from_file(map_classes)
    raise ValueError(f'Unsupported type {type(map_classes)} of map classes.')
def prepare_train_data(self, index):
    """Build one training sample made of a short queue of frames.

    From the ``queue_length`` indices preceding ``index`` one randomly
    chosen index is dropped; the rest are sorted and ``index`` is appended.
    Negative indices are clamped to 0. Each frame goes through
    ``get_data_info``, ``pre_pipeline``, ``pipeline`` and
    ``vectormap_pipeline``.

    Args:
        index (int): Index of the target (last) frame.

    Returns:
        The merged sample from ``union2one``, or ``None`` when a frame has
        no data info, the pipeline yields ``None``, or (with
        ``filter_empty_gt``) a frame has no label different from -1.
    """
    queue = []

    index_list = list(range(index - self.queue_length, index))
    random.shuffle(index_list)
    index_list = sorted(index_list[1:])
    index_list.append(index)

    for i in index_list:
        i = max(0, i)
        input_dict = self.get_data_info(i)
        if input_dict is None:
            return None
        self.pre_pipeline(input_dict)
        example = self.pipeline(input_dict)
        # Must be checked before vectormap_pipeline, which writes into
        # ``example`` and would fail on None.
        if example is None:
            return None
        example = self.vectormap_pipeline(example, input_dict)
        # ``not`` instead of ``~``: bitwise inversion of a plain bool is
        # always truthy.
        if self.filter_empty_gt and \
                not (example['gt_labels_3d']._data != -1).any():
            return None
        queue.append(example)
    return self.union2one(queue)
+ """ + imgs_list = [each['img'].data for each in queue] + metas_map = {} + prev_scene_token = None + prev_pos = None + prev_angle = None + for i, each in enumerate(queue): + metas_map[i] = each['img_metas'].data + if metas_map[i]['scene_token'] != prev_scene_token: + metas_map[i]['prev_bev_exists'] = False + prev_scene_token = metas_map[i]['scene_token'] + prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] = 0 + metas_map[i]['can_bus'][-1] = 0 + else: + metas_map[i]['prev_bev_exists'] = True + tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] -= prev_pos + metas_map[i]['can_bus'][-1] -= prev_angle + prev_pos = copy.deepcopy(tmp_pos) + prev_angle = copy.deepcopy(tmp_angle) + + queue[-1]['img'] = DC(torch.stack(imgs_list), + cpu_only=False, stack=True) + queue[-1]['img_metas'] = DC(metas_map, cpu_only=True) + queue = queue[-1] + return queue + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - sweeps (list[dict]): Infos of sweeps. + - timestamp (float): Sample timestamp. + - img_filename (str, optional): Image filename. + - lidar2img (list[np.ndarray], optional): Transformations \ + from lidar to different cameras. + - ann_info (dict): Annotation info. 
+ """ + info = self.data_infos[index] + # standard protocal modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info['token'], + pts_filename=info['lidar_path'], + lidar_path=info["lidar_path"], + sweeps=info['sweeps'], + ego2global_translation=info['ego2global_translation'], + ego2global_rotation=info['ego2global_rotation'], + lidar2ego_translation=info['lidar2ego_translation'], + lidar2ego_rotation=info['lidar2ego_rotation'], + prev_idx=info['prev'], + next_idx=info['next'], + scene_token=info['scene_token'], + can_bus=info['can_bus'], + frame_idx=info['frame_idx'], + timestamp=info['timestamp'], + map_location = info['map_location'], + ) + # lidar to ego transform + lidar2ego = np.eye(4).astype(np.float32) + lidar2ego[:3, :3] = Quaternion(info["lidar2ego_rotation"]).rotation_matrix + lidar2ego[:3, 3] = info["lidar2ego_translation"] + input_dict["lidar2ego"] = lidar2ego + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + lidar2cam_rts = [] + cam_intrinsics = [] + input_dict["camera2ego"] = [] + input_dict["camera_intrinsics"] = [] + for cam_type, cam_info in info['cams'].items(): + image_paths.append(cam_info['data_path']) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + lidar2cam_rt_t = lidar2cam_rt.T + + if self.noise == 'rotation': + lidar2cam_rt_t = add_rotation_noise(lidar2cam_rt_t, std=self.noise_std) + elif self.noise == 'translation': + lidar2cam_rt_t = add_translation_noise( + lidar2cam_rt_t, std=self.noise_std) + + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt_t) + lidar2img_rts.append(lidar2img_rt) + + cam_intrinsics.append(viewpad) + 
lidar2cam_rts.append(lidar2cam_rt_t) + + # camera to ego transform + camera2ego = np.eye(4).astype(np.float32) + camera2ego[:3, :3] = Quaternion( + cam_info["sensor2ego_rotation"] + ).rotation_matrix + camera2ego[:3, 3] = cam_info["sensor2ego_translation"] + input_dict["camera2ego"].append(camera2ego) + + # camera intrinsics + camera_intrinsics = np.eye(4).astype(np.float32) + camera_intrinsics[:3, :3] = cam_info["cam_intrinsic"] + input_dict["camera_intrinsics"].append(camera_intrinsics) + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + cam_intrinsic=cam_intrinsics, + lidar2cam=lidar2cam_rts, + )) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + + rotation = Quaternion(input_dict['ego2global_rotation']) + translation = input_dict['ego2global_translation'] + can_bus = input_dict['can_bus'] + can_bus[:3] = translation + can_bus[3:7] = rotation + patch_angle = quaternion_yaw(rotation) / np.pi * 180 + if patch_angle < 0: + patch_angle += 360 + can_bus[-2] = patch_angle / 180 * np.pi + can_bus[-1] = patch_angle + + + lidar2ego = np.eye(4) + lidar2ego[:3,:3] = Quaternion(input_dict['lidar2ego_rotation']).rotation_matrix + lidar2ego[:3, 3] = input_dict['lidar2ego_translation'] + ego2global = np.eye(4) + ego2global[:3,:3] = Quaternion(input_dict['ego2global_rotation']).rotation_matrix + ego2global[:3, 3] = input_dict['ego2global_translation'] + lidar2global = ego2global @ lidar2ego + input_dict['lidar2global'] = lidar2global + return input_dict + + def prepare_test_data(self, index): + """Prepare data for testing. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Testing data dict of the corresponding index. 
def __getitem__(self, idx):
    """Return the sample for ``idx``.

    In test mode the test sample is returned directly. In training mode,
    whenever ``prepare_train_data`` yields ``None`` a replacement index is
    drawn with ``_rand_another`` and preparation is retried until a sample
    is produced.

    Returns:
        dict: Data dictionary of the corresponding index.
    """
    if self.test_mode:
        return self.prepare_test_data(idx)

    sample = self.prepare_train_data(idx)
    while sample is None:
        idx = self._rand_another(idx)
        sample = self.prepare_train_data(idx)
    return sample
def to_gt_vectors(self, gt_dict):
    """Sample every ground-truth line instance into a point vector.

    Args:
        gt_dict (dict): Must provide ``gt_dict['gt_labels_3d'].data`` (an
            iterable of labels) and
            ``gt_dict['gt_bboxes_3d'].data.instance_list`` (an iterable of
            line geometries), paired element-wise.

    Returns:
        list[dict]: One dict per instance with keys ``'pts'`` and
        ``'pts_num'`` (as returned by ``sample_pts_from_line``) and
        ``'type'`` (the label as ``int``).
    """
    gt_labels = gt_dict['gt_labels_3d'].data
    gt_instances = gt_dict['gt_bboxes_3d'].data.instance_list

    gt_vectors = []
    for gt_instance, gt_label in zip(gt_instances, gt_labels):
        pts, pts_num = sample_pts_from_line(gt_instance,
                                            patch_size=self.patch_size)
        gt_vectors.append({
            'pts': pts,
            'pts_num': pts_num,
            'type': int(gt_label)
        })

    # The former per-class grouping loop was removed: it read an undefined
    # name (NameError on any non-empty input) and its result was never
    # returned or stored.
    return gt_vectors
annotations, + cls_gens, + cls_gts, + threshold=thr, + cls_names=self.MAPCLASSES, + logger=logger, + num_pred_pts_per_instance=self.fixed_num, + pc_range=self.pc_range, + metric=metric) + for j in range(self.NUM_MAPCLASSES): + cls_aps[i, j] = cls_ap[j]['ap'] + + for i, name in enumerate(self.MAPCLASSES): + print('{}: {}'.format(name, cls_aps.mean(0)[i])) + detail['NuscMap_{}/{}_AP'.format(metric,name)] = cls_aps.mean(0)[i] + print('map: {}'.format(cls_aps.mean(0).mean())) + detail['NuscMap_{}/mAP'.format(metric)] = cls_aps.mean(0).mean() + + for i, name in enumerate(self.MAPCLASSES): + for j, thr in enumerate(thresholds): + if metric == 'chamfer': + detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] + elif metric == 'iou': + if thr == 0.5 or thr == 0.75: + detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] + + return detail + + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + result_names=['pts_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + show (bool): Whether to visualize. + Default: False. + out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Results of each evaluation metric. 
def sample_pts_from_line(line,
                         fixed_num=-1,
                         sample_dist=1,
                         normalize=False,
                         patch_size=None,
                         padding=False,
                         num_samples=250,):
    """Sample 2D points along a line geometry.

    Args:
        line: Object exposing ``length`` and ``interpolate(distance)`` whose
            result has a ``coords`` sequence holding one (x, y) pair.
        fixed_num (int): If negative, sample every ``sample_dist`` along the
            line; otherwise sample exactly ``fixed_num`` evenly spaced
            points including both ends.
        sample_dist (float): Spacing used when ``fixed_num`` is negative.
        normalize (bool): Divide x by ``patch_size[1]`` and y by
            ``patch_size[0]``.
        patch_size: Pair ``(h, w)``; required when ``normalize`` is set.
        padding (bool): With distance-based sampling, pad with zero rows or
            truncate so that exactly ``num_samples`` rows are returned.
        num_samples (int): Row count produced when padding applies.

    Returns:
        tuple: ``(points, num_valid)`` where ``points`` has shape ``(N, 2)``
        and ``num_valid`` is the number of rows that are real samples
        (never counting zero padding).
    """
    if fixed_num < 0:
        distances = np.arange(0, line.length, sample_dist)
    else:
        # Fixed number of points, evenly spread over the whole length.
        distances = np.linspace(0, line.length, fixed_num)
    sampled_points = np.array(
        [list(line.interpolate(distance).coords) for distance in distances]
    ).reshape(-1, 2)

    # Normalise exactly once, before any zero padding is appended.
    if normalize:
        sampled_points = sampled_points / np.array([patch_size[1], patch_size[0]])

    num_valid = len(sampled_points)

    if not padding or fixed_num >= 0:
        # Fixed-count sampling (or no padding requested) is already done.
        return sampled_points, num_valid

    # Distance-based sampling: bring the array to exactly num_samples rows.
    if num_valid < num_samples:
        pad_rows = np.zeros((num_samples - num_valid, 2))
        sampled_points = np.concatenate([sampled_points, pad_rows], axis=0)
    else:
        sampled_points = sampled_points[:num_samples, :]
        num_valid = num_samples

    return sampled_points, num_valid
+ with_velocity (bool, optional): Whether include velocity prediction + into the experiments. Defaults to True. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'Camera' in this class. Available options includes. + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + eval_version (str, optional): Configuration version of evaluation. + Defaults to 'detection_cvpr_2019'. + use_valid_flag (bool): Whether to use `use_valid_flag` key in the info + file as mask to filter gt_boxes and gt_names. Defaults to False. + version (str, optional): Dataset version. Defaults to 'v1.0-trainval'. + """ + CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + DefaultAttribute = { + 'car': 'vehicle.parked', + 'pedestrian': 'pedestrian.moving', + 'trailer': 'vehicle.parked', + 'truck': 'vehicle.parked', + 'bus': 'vehicle.moving', + 'motorcycle': 'cycle.without_rider', + 'construction_vehicle': 'vehicle.parked', + 'bicycle': 'cycle.without_rider', + 'barrier': '', + 'traffic_cone': '', + } + # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa + ErrNameMapping = { + 'trans_err': 'mATE', + 'scale_err': 'mASE', + 'orient_err': 'mAOE', + 'vel_err': 'mAVE', + 'attr_err': 'mAAE' + } + + def __init__(self, + data_root, + load_interval=1, + with_velocity=True, + modality=None, + box_type_3d='Camera', + eval_version='detection_cvpr_2019', + use_valid_flag=False, + overlap_test=False, + version='v1.0-trainval', + **kwargs): + super().__init__(**kwargs) + # overlap_test = 
def pre_pipeline(self, results):
    """Seed ``results`` with the bookkeeping entries the pipeline expects.

    Args:
        results (dict): Dict before data preprocessing; modified in place.
            After the call it holds an empty ``img_prefix``, the dataset's
            ``seg_prefix`` and ``proposal_file``, a fresh empty list for
            every ``*_fields`` key, and the dataset's ``box_type_3d`` and
            ``box_mode_3d``.
    """
    # The image prefix is deliberately blank rather than self.img_prefix.
    results['img_prefix'] = ''
    results['seg_prefix'] = self.seg_prefix
    results['proposal_file'] = self.proposal_file

    field_keys = (
        'img_fields',
        'bbox3d_fields',
        'pts_mask_fields',
        'pts_seg_fields',
        'bbox_fields',
        'mask_fields',
        'seg_fields',
    )
    for field_key in field_keys:
        # Each key gets its own list object.
        results[field_key] = []

    results['box_type_3d'] = self.box_type_3d
    results['box_mode_3d'] = self.box_mode_3d
+ Returns: + dict: A dict containing the following keys: bboxes, labels, \ + gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, \ + depths, bboxes_ignore, masks, seg_map + """ + gt_bboxes = [] + gt_labels = [] + attr_labels = [] + gt_bboxes_ignore = [] + gt_masks_ann = [] + gt_bboxes_cam3d = [] + centers2d = [] + depths = [] + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + if ann.get('iscrowd', False): + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_labels.append(self.cat2label[ann['category_id']]) + attr_labels.append(ann['attribute_id']) + gt_masks_ann.append(ann.get('segmentation', None)) + # 3D annotations in camera coordinates + bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(1, -1) + velo_cam3d = np.array(ann['velo_cam3d']).reshape(1, 2) + nan_mask = np.isnan(velo_cam3d[:, 0]) + velo_cam3d[nan_mask] = [0.0, 0.0] + bbox_cam3d = np.concatenate([bbox_cam3d, velo_cam3d], axis=-1) + gt_bboxes_cam3d.append(bbox_cam3d.squeeze()) + # 2.5D annotations in camera coordinates + center2d = ann['center2d'][:2] + depth = ann['center2d'][2] + centers2d.append(center2d) + depths.append(depth) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + attr_labels = np.array(attr_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + attr_labels = np.array([], dtype=np.int64) + + if gt_bboxes_cam3d: + gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32) + centers2d = np.array(centers2d, dtype=np.float32) + depths = np.array(depths, dtype=np.float32) 
def get_attr_name(self, attr_idx, label_name):
    """Map a predicted attribute index to an attribute name valid for the class.

    The index is looked up in a fixed attribute table. If the resulting
    attribute belongs to the family allowed for ``label_name`` (vehicle,
    pedestrian or cycle) it is kept; otherwise the class default from
    ``CustomNuScenesMonoDataset.DefaultAttribute`` is returned. Labels
    outside those families always get their default.

    Args:
        attr_idx (int): Attribute index.
        label_name (str): Predicted category name.

    Returns:
        str: Predicted attribute name.
    """
    attr_table = [
        'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving',
        'pedestrian.standing', 'pedestrian.sitting_lying_down',
        'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None'
    ]

    if label_name in ('car', 'bus', 'truck', 'trailer',
                      'construction_vehicle'):
        allowed = ('vehicle.moving', 'vehicle.parked', 'vehicle.stopped')
    elif label_name == 'pedestrian':
        allowed = ('pedestrian.moving', 'pedestrian.standing',
                   'pedestrian.sitting_lying_down')
    elif label_name in ('bicycle', 'motorcycle'):
        allowed = ('cycle.with_rider', 'cycle.without_rider')
    else:
        allowed = ()

    # The table is only indexed for labels that have an attribute family.
    if allowed:
        predicted = attr_table[attr_idx]
        if predicted in allowed:
            return predicted
    return CustomNuScenesMonoDataset.DefaultAttribute[label_name]
+ """ + nusc_annos = {} + mapped_class_names = self.CLASSES + + print('Start to convert detection format...') + + CAM_NUM = 6 + + for sample_id, det in enumerate(mmcv.track_iter_progress(results)): + + if sample_id % CAM_NUM == 0: + boxes_per_frame = [] + attrs_per_frame = [] + + # need to merge results from images of the same sample + annos = [] + boxes, attrs = output_to_nusc_box(det) + sample_token = self.data_infos[sample_id]['token'] + boxes, attrs = cam_nusc_box_to_global(self.data_infos[sample_id], + boxes, attrs, + mapped_class_names, + self.eval_detection_configs, + self.eval_version) + + boxes_per_frame.extend(boxes) + attrs_per_frame.extend(attrs) + # Remove redundant predictions caused by overlap of images + if (sample_id + 1) % CAM_NUM != 0: + continue + boxes = global_nusc_box_to_cam( + self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame, + mapped_class_names, self.eval_detection_configs, + self.eval_version) + cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes) + # box nms 3d over 6 images in a frame + # TODO: move this global setting into config + nms_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.05, + score_thr=0.01, + min_bbox_size=0, + max_per_frame=500) + from mmcv import Config + nms_cfg = Config(nms_cfg) + cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev) + boxes3d = cam_boxes3d.tensor + # generate attr scores from attr labels + attrs = labels.new_tensor([attr for attr in attrs_per_frame]) + boxes3d, scores, labels, attrs = box3d_multiclass_nms( + boxes3d, + cam_boxes3d_for_nms, + scores, + nms_cfg.score_thr, + nms_cfg.max_per_frame, + nms_cfg, + mlvl_attr_scores=attrs) + cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9) + det = bbox3d2result(cam_boxes3d, scores, labels, attrs) + boxes, attrs = output_to_nusc_box(det) + boxes, attrs = cam_nusc_box_to_global( + self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs, + mapped_class_names, self.eval_detection_configs, + 
def _evaluate_single(self,
                     result_path,
                     logger=None,
                     metric='bbox',
                     result_name='img_bbox'):
    """Run the nuScenes detection evaluation on one result file.

    Args:
        result_path (str): Path of the result file.
        logger (logging.Logger | str | None): Accepted for interface
            compatibility; not read by this method. Default: None.
        metric (str): Accepted for interface compatibility; not read by
            this method. Default: 'bbox'.
        result_name (str): Result name used in the metric prefix.
            Default: 'img_bbox'.

    Returns:
        dict: Dictionary of evaluation details.
    """
    from nuscenes import NuScenes
    from .nuscnes_eval import NuScenesEval_custom

    # Plots and metrics_summary.json are written next to the result file.
    output_dir = osp.dirname(result_path)
    split_by_version = {
        'v1.0-mini': 'mini_val',
        'v1.0-trainval': 'val',
    }

    self.nusc = NuScenes(
        version=self.version, dataroot=self.data_root, verbose=False)
    self.nusc_eval = NuScenesEval_custom(
        self.nusc,
        config=self.eval_detection_configs,
        result_path=result_path,
        eval_set=split_by_version[self.version],
        output_dir=output_dir,
        verbose=True,
        overlap_test=self.overlap_test,
        data_infos=self.data_infos)
    self.nusc_eval.main(render_curves=True)

    # Read back the summary and flatten it into '<prefix>/<metric>' keys.
    metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json'))
    prefix = f'{result_name}_NuScenes'
    detail = {}
    for cls_name in self.CLASSES:
        for dist_th, ap in metrics['label_aps'][cls_name].items():
            detail[f'{prefix}/{cls_name}_AP_dist_{dist_th}'] = \
                float(f'{ap:.4f}')
        for err_name, err in metrics['label_tp_errors'][cls_name].items():
            detail[f'{prefix}/{cls_name}_{err_name}'] = float(f'{err:.4f}')
        for err_name, err in metrics['tp_errors'].items():
            detail[f'{prefix}/{self.ErrNameMapping[err_name]}'] = \
                float(f'{err:.4f}')

    detail[f'{prefix}/NDS'] = metrics['nd_score']
    detail[f'{prefix}/mAP'] = metrics['mean_ap']
    return detail
def evaluate(self,
             results,
             metric='bbox',
             logger=None,
             jsonfile_prefix=None,
             result_names=None,
             show=False,
             out_dir=None,
             pipeline=None):
    """Evaluation in nuScenes protocol.

    Args:
        results (list[dict]): Testing results of the dataset.
        metric (str | list[str]): Metrics to be evaluated. Not read by
            this method.
        logger (logging.Logger | str | None): Logger used for printing
            related information during evaluation. Not read by this
            method. Default: None.
        jsonfile_prefix (str | None): The prefix of json files. It includes
            the file path and the prefix of filename, e.g., "a/b/prefix".
            If not specified, a temp file will be created. Default: None.
        result_names (list[str] | None): Keys of the formatted result
            files to evaluate when ``format_results`` returns a dict.
            ``None`` means ``['img_bbox']``. Default: None.
        show (bool): Whether to visualize.
            Default: False.
        out_dir (str): Path to save the visualization results.
            Default: None.
        pipeline (list[dict], optional): raw data loading for showing.
            Default: None.

    Returns:
        dict[str, float]: Results of each evaluation metric.

    Raises:
        TypeError: If ``format_results`` returns something that is neither
            a dict nor a str.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if result_names is None:
        result_names = ['img_bbox']

    result_files, tmp_dir = self.format_results(results, jsonfile_prefix)

    try:
        if isinstance(result_files, dict):
            results_dict = dict()
            for name in result_names:
                print('Evaluating bboxes of {}'.format(name))
                results_dict.update(
                    self._evaluate_single(result_files[name]))
        elif isinstance(result_files, str):
            results_dict = self._evaluate_single(result_files)
        else:
            # Previously this fell through and failed later with an
            # unbound local variable.
            raise TypeError(
                'result_files must be a dict or a str, got '
                f'{type(result_files).__name__}')
    finally:
        # Remove the temporary directory even when evaluation fails.
        if tmp_dir is not None:
            tmp_dir.cleanup()

    if show:
        self.show(results, out_dir, pipeline=pipeline)
    return results_dict
+ """ + assert pipeline is not None, 'data loading pipeline is not provided' + img_info = self.data_infos[index] + input_dict = dict(img_info=img_info) + + if load_annos: + ann_info = self.get_ann_info(index) + input_dict.update(dict(ann_info=ann_info)) + + self.pre_pipeline(input_dict) + example = pipeline(input_dict) + + # extract data items according to keys + if isinstance(key, str): + data = extract_result_dict(example, key) + else: + data = [extract_result_dict(example, k) for k in key] + + return data + + def _get_pipeline(self, pipeline): + """Get data loading pipeline in self.show/evaluate function. + Args: + pipeline (list[dict] | None): Input pipeline. If None is given, \ + get from self.pipeline. + """ + if pipeline is None: + if not hasattr(self, 'pipeline') or self.pipeline is None: + warnings.warn( + 'Use default pipeline for data loading, this may cause ' + 'errors when data is on ceph') + return self._build_default_pipeline() + loading_pipeline = get_loading_pipeline(self.pipeline.transforms) + return Compose(loading_pipeline) + return Compose(pipeline) + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='DefaultFormatBundle3D', + class_names=self.CLASSES, + with_label=False), + dict(type='Collect3D', keys=['img']) + ] + return Compose(pipeline) + + def show(self, results, out_dir, show=True, pipeline=None): + """Results visualization. + Args: + results (list[dict]): List of bounding boxes results. + out_dir (str): Output directory of visualization result. + show (bool): Visualize the results online. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + """ + assert out_dir is not None, 'Expect out_dir, got none.' 
def output_to_nusc_box(detection):
    """Convert the output to the box class in the nuScenes.

    Args:
        detection (dict): Detection results.
            - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
            - scores_3d (torch.Tensor): Detection scores.
            - labels_3d (torch.Tensor): Predicted box labels.
            - attrs_3d (torch.Tensor, optional): Predicted attributes.

    Returns:
        tuple: ``(box_list, attrs)`` where ``box_list`` is a
        list[:obj:`NuScenesBox`] and ``attrs`` is the ``attrs_3d`` entry
        converted with ``.numpy()``, or None when the key is absent.
    """
    box3d = detection['boxes_3d']
    scores = detection['scores_3d'].numpy()
    labels = detection['labels_3d'].numpy()
    attrs = None
    if 'attrs_3d' in detection:
        attrs = detection['attrs_3d'].numpy()

    box_gravity_center = box3d.gravity_center.numpy()
    box_dims = box3d.dims.numpy()
    box_yaw = box3d.yaw.numpy()

    # convert the dim/rot to nuscbox convention
    # Fancy indexing returns a fresh array. The previous in-place
    # assignment wrote into the array returned by ``.numpy()``, which
    # shares memory with its tensor and could therefore reorder the
    # caller's box dimensions as a side effect.
    box_dims = box_dims[:, [2, 0, 1]]
    box_yaw = -box_yaw

    box_list = []
    for i in range(len(box3d)):
        q1 = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
        q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2)
        quat = q2 * q1
        # Columns 7 and 8 of the box tensor are used as the first and
        # third velocity components; the second is fixed to 0.
        velocity = (box3d.tensor[i, 7], 0.0, box3d.tensor[i, 8])
        box = NuScenesBox(
            box_gravity_center[i],
            box_dims[i],
            quat,
            label=labels[i],
            score=scores[i],
            velocity=velocity)
        box_list.append(box)
    return box_list, attrs
def global_nusc_box_to_cam(info,
                           boxes,
                           classes,
                           eval_configs,
                           eval_version='detection_cvpr_2019'):
    """Convert the box from global to camera coordinate.

    Each box is transformed through ``translate``/``rotate`` calls on the
    passed object itself, and boxes farther from the ego origin than the
    class detection range are dropped.

    Args:
        info (dict): Info for a specific sample data, including the
            calibration information (``ego2global_*`` and ``cam2ego_*``).
        boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
        classes (list[str]): Mapped classes in the evaluation.
        eval_configs (object): Evaluation configuration object; its
            ``class_range`` maps class name to detection range.
        eval_version (str): Evaluation version. Not read by this function.
            Default: 'detection_cvpr_2019'

    Returns:
        list: List of standard NuScenesBoxes in the camera coordinate.
    """
    kept = []
    for nusc_box in boxes:
        # global -> ego: undo the ego pose (translation first, then
        # rotation).
        nusc_box.translate(-np.array(info['ego2global_translation']))
        ego_unrotate = pyquaternion.Quaternion(
            info['ego2global_rotation']).inverse
        nusc_box.rotate(ego_unrotate)

        # Range filter, evaluated in the ego frame on the x-y plane.
        range_by_class = eval_configs.class_range
        planar_dist = np.linalg.norm(nusc_box.center[:2], 2)
        max_range = range_by_class[classes[nusc_box.label]]
        if planar_dist > max_range:
            continue

        # ego -> camera: undo the camera extrinsics.
        nusc_box.translate(-np.array(info['cam2ego_translation']))
        cam_unrotate = pyquaternion.Quaternion(
            info['cam2ego_rotation']).inverse
        nusc_box.rotate(cam_unrotate)
        kept.append(nusc_box)
    return kept
+ """ + locs = torch.Tensor([b.center for b in boxes]).view(-1, 3) + dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3) + rots = torch.Tensor([b.orientation.yaw_pitch_roll[0] + for b in boxes]).view(-1, 1) + velocity = torch.Tensor([b.velocity[:2] for b in boxes]).view(-1, 2) + + # convert nusbox to cambox convention + dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]] + rots = -rots + + boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda() + cam_boxes3d = CameraInstance3DBoxes( + boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5)) + scores = torch.Tensor([b.score for b in boxes]).cuda() + labels = torch.LongTensor([b.label for b in boxes]).cuda() + nms_scores = scores.new_zeros(scores.shape[0], 10 + 1) + indices = labels.new_tensor(list(range(scores.shape[0]))) + nms_scores[indices, labels] = scores + return cam_boxes3d, nms_scores, labels \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscnes_eval.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscnes_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..a2ae34cde3fdc48981619f3fac4e65f599c8295f --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscnes_eval.py @@ -0,0 +1,751 @@ +import argparse +import copy +import json +import os +import time +from typing import Tuple, Dict, Any +import torch +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionConfig +from nuscenes.eval.detection.evaluate import NuScenesEval +from pyquaternion import Quaternion + +from nuscenes import NuScenes +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.eval.tracking.data_classes import TrackingBox +from 
nuscenes.utils.data_classes import Box +from nuscenes.utils.geometry_utils import points_in_box +from nuscenes.utils.splits import create_splits_scenes +from nuscenes.eval.common.loaders import load_prediction, add_center_dist, filter_eval_boxes +import tqdm +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +from torchvision.transforms.functional import rotate +import pycocotools.mask as mask_util +# from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from torchvision.transforms.functional import rotate +import cv2 +import argparse +import json +import os +import random +import time +from typing import Tuple, Dict, Any + +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.loaders import load_prediction, load_gt, add_center_dist, filter_eval_boxes +from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp +from nuscenes.eval.detection.constants import TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, \ + DetectionMetricDataList +from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from mmdet3d.core.bbox.iou_calculators import BboxOverlaps3D +from IPython import embed +import json +from typing import Any + +import numpy as np +from matplotlib import pyplot as plt + +from nuscenes import NuScenes +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.render import setup_axis +from nuscenes.eval.common.utils import boxes_to_sensor +from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \ + PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS +from nuscenes.eval.detection.data_classes import 
def class_tp_curve(md_list: DetectionMetricDataList,
                   metrics: DetectionMetrics,
                   detection_name: str,
                   min_recall: float,
                   dist_th_tp: float,
                   savepath: str = None,
                   ax: Axis = None) -> None:
    """
    Plot the true positive curve for the specified class.
    :param md_list: DetectionMetricDataList instance.
    :param metrics: DetectionMetrics instance.
    :param detection_name: Name of the detection class to plot.
    :param min_recall: Minimum recall value.
    :param dist_th_tp: The distance threshold used to determine matches.
    :param savepath: If given, saves the rendering here instead of displaying.
    :param ax: Axes onto which to render.
    """
    # Get metric data for given detection class with tp distance threshold.
    md = md_list[(detection_name, dist_th_tp)]
    min_recall_ind = round(100 * min_recall)
    has_curve = min_recall_ind <= md.max_recall_ind
    if has_curve:
        # For traffic_cone and barrier only a subset of the metrics are plotted.
        rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))]
        ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1
    else:
        ylimit = 1.0

    # Prepare axis.
    if ax is None:
        ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1,
                        min_recall=min_recall)
    ax.set_ylim(0, ylimit)

    # Plot the recall vs. error curve for each tp metric.
    for metric in TP_METRICS:
        tp = metrics.get_label_tp(detection_name, metric)
        # Test by value, as the rel_metrics filter above does: an identity
        # test against np.nan misses NaN values that are not that exact
        # object.
        tp_is_nan = np.isnan(tp)

        # Plot only if we have valid data.
        if not tp_is_nan and has_curve:
            recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1]
        else:
            recall, error = [], []

        # Change legend based on tp value
        if tp_is_nan:
            label = '{}: n/a'.format(PRETTY_TP_METRICS[metric])
        elif not has_curve:
            label = '{}: nan'.format(PRETTY_TP_METRICS[metric])
        else:
            label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric])
        if metric == 'trans_err':
            label += f' ({md.max_recall_ind})'  # add recall
            print(f'Recall: {detection_name}: {md.max_recall_ind/100}')
        ax.plot(recall, error, label=label)
    ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3))
    ax.legend(loc='best')

    if savepath is not None:
        plt.savefig(savepath)
        plt.close()
""" + return cls( + token=content['token'], + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'], + visibility=content['visibility'], + index=content['index'], + ) + + +def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible inside an image without accounting for occlusions. + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + center_3d = box.center.reshape(3, 1) + center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0]) + visible = np.logical_and(visible, center_img[1, :] < imsize[1]) + visible = np.logical_and(visible, center_img[1, :] > 0) + visible = np.logical_and(visible, center_3d[2, :] > 1) + + in_front = center_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. 
def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int],
                                       vis_level: int = BoxVisibility.ANY) -> bool:
    """
    Check whether a box has some, but not all, of its corners inside the image.
    :param box: The box to be checked.
    :param intrinsic: Intrinsic camera matrix.
    :param imsize: (width, height).
    :param vis_level: Accepted for signature compatibility; not read by this function.
    :return: True if at least one corner is visible, at least one is not, and
        every corner is in front of the camera.
    """
    corners_3d = box.corners()
    corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :]
    u, v = corners_img[0, :], corners_img[1, :]
    depth = corners_3d[2, :]

    # A corner counts as visible when it projects strictly inside the image
    # and its depth is greater than 1.
    inside = (u > 0) & (u < imsize[0]) & (v < imsize[1]) & (v > 0) & (depth > 1)

    # True if a corner is at least 0.1 meter in front of the camera.
    in_front = depth > 0.1

    return any(inside) and not all(inside) and all(in_front)
+ sample_tokens_all = [s['token'] for s in nusc.sample] + assert len(sample_tokens_all) > 0, "Error: Database has no samples!" + + # Only keep samples from this split. + splits = create_splits_scenes() + + # Check compatibility of split with nusc_version. + version = nusc.version + if eval_split in {'train', 'val', 'train_detect', 'train_track'}: + assert version.endswith('trainval'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split in {'mini_train', 'mini_val'}: + assert version.endswith('mini'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split == 'test': + assert version.endswith('test'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + else: + raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.' + .format(eval_split)) + + if eval_split == 'test': + # Check that you aren't trying to cheat :). + assert len(nusc.sample_annotation) > 0, \ + 'Error: You are trying to evaluate on the test set but you do not have the annotations!' + index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + index_map[sample['token']] = index + index += 1 + + sample_tokens = [] + for sample_token in sample_tokens_all: + scene_token = nusc.get('sample', sample_token)['scene_token'] + scene_record = nusc.get('scene', scene_token) + if scene_record['name'] in splits[eval_split]: + sample_tokens.append(sample_token) + + all_annotations = EvalBoxes() + + # Load annotations and filter predictions and annotations. 
def filter_eval_boxes_by_id(nusc: 'NuScenes',
                            eval_boxes: 'EvalBoxes',
                            id=None,
                            verbose: bool = False) -> 'EvalBoxes':
    """
    Keeps only the boxes whose annotation token is listed in ``id``.
    The filtering is done in place on ``eval_boxes``.
    :param nusc: An instance of the NuScenes class. Not read by this function.
    :param eval_boxes: An instance of the EvalBoxes class.
    :param id: Iterable of annotation tokens to keep. ``None`` (the default)
        disables the filter and keeps every box; previously the default
        raised a TypeError on the first box.
    :param verbose: Whether to print to stdout.
    :return: The same ``eval_boxes`` object, filtered.
    """
    # Build the lookup once so that each membership test is O(1).
    keep_tokens = None if id is None else frozenset(id)

    # Accumulators for number of filtered boxes.
    total, anns_filter = 0, 0
    for sample_token in eval_boxes.sample_tokens:
        sample_boxes = eval_boxes[sample_token]
        total += len(sample_boxes)

        # Filter on anns
        if keep_tokens is None:
            kept = list(sample_boxes)
        else:
            kept = [box for box in sample_boxes if box.token in keep_tokens]
        anns_filter += len(kept)
        eval_boxes.boxes[sample_token] = kept

    if verbose:
        print("=> Original number of boxes: %d" % total)
        print("=> After anns based filtering: %d" % anns_filter)

    return eval_boxes
def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=(), verbose=False):
    """
    Returns a deep copy of the boxes restricted to the given samples.
    :param ori_eval_boxes: Object exposing ``sample_tokens`` and a ``boxes``
        dict keyed by sample token. It is not modified.
    :param valid_sample_tokens: Iterable of sample tokens to keep. The
        default keeps nothing.
    :param verbose: Not read by this function.
    :return: A deep copy of ``ori_eval_boxes`` whose ``boxes`` dict holds
        only the samples listed in ``valid_sample_tokens``.
    """
    # Build the lookup once so that each membership test is O(1).
    valid = frozenset(valid_sample_tokens)
    eval_boxes = copy.deepcopy(ori_eval_boxes)
    # Snapshot the tokens: entries are popped from ``boxes`` while looping,
    # which would fail if ``sample_tokens`` were a live view of its keys.
    for sample_token in list(eval_boxes.sample_tokens):
        if sample_token not in valid:
            eval_boxes.boxes.pop(sample_token)
    return eval_boxes
+ cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + sample_record = nusc.get('sample', sample_token) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + count = 0 + for cam in cams: + ''' + copy-paste form nuscens + ''' + sample_data_token = sample_record['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + new_box = Box(box.translation, box.size, Quaternion(box.rotation), + name=box.detection_name, token='') + + # Move box to ego vehicle coord system. + new_box.translate(-np.array(pose_record['translation'])) + new_box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. 
def __init__(self,
             nusc: NuScenes,
             config: DetectionConfig,
             result_path: str,
             eval_set: str,
             output_dir: str = None,
             verbose: bool = True,
             overlap_test=False,
             eval_mask=False,
             data_infos=None
             ):
    """
    Load predictions and ground truth, filter them, and index samples per scene.
    :param nusc: A NuScenes object.
    :param config: A DetectionConfig object.
    :param result_path: Path of the nuScenes JSON result file.
    :param eval_set: The dataset split to evaluate on, e.g. train, val or test.
    :param output_dir: Folder to save plots and results to. It is joined with
        'plots' unconditionally, so a real path must be supplied despite the
        ``None`` default.
    :param verbose: Whether to print to stdout.
    :param overlap_test: When true, predictions and ground truth are
        additionally passed through ``filter_eval_boxes_by_overlap``.
    :param eval_mask: Stored on the instance; not read by this constructor.
    :param data_infos: Stored on the instance; not read by this constructor.
    """
    # Plain configuration state.
    self.nusc = nusc
    self.cfg = config
    self.result_path = result_path
    self.eval_set = eval_set
    self.output_dir = output_dir
    self.verbose = verbose
    self.overlap_test = overlap_test
    self.eval_mask = eval_mask
    self.data_infos = data_infos

    # The result file has to be present before anything else is attempted.
    assert os.path.exists(result_path), 'Error: The result file does not exist!'

    # Output folders: <output_dir> and <output_dir>/plots.
    self.plot_dir = os.path.join(self.output_dir, 'plots')
    os.makedirs(self.output_dir, exist_ok=True)
    os.makedirs(self.plot_dir, exist_ok=True)

    # Read predictions and ground truth.
    if verbose:
        print('Initializing nuScenes detection evaluation')
    self.pred_boxes, self.meta = load_prediction(
        self.result_path, self.cfg.max_boxes_per_sample, DetectionBox, verbose=verbose)
    self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose)

    assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \
        "Samples in split doesn't match samples in predictions."

    # Attach ego-centre distances to both box sets.
    self.pred_boxes = add_center_dist(nusc, self.pred_boxes)
    self.gt_boxes = add_center_dist(nusc, self.gt_boxes)

    # Standard filtering (distance, points per box, etc.).
    if verbose:
        print('Filtering predictions')
    self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose)
    if verbose:
        print('Filtering ground truth annotations')
    self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose)

    # Optional extra filter that keeps boxes seen by more than one camera.
    if self.overlap_test:
        self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes)
        self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True)

    # Pristine copies that update_gt() re-filters from.
    self.all_gt = copy.deepcopy(self.gt_boxes)
    self.all_preds = copy.deepcopy(self.pred_boxes)
    self.sample_tokens = self.gt_boxes.sample_tokens

    # Map every sample token to its 1-based position inside its scene by
    # following the 'next' links from the scene's first sample.
    self.index_map = {}
    for scene in nusc.scene:
        head_token = scene['first_sample_token']
        record = nusc.get('sample', head_token)
        self.index_map[head_token] = 1
        position = 1
        while record['next'] != '':
            record = nusc.get('sample', record['next'])
            position += 1
            self.index_map[record['token']] = position
def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]:
    """
    Run the evaluation on the currently selected ground truth and predictions.
    :return: A tuple of the high-level metrics and the raw metric data.
    """
    started = time.time()
    cfg = self.cfg

    # -----------------------------------
    # Step 1: Accumulate metric data for every (class, distance threshold) pair.
    # -----------------------------------
    if self.verbose:
        print('Accumulating metric data...')
    metric_data_list = DetectionMetricDataList()
    for class_name in cfg.class_names:
        for dist_th in cfg.dist_ths:
            accumulated = accumulate(
                self.gt_boxes, self.pred_boxes, class_name, cfg.dist_fcn_callable, dist_th)
            metric_data_list.set(class_name, dist_th, accumulated)

    # -----------------------------------
    # Step 2: Derive AP and TP metrics from the accumulated data.
    # -----------------------------------
    if self.verbose:
        print('Calculating metrics...')

    # Class -> TP metrics that are reported as NaN instead of being computed.
    nan_tp_metrics = {
        'traffic_cone': ('attr_err', 'vel_err', 'orient_err'),
        'barrier': ('attr_err', 'vel_err'),
    }

    metrics = DetectionMetrics(cfg)
    for class_name in cfg.class_names:
        # Average precision at each distance threshold.
        for dist_th in cfg.dist_ths:
            ap = calc_ap(metric_data_list[(class_name, dist_th)], cfg.min_recall, cfg.min_precision)
            metrics.add_label_ap(class_name, dist_th, ap)

        # True-positive metrics at the dedicated TP distance threshold.
        skipped = nan_tp_metrics.get(class_name, ())
        for metric_name in TP_METRICS:
            # Looked up before branching, as in the original flow.
            metric_data = metric_data_list[(class_name, cfg.dist_th_tp)]
            if metric_name in skipped:
                tp = np.nan
            else:
                tp = calc_tp(metric_data, cfg.min_recall, metric_name)
            metrics.add_label_tp(class_name, metric_name, tp)

    # Record how long the evaluation took.
    metrics.add_runtime(time.time() - started)

    return metrics, metric_data_list
+ parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('result_path', type=str, help='The submission as a JSON file.') + parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics', + help='Folder to store result metrics, graphs and example visualizations.') + parser.add_argument('--eval_set', type=str, default='val', + help='Which dataset split to evaluate on, train, val or test.') + parser.add_argument('--dataroot', type=str, default='data/nuscenes', + help='Default nuScenes data directory.') + parser.add_argument('--version', type=str, default='v1.0-trainval', + help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.') + parser.add_argument('--config_path', type=str, default='', + help='Path to the configuration file.' + 'If no path given, the CVPR 2019 configuration will be used.') + parser.add_argument('--plot_examples', type=int, default=0, + help='How many example visualizations to write to disk.') + parser.add_argument('--render_curves', type=int, default=1, + help='Whether to render PR and TP curves to disk.') + parser.add_argument('--verbose', type=int, default=1, + help='Whether to print to stdout.') + args = parser.parse_args() + + result_path_ = os.path.expanduser(args.result_path) + output_dir_ = os.path.expanduser(args.output_dir) + eval_set_ = args.eval_set + dataroot_ = args.dataroot + version_ = args.version + config_path = args.config_path + plot_examples_ = args.plot_examples + render_curves_ = bool(args.render_curves) + verbose_ = bool(args.verbose) + + if config_path == '': + cfg_ = config_factory('detection_cvpr_2019') + else: + with open(config_path, 'r') as _f: + cfg_ = DetectionConfig.deserialize(json.load(_f)) + + nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_) + nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_, + 
# Public pipeline components of the MapTR mmdet3d plugin.
from .transform_3d import (
    PadMultiViewImage, NormalizeMultiviewImage,
    PhotoMetricDistortionMultiViewImage, CustomCollect3D, RandomScaleImageMultiViewImage, CustomPointsRangeFilter)
from .formating import CustomDefaultFormatBundle3D

from .loading import CustomLoadPointsFromFile, CustomLoadPointsFromMultiSweeps, CustomLoadMultiViewImageFromFiles

# Every name imported above is listed, so ``from ... import *`` exposes the
# same set of transforms that this package imports.
__all__ = [
    'PadMultiViewImage', 'NormalizeMultiviewImage',
    'PhotoMetricDistortionMultiViewImage', 'CustomDefaultFormatBundle3D', 'CustomCollect3D',
    'RandomScaleImageMultiViewImage', 'CustomPointsRangeFilter',
    'CustomLoadPointsFromFile', 'CustomLoadPointsFromMultiSweeps',
    'CustomLoadMultiViewImageFromFiles',
]
@PIPELINES.register_module()
class CustomDefaultFormatBundle3D(DefaultFormatBundle3D):
    """Default 3D formatting bundle that additionally packs ``gt_map_masks``.

    All fields are first formatted by ``DefaultFormatBundle3D``. Afterwards
    ``results['gt_map_masks']`` is converted to a tensor and wrapped in a
    stacked ``DataContainer``.
    """

    def __call__(self, results):
        """Format the common fields and then ``gt_map_masks``.

        Args:
            results (dict): Result dict containing the data to convert. It
                must contain the key ``'gt_map_masks'``.

        Returns:
            dict: The result dict formatted by the parent bundle, with
                ``'gt_map_masks'`` replaced by a stacked ``DataContainer``.
        """
        # Parent class handles the standard 3D fields.
        formatted = super().__call__(results)

        # Map masks: tensor first, then a stackable container.
        map_masks = to_tensor(formatted['gt_map_masks'])
        formatted['gt_map_masks'] = DC(map_masks, stack=True)
        return formatted
+ + - filename (str): Multi-view image filenames. + - img (np.ndarray): Multi-view image arrays. + - img_shape (tuple[int]): Shape of multi-view image arrays. + - ori_shape (tuple[int]): Shape of original image arrays. + - pad_shape (tuple[int]): Shape of padded image arrays. + - scale_factor (float): Scale factor. + - img_norm_cfg (dict): Normalization configuration of images. + """ + filename = results['img_filename'] + # img is of shape (h, w, c, num_views) + # img = np.stack( + # [mmcv.imread(name, self.color_type) for name in filename], axis=-1) + img_list = [mmcv.imread(name, self.color_type) for name in filename] + img_shape_list = [img.shape for img in img_list] + max_h = max([shape[0] for shape in img_shape_list]) + max_w = max([shape[1] for shape in img_shape_list]) + size = (max_h, max_w) + # import pdb;pdb.set_trace() + img_list = [mmcv.impad( + img, shape=size, pad_val=self.pad_val) for img in img_list] + + img = np.stack(img_list,axis=-1) + if self.to_float32: + img = img.astype(np.float32) + results['filename'] = filename + # unravel to list, see `DefaultFormatBundle` in formating.py + # which will transpose each image separately and then stack into array + results['img'] = [img[..., i] for i in range(img.shape[-1])] + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results['img_norm_cfg'] = dict( + mean=np.zeros(num_channels, dtype=np.float32), + std=np.ones(num_channels, dtype=np.float32), + to_rgb=False) + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(to_float32={self.to_float32}, ' + repr_str += f"color_type='{self.color_type}')" + return repr_str + +@PIPELINES.register_module() +class CustomLoadPointsFromMultiSweeps: + """Load points from multiple sweeps. 
+ + This is usually used for nuScenes dataset to utilize previous sweeps. + + Args: + sweeps_num (int): Number of sweeps. Defaults to 10. + load_dim (int): Dimension number of the loaded points. Defaults to 5. + use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4]. + pad_empty_sweeps (bool): Whether to repeat keyframe when + sweeps is empty. Defaults to False. + remove_close (bool): Whether to remove close points. + Defaults to False. + test_mode (bool): If test_model=True used for testing, it will not + randomly sample sweeps but select the nearest N frames. + Defaults to False. + """ + + def __init__( + self, + sweeps_num=10, + load_dim=5, + use_dim=(0, 1, 2, 4), + pad_empty_sweeps=False, + remove_close=False, + test_mode=False, + load_augmented=None, + reduce_beams=None, + ): + self.load_dim = load_dim + self.sweeps_num = sweeps_num + if isinstance(use_dim, int): + use_dim = list(range(use_dim)) + self.use_dim = use_dim + self.pad_empty_sweeps = pad_empty_sweeps + self.remove_close = remove_close + self.test_mode = test_mode + self.load_augmented = load_augmented + self.reduce_beams = reduce_beams + + def _load_points(self, lidar_path): + """Private function to load point clouds data. + + Args: + lidar_path (str): Filename of point clouds data. + + Returns: + np.ndarray: An array containing point clouds data. + """ + mmcv.check_file_exist(lidar_path) + if self.load_augmented: + assert self.load_augmented in ["pointpainting", "mvp"] + virtual = self.load_augmented == "mvp" + points = load_augmented_point_cloud( + lidar_path, virtual=virtual, reduce_beams=self.reduce_beams + ) + elif lidar_path.endswith(".npy"): + points = np.load(lidar_path) + else: + points = np.fromfile(lidar_path, dtype=np.float32) + return points + + def _remove_close(self, points, radius=1.0): + """Removes point too close within a certain radius from origin. + + Args: + points (np.ndarray | :obj:`BasePoints`): Sweep points. 
+ radius (float): Radius below which points are removed. + Defaults to 1.0. + + Returns: + np.ndarray: Points after removing. + """ + if isinstance(points, np.ndarray): + points_numpy = points + elif isinstance(points, BasePoints): + points_numpy = points.tensor.numpy() + else: + raise NotImplementedError + x_filt = np.abs(points_numpy[:, 0]) < radius + y_filt = np.abs(points_numpy[:, 1]) < radius + not_close = np.logical_not(np.logical_and(x_filt, y_filt)) + return points[not_close] + + def __call__(self, results): + """Call function to load multi-sweep point clouds from files. + + Args: + results (dict): Result dict containing multi-sweep point cloud \ + filenames. + + Returns: + dict: The result dict containing the multi-sweep points data. \ + Added key and value are described below. + + - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point \ + cloud arrays. + """ + points = results["points"] + points.tensor[:, 4] = 0 + sweep_points_list = [points] + ts = results["timestamp"] / 1e6 + if self.pad_empty_sweeps and len(results["sweeps"]) == 0: + for i in range(self.sweeps_num): + if self.remove_close: + sweep_points_list.append(self._remove_close(points)) + else: + sweep_points_list.append(points) + else: + if len(results["sweeps"]) <= self.sweeps_num: + choices = np.arange(len(results["sweeps"])) + elif self.test_mode: + choices = np.arange(self.sweeps_num) + else: + # NOTE: seems possible to load frame -11? + if not self.load_augmented: + choices = np.random.choice( + len(results["sweeps"]), self.sweeps_num, replace=False + ) + else: + # don't allow to sample the earliest frame, match with Tianwei's implementation. 
@PIPELINES.register_module()
class CustomLoadPointsFromFile:
    """Load a point cloud from disk into ``results["points"]``.

    Args:
        coord_type (str): Coordinate system of the points; one of
            'CAMERA', 'LIDAR' or 'DEPTH'.
        load_dim (int): Number of values stored per point in the file.
            Defaults to 6.
        use_dim (list[int] | int): Columns to keep after loading. An int
            ``k`` is expanded to ``[0, ..., k - 1]``. Defaults to (0, 1, 2).
        shift_height (bool): Whether to insert a height column relative to
            an estimated floor. Defaults to False.
        use_color (bool): Whether the last three kept columns are registered
            as colour attributes. Defaults to False.
        load_augmented (str, optional): 'pointpainting' or 'mvp' to read the
            augmented point cloud instead of the raw file. Defaults to None.
        reduce_beams (int, optional): Target beam count; beam reduction is
            applied when it is truthy and below 32. Defaults to None.
    """

    def __init__(
        self,
        coord_type,
        load_dim=6,
        use_dim=(0, 1, 2),
        shift_height=False,
        use_color=False,
        load_augmented=None,
        reduce_beams=None,
    ):
        # An integer means "the first N columns".
        if isinstance(use_dim, int):
            use_dim = list(range(use_dim))
        assert (
            max(use_dim) < load_dim
        ), f"Expect all used dimensions < {load_dim}, got {use_dim}"
        assert coord_type in ["CAMERA", "LIDAR", "DEPTH"]

        self.coord_type = coord_type
        self.load_dim = load_dim
        self.use_dim = use_dim
        self.shift_height = shift_height
        self.use_color = use_color
        self.load_augmented = load_augmented
        self.reduce_beams = reduce_beams

    def _load_points(self, lidar_path):
        """Read the raw point data for one file.

        Args:
            lidar_path (str): Filename of point clouds data.

        Returns:
            np.ndarray: An array containing point clouds data.
        """
        mmcv.check_file_exist(lidar_path)

        # Augmented clouds come from a dedicated loader.
        if self.load_augmented:
            assert self.load_augmented in ["pointpainting", "mvp"]
            return load_augmented_point_cloud(
                lidar_path,
                virtual=(self.load_augmented == "mvp"),
                reduce_beams=self.reduce_beams,
            )

        if lidar_path.endswith(".npy"):
            return np.load(lidar_path)
        # Anything else is read as a flat float32 buffer.
        return np.fromfile(lidar_path, dtype=np.float32)

    def __call__(self, results):
        """Load the point cloud referenced by ``results["lidar_path"]``.

        Args:
            results (dict): Result dict containing point clouds data.

        Returns:
            dict: The same dict with the key ``points`` added, holding an
                instance of the points class selected by ``coord_type``.
        """
        raw = self._load_points(results["lidar_path"]).reshape(-1, self.load_dim)
        # TODO: make it more general
        if self.reduce_beams and self.reduce_beams < 32:
            raw = reduce_LiDAR_beams(raw, self.reduce_beams)

        cloud = raw[:, self.use_dim]
        attribute_dims = None

        if self.shift_height:
            # q=0.99 is the 0.99th percentile of z, i.e. very close to the minimum.
            floor_height = np.percentile(cloud[:, 2], 0.99)
            relative_height = cloud[:, 2] - floor_height
            # The height column is inserted right after x, y, z.
            cloud = np.concatenate(
                [cloud[:, :3], relative_height[:, np.newaxis], cloud[:, 3:]], axis=1
            )
            attribute_dims = dict(height=3)

        if self.use_color:
            assert len(self.use_dim) >= 6
            if attribute_dims is None:
                attribute_dims = dict()
            # Colour occupies the last three columns.
            width = cloud.shape[1]
            attribute_dims["color"] = [width - 3, width - 2, width - 1]

        points_class = get_points_type(self.coord_type)
        results["points"] = points_class(
            cloud, points_dim=cloud.shape[-1], attribute_dims=attribute_dims
        )

        return results
+ tokens = path.split("/") + vp_dir = "_VIRTUAL" if reduce_beams == 32 else f"_VIRTUAL_{reduce_beams}BEAMS" + seg_path = os.path.join( + *tokens[:-3], + "virtual_points", + tokens[-3], + tokens[-2] + vp_dir, + tokens[-1] + ".pkl.npy", + ) + assert os.path.exists(seg_path) + data_dict = np.load(seg_path, allow_pickle=True).item() + + virtual_points1 = data_dict["real_points"] + # NOTE: add zero reflectance to virtual points instead of removing them from real points + virtual_points2 = np.concatenate( + [ + data_dict["virtual_points"][:, :3], + np.zeros([data_dict["virtual_points"].shape[0], 1]), + data_dict["virtual_points"][:, 3:], + ], + axis=-1, + ) + + points = np.concatenate( + [ + points, + np.ones([points.shape[0], virtual_points1.shape[1] - points.shape[1] + 1]), + ], + axis=1, + ) + virtual_points1 = np.concatenate( + [virtual_points1, np.zeros([virtual_points1.shape[0], 1])], axis=1 + ) + # note: this part is different from Tianwei's implementation, we don't have duplicate foreground real points. 
def reduce_LiDAR_beams(pts, reduce_beams_to=32):
    """Keep only the points lying on a subset of a 32-beam elevation table.

    Each point's elevation angle is ``asin(z / sqrt(x^2 + y^2 + z^2))``. A
    point is kept when that angle falls strictly inside the window of one of
    the beams selected for ``reduce_beams_to``.

    Args:
        pts (np.ndarray | torch.Tensor): 2-D array of points whose first
            three columns are x, y, z.
        reduce_beams_to (int): Target number of beams. Only 16, 4 and 1 are
            implemented.

    Returns:
        np.ndarray: The rows of ``pts`` that survive the beam mask.

    Raises:
        NotImplementedError: If ``reduce_beams_to`` is not 16, 4 or 1. This
            includes the default value of 32.
    """
    # Beam indices (into ``beam_range`` below) kept for each supported target.
    kept_beams = {
        16: range(1, 32, 2),
        4: (7, 9, 11, 13),
        1: (9,),
    }
    # Reject unsupported targets before doing any tensor work.
    if reduce_beams_to not in kept_beams:
        raise NotImplementedError

    if isinstance(pts, np.ndarray):
        pts = torch.from_numpy(pts)

    # Elevation angle of every point, in [-pi/2, pi/2].
    radius = torch.sqrt(pts[:, 0].pow(2) + pts[:, 1].pow(2) + pts[:, 2].pow(2))
    theta = torch.asin(pts[:, 2] / radius)

    # Elevation table: starts at the top angle and steps down by a fixed
    # increment; the last entry is pinned to the bottom angle. The table is
    # built cumulatively so the thresholds keep their exact float32 values.
    top_ang = 0.1862
    down_ang = -0.5353
    beam_range = torch.zeros(32)
    beam_range[0] = top_ang
    beam_range[31] = down_ang
    for i in range(1, 31):
        beam_range[i] = beam_range[i - 1] - 0.023275

    num_pts, _ = pts.size()
    mask = torch.zeros(num_pts, dtype=torch.bool)
    for beam_id in kept_beams[reduce_beams_to]:
        # A beam's window lies between two consecutive table entries, both
        # shifted down by 0.012.
        upper = beam_range[beam_id - 1] - 0.012
        lower = beam_range[beam_id] - 0.012
        mask |= (theta < upper) & (theta > lower)

    return pts[mask].numpy()
+ """ + + def __init__(self, size=None, size_divisor=None, pad_val=0): + self.size = size + self.size_divisor = size_divisor + self.pad_val = pad_val + # only one of size and size_divisor should be valid + assert size is not None or size_divisor is not None + assert size is None or size_divisor is None + + def _pad_img(self, results): + """Pad images according to ``self.size``.""" + if self.size is not None: + padded_img = [mmcv.impad( + img, shape=self.size, pad_val=self.pad_val) for img in results['img']] + elif self.size_divisor is not None: + padded_img = [mmcv.impad_to_multiple( + img, self.size_divisor, pad_val=self.pad_val) for img in results['img']] + + results['ori_shape'] = [img.shape for img in results['img']] + results['img'] = padded_img + results['img_shape'] = [img.shape for img in padded_img] + results['pad_shape'] = [img.shape for img in padded_img] + results['pad_fixed_size'] = self.size + results['pad_size_divisor'] = self.size_divisor + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Updated result dict. + """ + self._pad_img(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(size={self.size}, ' + repr_str += f'size_divisor={self.size_divisor}, ' + repr_str += f'pad_val={self.pad_val})' + return repr_str + + +@PIPELINES.register_module() +class NormalizeMultiviewImage(object): + """Normalize the image. + Added key is "img_norm_cfg". + Args: + mean (sequence): Mean values of 3 channels. + std (sequence): Std values of 3 channels. + to_rgb (bool): Whether to convert the image from BGR to RGB, + default is true. + """ + + def __init__(self, mean, std, to_rgb=True): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_rgb = to_rgb + + + def __call__(self, results): + """Call function to normalize images. 
@PIPELINES.register_module()
class PhotoMetricDistortionMultiViewImage:
    """Randomly perturb the photometry of every view in ``results['img']``.

    Each view is processed on its own. Every step is gated by its own
    ``random.randint(2)`` draw, and random contrast runs either before or
    after the HSV round trip depending on a drawn ``mode``:

    1. random brightness
    2. random contrast (when ``mode`` is 1)
    3. convert color from BGR to HSV
    4. random saturation
    5. random hue
    6. convert color from HSV to BGR
    7. random contrast (when ``mode`` is 0)
    8. randomly swap channels

    Args:
        brightness_delta (int): delta of brightness.
        contrast_range (tuple): range of contrast.
        saturation_range (tuple): range of saturation.
        hue_delta (int): delta of hue.
    """

    def __init__(self,
                 brightness_delta=32,
                 contrast_range=(0.5, 1.5),
                 saturation_range=(0.5, 1.5),
                 hue_delta=18):
        self.brightness_delta = brightness_delta
        self.hue_delta = hue_delta
        self.contrast_lower, self.contrast_upper = contrast_range
        self.saturation_lower, self.saturation_upper = saturation_range

    def _random_contrast(self, img):
        """Flip a coin and, on success, scale ``img`` in place."""
        if random.randint(2):
            img *= random.uniform(self.contrast_lower, self.contrast_upper)
        return img

    def _distort_view(self, img):
        """Run the full distortion chain on one view and return the result."""
        assert img.dtype == np.float32, \
            'PhotoMetricDistortion needs the input image of dtype np.float32,'\
            ' please set "to_float32=True" in "LoadImageFromFile" pipeline'

        # Brightness: additive shift, applied in place on the input array.
        if random.randint(2):
            img += random.uniform(-self.brightness_delta,
                                  self.brightness_delta)

        # The draw decides whether contrast runs before (1) or after (0)
        # the HSV stage.
        contrast_first = random.randint(2) == 1
        if contrast_first:
            img = self._random_contrast(img)

        img = mmcv.bgr2hsv(img)

        # Saturation lives in channel 1 of the HSV image.
        if random.randint(2):
            img[..., 1] *= random.uniform(self.saturation_lower,
                                          self.saturation_upper)

        # Hue lives in channel 0; wrap it back into [0, 360].
        if random.randint(2):
            hue = img[..., 0]
            hue += random.uniform(-self.hue_delta, self.hue_delta)
            hue[hue > 360] -= 360
            hue[hue < 0] += 360

        img = mmcv.hsv2bgr(img)

        if not contrast_first:
            img = self._random_contrast(img)

        # Optional random permutation of the three color channels.
        if random.randint(2):
            img = img[..., random.permutation(3)]
        return img

    def __call__(self, results):
        """Distort every view stored under ``results['img']``.

        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Result dict with images distorted.
        """
        results['img'] = [self._distort_view(view) for view in results['img']]
        return results

    def __repr__(self):
        pieces = [
            self.__class__.__name__,
            f'(\nbrightness_delta={self.brightness_delta},\n',
            'contrast_range=',
            f'{(self.contrast_lower, self.contrast_upper)},\n',
            'saturation_range=',
            f'{(self.saturation_lower, self.saturation_upper)},\n',
            f'hue_delta={self.hue_delta})',
        ]
        return ''.join(pieces)
Typically keys + is set to some subset of "img", "proposals", "gt_bboxes", + "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". + The "img_meta" item is always populated. The contents of the "img_meta" + dictionary depends on "meta_keys". By default this includes: + - 'img_shape': shape of the image input to the network as a tuple \ + (h, w, c). Note that images may be zero padded on the \ + bottom/right if the batch tensor is larger than this shape. + - 'scale_factor': a float indicating the preprocessing scale + - 'flip': a boolean indicating if image flip transform was used + - 'filename': path to the image file + - 'ori_shape': original shape of the image as a tuple (h, w, c) + - 'pad_shape': image shape after padding + - 'lidar2img': transform from lidar to image + - 'depth2img': transform from depth to image + - 'cam2img': transform from camera to image + - 'pcd_horizontal_flip': a boolean indicating if point cloud is \ + flipped horizontally + - 'pcd_vertical_flip': a boolean indicating if point cloud is \ + flipped vertically + - 'box_mode_3d': 3D box mode + - 'box_type_3d': 3D box type + - 'img_norm_cfg': a dict of normalization information: + - mean: per channel mean subtraction + - std: per channel std divisor + - to_rgb: bool indicating if bgr was converted to rgb + - 'pcd_trans': point cloud transformations + - 'sample_idx': sample index + - 'pcd_scale_factor': point cloud scale factor + - 'pcd_rotation': rotation applied to point cloud + - 'pts_filename': path to point cloud file. + Args: + keys (Sequence[str]): Keys of results to be collected in ``data``. + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. 
@PIPELINES.register_module()
class RandomScaleImageMultiViewImage(object):
    """Resize every view of a multi-view sample by a single scale factor.

    The chosen factor is applied to all images in ``results['img']`` and is
    folded into each ``results['lidar2img']`` matrix. The 4x4 scaling matrix
    itself is stored once per view under ``results['img_aug_matrix']``.

    Args:
        scales (list[float], optional): Candidate scale factors. Exactly one
            entry is accepted.
    """

    def __init__(self, scales=None):
        # ``None`` stands in for the former ``[]`` default so that no list
        # object is shared between instances.
        self.scales = [] if scales is None else scales
        assert len(self.scales) == 1, \
            f'scales must contain exactly one factor, got {self.scales}'

    def __call__(self, results):
        """Rescale the images and update the projection matrices.

        Args:
            results (dict): Result dict from loading pipeline. Reads 'img'
                and 'lidar2img'.
        Returns:
            dict: Updated result dict; 'img', 'lidar2img', 'img_aug_matrix',
                'img_shape' and 'ori_shape' are (over)written.
        """
        rand_ind = np.random.permutation(range(len(self.scales)))[0]
        rand_scale = self.scales[rand_ind]

        # Homogeneous scaling of the two image axes.
        scale_factor = np.eye(4)
        scale_factor[0, 0] *= rand_scale
        scale_factor[1, 1] *= rand_scale

        # mmcv.imresize takes the target size as (width, height).
        results['img'] = [
            mmcv.imresize(
                img,
                (int(img.shape[1] * rand_scale),
                 int(img.shape[0] * rand_scale)),
                return_scale=False)
            for img in results['img']
        ]

        lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']]
        # One independent copy per view, so a later in-place edit of one
        # view's matrix cannot leak into the others.
        img_aug_matrix = [scale_factor.copy() for _ in results['lidar2img']]
        results['lidar2img'] = lidar2img
        results['img_aug_matrix'] = img_aug_matrix
        results['img_shape'] = [img.shape for img in results['img']]
        # NOTE: 'ori_shape' is overwritten with the resized shapes.
        results['ori_shape'] = [img.shape for img in results['img']]

        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(size={self.scales})'
        return repr_str
@SAMPLER.register_module()
class DistributedSampler(_DistributedSampler):
    """Distributed sampler that gives each rank one contiguous index chunk.

    The index list ``0 .. len(dataset) - 1`` is repeated until it reaches
    ``total_size`` and then cut into ``num_replicas`` consecutive chunks;
    rank ``r`` receives chunk ``r``. Only ``shuffle=False`` is implemented.

    Args:
        dataset: Dataset used for sampling.
        num_replicas (int, optional): Number of participating processes.
        rank (int, optional): Rank of the current process.
        shuffle (bool): Must be False for iteration to work.
        seed (int, optional): Stored on the instance; ``None`` becomes 0.
    """

    def __init__(self,
                 dataset=None,
                 num_replicas=None,
                 rank=None,
                 shuffle=True,
                 seed=0):
        super().__init__(
            dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
        # for the compatibility from PyTorch 1.3+
        self.seed = seed if seed is not None else 0

    def __iter__(self):
        """Return an iterator over this rank's indices.

        Raises:
            NotImplementedError: If the sampler was built with
                ``shuffle=True``.
        """
        if self.shuffle:
            # An explicit raise survives ``python -O``; a bare assert would
            # be stripped and the code below would hit an unbound name.
            raise NotImplementedError(
                'DistributedSampler does not support shuffle=True; '
                'build it with shuffle=False.')

        indices = torch.arange(len(self.dataset)).tolist()
        if not indices:
            # Nothing to pad or split; also avoids dividing by zero below.
            return iter(indices)

        # add extra samples to make it evenly divisible
        # in case that indices is shorter than half of total_size
        repeats = math.ceil(self.total_size / len(indices))
        indices = (indices * repeats)[:self.total_size]
        assert len(indices) == self.total_size

        # subsample: consecutive chunk per rank instead of a strided pick
        per_replicas = self.total_size // self.num_replicas
        start = self.rank * per_replicas
        indices = indices[start:start + per_replicas]
        assert len(indices) == self.num_samples

        return iter(indices)
+ """ + + def __init__(self, + dataset, + samples_per_gpu=1, + num_replicas=None, + rank=None, + seed=0): + _rank, _num_replicas = get_dist_info() + if num_replicas is None: + num_replicas = _num_replicas + if rank is None: + rank = _rank + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.seed = seed if seed is not None else 0 + + assert hasattr(self.dataset, 'flag') + self.flag = self.dataset.flag + self.group_sizes = np.bincount(self.flag) + + self.num_samples = 0 + for i, j in enumerate(self.group_sizes): + self.num_samples += int( + math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / + self.num_replicas)) * self.samples_per_gpu + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch + self.seed) + + indices = [] + for i, size in enumerate(self.group_sizes): + if size > 0: + indice = np.where(self.flag == i)[0] + assert len(indice) == size + # add .numpy() to avoid bug when selecting indice in parrots. + # TODO: check whether torch.randperm() can be replaced by + # numpy.random.permutation(). 
def build_sampler(cfg, default_args):
    """Build a sampler from ``cfg`` through the ``SAMPLER`` registry.

    Args:
        cfg (dict): Sampler config, forwarded to ``build_from_cfg``.
        default_args (dict | None): Default arguments forwarded to
            ``build_from_cfg``.
    Returns:
        The object returned by ``build_from_cfg``.
    """
    sampler = build_from_cfg(cfg, SAMPLER, default_args)
    return sampler
def normalize_2d_bbox(bboxes, pc_range):
    """Convert xyxy boxes to cxcywh and scale them by the ``pc_range`` extent.

    Args:
        bboxes (Tensor): Boxes passed to ``bbox_xyxy_to_cxcywh``.
        pc_range (sequence): Range whose entries 0/1 are used as the x/y
            origin and entries 3/4 as the x/y upper bounds.
    Returns:
        Tensor: Boxes as (cx, cy, w, h); the centers are shifted by the
        origin and all four values are divided by the x/y extent.
    """
    x_origin, y_origin = pc_range[0], pc_range[1]
    extent_y = pc_range[4] - y_origin
    extent_x = pc_range[3] - x_origin

    boxes = bbox_xyxy_to_cxcywh(bboxes)
    # Only the centers are translated; width and height are just rescaled.
    boxes[..., 0:1] = boxes[..., 0:1] - x_origin
    boxes[..., 1:2] = boxes[..., 1:2] - y_origin

    scale = bboxes.new_tensor([extent_x, extent_y, extent_x, extent_y])
    return boxes / scale
@BBOX_ASSIGNERS.register_module()
class MapTRAssigner(BaseAssigner):
    """One-to-one Hungarian matching between predictions and ground truth.

    The matching cost is the sum of four terms, each built with
    ``build_match_cost``: a classification cost, a box regression cost, a
    box IoU cost and a point-set cost. Every ground truth supplies several
    point orderings; for each (prediction, gt) pair the cheapest ordering is
    used and its index is returned alongside the assignment.

    After matching, each prediction carries ``0`` (background) or the
    1-based index of its ground truth.

    Args:
        cls_cost (dict): Config of the classification cost.
        reg_cost (dict): Config of the box regression cost.
        iou_cost (dict): Config of the box IoU cost.
        pts_cost (dict): Config of the point-set cost.
        pc_range (sequence, optional): Range handed to
            ``normalize_2d_bbox``, ``normalize_2d_pts`` and
            ``denormalize_2d_bbox``.
    """

    def __init__(self,
                 cls_cost=dict(type='ClassificationCost', weight=1.),
                 reg_cost=dict(type='BBoxL1Cost', weight=1.0),
                 iou_cost=dict(type='IoUCost', weight=0.0),
                 pts_cost=dict(type='ChamferDistance',loss_src_weight=1.0,loss_dst_weight=1.0),
                 pc_range=None):
        self.pc_range = pc_range
        self.cls_cost = build_match_cost(cls_cost)
        self.reg_cost = build_match_cost(reg_cost)
        self.iou_cost = build_match_cost(iou_cost)
        self.pts_cost = build_match_cost(pts_cost)

    def assign(self,
               bbox_pred,
               cls_pred,
               pts_pred,
               gt_bboxes,
               gt_labels,
               gt_pts,
               gt_bboxes_ignore=None,
               eps=1e-7):
        """Match each prediction to a ground truth or to background.

        Steps:
        1. mark every prediction as -1 (don't care)
        2. compute the summed matching cost
        3. run Hungarian matching on CPU
        4. set everything to 0 (background), then write the 1-based gt index
           and the gt label for every matched prediction

        Args:
            bbox_pred (Tensor): Predicted boxes, last dimension of size 4.
            cls_pred (Tensor): Predicted classification scores, passed to
                the classification cost.
            pts_pred (Tensor): Predicted points; dimension 1 is the number
                of points per predicted line.
            gt_bboxes (Tensor): Ground-truth boxes, normalized with
                ``normalize_2d_bbox`` for the regression cost.
            gt_labels (Tensor): Labels of the ground truths.
            gt_pts (Tensor): 4-D ground-truth points; dimension 1 is the
                number of orderings, dimension 2 the points per line.
            gt_bboxes_ignore (Tensor, optional): Must be None.
            eps (int | float, optional): Unused here.
        Returns:
            tuple: ``(AssignResult, order_index)``. ``order_index`` holds,
            per (prediction, gt) pair, the index of the cheapest ordering;
            it is None when there are no ground truths or no predictions.
        """
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
        assert bbox_pred.shape[-1] == 4, \
            'Only support bbox pred shape is 4 dims'
        num_gts = gt_bboxes.size(0)
        num_bboxes = bbox_pred.size(0)

        # Step 1: everything starts as "don't care".
        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
                                              -1,
                                              dtype=torch.long)
        assigned_labels = bbox_pred.new_full((num_bboxes, ),
                                             -1,
                                             dtype=torch.long)

        # Degenerate input: nothing to match.
        if num_gts == 0:
            # Without ground truth every prediction is background.
            assigned_gt_inds[:] = 0
        if num_gts == 0 or num_bboxes == 0:
            empty_result = AssignResult(
                num_gts, assigned_gt_inds, None, labels=assigned_labels)
            return empty_result, None

        # Step 2: individual cost terms.
        cls_term = self.cls_cost(cls_pred, gt_labels)

        gt_boxes_norm = normalize_2d_bbox(gt_bboxes, self.pc_range)
        reg_term = self.reg_cost(bbox_pred[:, :4], gt_boxes_norm[:, :4])

        _, num_orders, gt_line_len, _ = gt_pts.shape
        gt_pts_norm = normalize_2d_pts(gt_pts, self.pc_range)
        if pts_pred.size(1) == gt_line_len:
            pred_pts = pts_pred
        else:
            # Resample the predicted lines to the gt point count.
            resampled = F.interpolate(pts_pred.permute(0, 2, 1),
                                      size=gt_line_len,
                                      mode='linear',
                                      align_corners=True)
            pred_pts = resampled.permute(0, 2, 1).contiguous()

        # One cost per ordering; keep the cheapest and remember which it was.
        per_order_cost = self.pts_cost(pred_pts, gt_pts_norm)
        per_order_cost = per_order_cost.view(num_bboxes, num_gts, num_orders)
        pts_term, order_index = torch.min(per_order_cost, 2)

        pred_boxes = denormalize_2d_bbox(bbox_pred, self.pc_range)
        iou_term = self.iou_cost(pred_boxes, gt_bboxes)

        total_cost = cls_term + reg_term + iou_term + pts_term

        # Step 3: Hungarian matching with scipy on CPU.
        total_cost = total_cost.detach().cpu()
        if linear_sum_assignment is None:
            raise ImportError('Please run "pip install scipy" '
                              'to install scipy first.')
        rows, cols = linear_sum_assignment(total_cost)
        rows = torch.from_numpy(rows).to(bbox_pred.device)
        cols = torch.from_numpy(cols).to(bbox_pred.device)

        # Step 4: background first, then the matched foreground entries.
        assigned_gt_inds[:] = 0
        assigned_gt_inds[rows] = cols + 1
        assigned_labels[rows] = gt_labels[cols]
        result = AssignResult(
            num_gts, assigned_gt_inds, None, labels=assigned_labels)
        return result, order_index
def normalize_2d_pts(pts, pc_range):
    """Shift and scale 2-D points by the ``pc_range`` origin and extent.

    Args:
        pts (Tensor): Points whose last dimension holds (x, y).
        pc_range (sequence): Range whose entries 0/1 are used as the x/y
            origin and entries 3/4 as the x/y upper bounds.
    Returns:
        Tensor: New tensor ``(pts - origin) / extent``; ``pts`` itself is
        left untouched.
    """
    x_origin, y_origin = pc_range[0], pc_range[1]

    # Work on a copy so the caller's tensor is never modified.
    shifted = pts.clone()
    shifted[..., 0:1] = pts[..., 0:1] - x_origin
    shifted[..., 1:2] = pts[..., 1:2] - y_origin

    extent = pts.new_tensor([pc_range[3] - x_origin, pc_range[4] - y_origin])
    return shifted / extent
+ """ + + def __init__(self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=None, + bev_h=30, + bev_w=30, + num_vec=20, + num_pts_per_vec=2, + num_pts_per_gt_vec=2, + query_embed_type='all_pts', + transform_method='minmax', + gt_shift_pts_pattern='v0', + dir_interval=1, + loss_pts=dict(type='ChamferDistance', + loss_src_weight=1.0, + loss_dst_weight=1.0), + loss_dir=dict(type='PtsDirCosLoss', loss_weight=2.0), + **kwargs): + + self.bev_h = bev_h + self.bev_w = bev_w + self.fp16_enabled = False + + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + self.bev_encoder_type = transformer.encoder.type + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + if 'code_size' in kwargs: + self.code_size = kwargs['code_size'] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - self.pc_range[1] + self.num_cls_fcs = num_cls_fcs - 1 + + + self.query_embed_type = query_embed_type + self.transform_method = transform_method + self.gt_shift_pts_pattern = gt_shift_pts_pattern + num_query = num_vec * num_pts_per_vec + self.num_query = num_query + self.num_vec = num_vec + self.num_pts_per_vec = num_pts_per_vec + self.num_pts_per_gt_vec = num_pts_per_gt_vec + self.dir_interval = dir_interval + + + super(MapTRHead, self).__init__( + *args, transformer=transformer, **kwargs) + self.code_weights = nn.Parameter(torch.tensor( + self.code_weights, requires_grad=False), requires_grad=False) + self.loss_pts = build_loss(loss_pts) + self.loss_dir = build_loss(loss_dir) + num_query = num_vec * num_pts_per_vec + self.num_query = num_query + self.num_vec = num_vec + 
self.num_pts_per_vec = num_pts_per_vec + self.num_pts_per_gt_vec = num_pts_per_gt_vec + self._init_layers() + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + cls_branch = [] + # cls_branch.append(Linear(self.embed_dims * 2, self.embed_dims)) + # cls_branch.append(nn.LayerNorm(self.embed_dims)) + # cls_branch.append(nn.ReLU(inplace=True)) + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + fc_cls = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. 
+ num_pred = (self.transformer.decoder.num_layers + 1) if \ + self.as_two_stage else self.transformer.decoder.num_layers + + if self.with_box_refine: + self.cls_branches = _get_clones(fc_cls, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + else: + self.cls_branches = nn.ModuleList( + [fc_cls for _ in range(num_pred)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(num_pred)]) + + if not self.as_two_stage: + if self.bev_encoder_type == 'BEVFormerEncoder': + self.bev_embedding = nn.Embedding( + self.bev_h * self.bev_w, self.embed_dims) + else: + self.bev_embedding = None + if self.query_embed_type == 'all_pts': + self.query_embedding = nn.Embedding(self.num_query, + self.embed_dims * 2) + elif self.query_embed_type == 'instance_pts': + self.query_embedding = None + self.instance_embedding = nn.Embedding(self.num_vec, self.embed_dims * 2) + self.pts_embedding = nn.Embedding(self.num_pts_per_vec, self.embed_dims * 2) + + def init_weights(self): + """Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + # for m in self.reg_branches: + # constant_init(m[-1], 0, bias=0) + # nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], 0.) + + # @auto_fp16(apply_to=('mlvl_feats')) + @force_fp32(apply_to=('mlvl_feats', 'prev_bev')) + def forward(self, mlvl_feats, lidar_feat, img_metas, prev_bev=None, only_bev=False): + """Forward function. + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 5D-tensor with shape + (B, N, C, H, W). + prev_bev: previous bev featues + only_bev: only compute BEV features with encoder. + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. 
+ all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + """ + + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + # import pdb;pdb.set_trace() + if self.query_embed_type == 'all_pts': + object_query_embeds = self.query_embedding.weight.to(dtype) + elif self.query_embed_type == 'instance_pts': + pts_embeds = self.pts_embedding.weight.unsqueeze(0) + instance_embeds = self.instance_embedding.weight.unsqueeze(1) + object_query_embeds = (pts_embeds + instance_embeds).flatten(0, 1).to(dtype) + if self.bev_embedding is not None: + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + else: + bev_queries = None + bev_mask = None + bev_pos = None + + if only_bev: # only use encoder to obtain BEV features, TODO: refine the workaround + return self.transformer.get_bev_features( + mlvl_feats, + lidar_feat, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + img_metas=img_metas, + prev_bev=prev_bev, + ) + else: + outputs = self.transformer( + mlvl_feats, + lidar_feat, + bev_queries, + object_query_embeds, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None, + img_metas=img_metas, + prev_bev=prev_bev + ) + + bev_embed, hs, init_reference, inter_references = outputs + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + outputs_pts_coords = [] + for lvl in range(hs.shape[0]): + if lvl == 0: + # import pdb;pdb.set_trace() + reference = init_reference + else: + reference = 
inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + # import pdb;pdb.set_trace() + # vec_embedding = hs[lvl].reshape(bs, self.num_vec, -1) + outputs_class = self.cls_branches[lvl](hs[lvl] + .view(bs,self.num_vec, self.num_pts_per_vec,-1) + .mean(2)) + tmp = self.reg_branches[lvl](hs[lvl]) + + # TODO: check the shape of reference + assert reference.shape[-1] == 2 + tmp[..., 0:2] += reference[..., 0:2] + # tmp[..., 0:2] = tmp[..., 0:2].sigmoid() + tmp = tmp.sigmoid() # cx,cy,w,h + # import pdb;pdb.set_trace() + # tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - + # self.pc_range[0]) + self.pc_range[0]) + # tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - + # self.pc_range[1]) + self.pc_range[1]) + # tmp = tmp.reshape(bs, self.num_vec,-1) + # TODO: check if using sigmoid + outputs_coord, outputs_pts_coord = self.transform_box(tmp) + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_pts_coords.append(outputs_pts_coord) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + outputs_pts_coords = torch.stack(outputs_pts_coords) + outs = { + 'bev_embed': bev_embed, + 'all_cls_scores': outputs_classes, + 'all_bbox_preds': outputs_coords, + 'all_pts_preds': outputs_pts_coords, + 'enc_cls_scores': None, + 'enc_bbox_preds': None, + 'enc_pts_preds': None + } + + return outs + def transform_box(self, pts, y_first=False): + """ + Converting the points set into bounding box. + + Args: + pts: the input points sets (fields), each points + set (fields) is represented as 2n scalar. + y_first: if y_fisrt=True, the point set is represented as + [y1, x1, y2, x2 ... yn, xn], otherwise the point set is + represented as [x1, y1, x2, y2 ... xn, yn]. + Returns: + The bbox [cx, cy, w, h] transformed from points. 
+ """ + pts_reshape = pts.view(pts.shape[0], self.num_vec, + self.num_pts_per_vec,2) + pts_y = pts_reshape[:, :, :, 0] if y_first else pts_reshape[:, :, :, 1] + pts_x = pts_reshape[:, :, :, 1] if y_first else pts_reshape[:, :, :, 0] + if self.transform_method == 'minmax': + # import pdb;pdb.set_trace() + + xmin = pts_x.min(dim=2, keepdim=True)[0] + xmax = pts_x.max(dim=2, keepdim=True)[0] + ymin = pts_y.min(dim=2, keepdim=True)[0] + ymax = pts_y.max(dim=2, keepdim=True)[0] + bbox = torch.cat([xmin, ymin, xmax, ymax], dim=2) + bbox = bbox_xyxy_to_cxcywh(bbox) + else: + raise NotImplementedError + return bbox, pts_reshape + def _get_target_single(self, + cls_score, + bbox_pred, + pts_pred, + gt_labels, + gt_bboxes, + gt_shifts_pts, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. 
+ """ + # import pdb;pdb.set_trace() + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_c = gt_bboxes.shape[-1] + # import pdb;pdb.set_trace() + assign_result, order_index = self.assigner.assign(bbox_pred, cls_score, pts_pred, + gt_bboxes, gt_labels, gt_shifts_pts, + gt_bboxes_ignore) + + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + # pts_sampling_result = self.sampler.sample(assign_result, pts_pred, + # gt_pts) + + + # import pdb;pdb.set_trace() + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # pts targets + # import pdb;pdb.set_trace() + # pts_targets = torch.zeros_like(pts_pred) + # num_query, num_order, num_points, num_coords + if order_index is None: + # import pdb;pdb.set_trace() + assigned_shift = gt_labels[sampling_result.pos_assigned_gt_inds] + else: + assigned_shift = order_index[sampling_result.pos_inds, sampling_result.pos_assigned_gt_inds] + pts_targets = pts_pred.new_zeros((pts_pred.size(0), + pts_pred.size(1), pts_pred.size(2))) + pts_weights = torch.zeros_like(pts_targets) + pts_weights[pos_inds] = 1.0 + + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + pts_targets[pos_inds] = gt_shifts_pts[sampling_result.pos_assigned_gt_inds,assigned_shift,:,:] + return (labels, label_weights, bbox_targets, bbox_weights, + pts_targets, pts_weights, + pos_inds, neg_inds) + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + pts_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_shifts_pts_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch 
image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' 
+ num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pts_targets_list, pts_weights_list, + pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list,pts_preds_list, + gt_labels_list, gt_bboxes_list, gt_shifts_pts_list, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pts_targets_list, pts_weights_list, + num_total_pos, num_total_neg) + + def loss_single(self, + cls_scores, + bbox_preds, + pts_preds, + gt_bboxes_list, + gt_labels_list, + gt_shifts_pts_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_pts_list (list[Tensor]): Ground truth pts for each image + with shape (num_gts, fixed_num, 2) in [x,y] format. + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + pts_preds_list = [pts_preds[i] for i in range(num_imgs)] + # import pdb;pdb.set_trace() + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,pts_preds_list, + gt_bboxes_list, gt_labels_list,gt_shifts_pts_list, + gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + pts_targets_list, pts_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + # import pdb;pdb.set_trace() + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + pts_targets = torch.cat(pts_targets_list, 0) + pts_weights = torch.cat(pts_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # import pdb;pdb.set_trace() + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_2d_bbox(bbox_targets, self.pc_range) + # normalized_bbox_targets = bbox_targets + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.code_weights + + loss_bbox = self.loss_bbox( + bbox_preds[isnotnan, :4], 
normalized_bbox_targets[isnotnan, + :4], bbox_weights[isnotnan, :4], + avg_factor=num_total_pos) + + # regression pts CD loss + # pts_preds = pts_preds + # import pdb;pdb.set_trace() + + # num_samples, num_order, num_pts, num_coords + normalized_pts_targets = normalize_2d_pts(pts_targets, self.pc_range) + + # num_samples, num_pts, num_coords + pts_preds = pts_preds.reshape(-1, pts_preds.size(-2),pts_preds.size(-1)) + if self.num_pts_per_vec != self.num_pts_per_gt_vec: + pts_preds = pts_preds.permute(0,2,1) + pts_preds = F.interpolate(pts_preds, size=(self.num_pts_per_gt_vec), mode='linear', + align_corners=True) + pts_preds = pts_preds.permute(0,2,1).contiguous() + + # import pdb;pdb.set_trace() + loss_pts = self.loss_pts( + pts_preds[isnotnan,:,:], normalized_pts_targets[isnotnan, + :,:], + pts_weights[isnotnan,:,:], + avg_factor=num_total_pos) + dir_weights = pts_weights[:, :-self.dir_interval,0] + denormed_pts_preds = denormalize_2d_pts(pts_preds, self.pc_range) + denormed_pts_preds_dir = denormed_pts_preds[:,self.dir_interval:,:] - denormed_pts_preds[:,:-self.dir_interval,:] + pts_targets_dir = pts_targets[:, self.dir_interval:,:] - pts_targets[:,:-self.dir_interval,:] + # dir_weights = pts_weights[:, indice,:-1,0] + # import pdb;pdb.set_trace() + loss_dir = self.loss_dir( + denormed_pts_preds_dir[isnotnan,:,:], pts_targets_dir[isnotnan, + :,:], + dir_weights[isnotnan,:], + avg_factor=num_total_pos) + + bboxes = denormalize_2d_bbox(bbox_preds, self.pc_range) + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou( + bboxes[isnotnan, :4], bbox_targets[isnotnan, :4], bbox_weights[isnotnan, :4], + avg_factor=num_total_pos) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + loss_iou = torch.nan_to_num(loss_iou) + loss_pts = torch.nan_to_num(loss_pts) + loss_dir = torch.nan_to_num(loss_dir) + return loss_cls, loss_bbox, loss_iou, loss_pts, loss_dir + + 
@force_fp32(apply_to=('preds_dicts')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + preds_dicts, + gt_bboxes_ignore=None, + img_metas=None): + """"Loss function. + Args: + + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. + enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' 
+ gt_vecs_list = copy.deepcopy(gt_bboxes_list) + # import pdb;pdb.set_trace() + all_cls_scores = preds_dicts['all_cls_scores'] + all_bbox_preds = preds_dicts['all_bbox_preds'] + all_pts_preds = preds_dicts['all_pts_preds'] + enc_cls_scores = preds_dicts['enc_cls_scores'] + enc_bbox_preds = preds_dicts['enc_bbox_preds'] + enc_pts_preds = preds_dicts['enc_pts_preds'] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + + # gt_bboxes_list = [torch.cat( + # (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), + # dim=1).to(device) for gt_bboxes in gt_bboxes_list] + # import pdb;pdb.set_trace() + # gt_bboxes_list = [ + # gt_bboxes.to(device) for gt_bboxes in gt_bboxes_list] + gt_bboxes_list = [ + gt_bboxes.bbox.to(device) for gt_bboxes in gt_vecs_list] + # gt_pts_list = [ + # gt_bboxes.fixed_num_sampled_points.to(device) for gt_bboxes in gt_vecs_list] #3 + if self.gt_shift_pts_pattern == 'v0': + gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points.to(device) for gt_bboxes in gt_vecs_list] + elif self.gt_shift_pts_pattern == 'v1': + gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v1.to(device) for gt_bboxes in gt_vecs_list] + elif self.gt_shift_pts_pattern == 'v2': + gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v2.to(device) for gt_bboxes in gt_vecs_list] + elif self.gt_shift_pts_pattern == 'v3': + gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v3.to(device) for gt_bboxes in gt_vecs_list] + elif self.gt_shift_pts_pattern == 'v4': + gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v4.to(device) for gt_bboxes in gt_vecs_list] + elif self.gt_shift_pts_pattern == 'v5': + gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v5.to(device) for gt_bboxes in gt_vecs_list] #3 + else: + raise NotImplementedError + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + # 
all_gt_pts_list = [gt_pts_list for _ in range(num_dec_layers)] #3 + all_gt_shifts_pts_list = [gt_shifts_pts_list for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + # import pdb;pdb.set_trace() + losses_cls, losses_bbox, losses_iou, losses_pts, losses_dir = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds,all_pts_preds, + all_gt_bboxes_list, all_gt_labels_list,all_gt_shifts_pts_list, + all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + gt_pts_list = [ + gt_bboxes.fixed_num_sampled_points.to(device) for gt_bboxes in gt_vecs_list] #3 + binary_labels_list = [ + torch.zeros_like(gt_labels_list[i]) + for i in range(len(all_gt_labels_list)) + ] + # TODO bug here + enc_loss_cls, enc_losses_bbox, enc_losses_iou, enc_losses_pts, enc_losses_dir = \ + self.loss_single(enc_cls_scores, enc_bbox_preds, enc_pts_preds, + gt_bboxes_list, binary_labels_list, gt_pts_list,gt_bboxes_ignore) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + loss_dict['enc_losses_iou'] = enc_losses_iou + loss_dict['enc_losses_pts'] = enc_losses_pts + loss_dict['enc_losses_dir'] = enc_losses_dir + + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + loss_dict['loss_iou'] = losses_iou[-1] + loss_dict['loss_pts'] = losses_pts[-1] + loss_dict['loss_dir'] = losses_dir[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i, loss_iou_i, loss_pts_i, loss_dir_i in zip(losses_cls[:-1], + losses_bbox[:-1], + losses_iou[:-1], + losses_pts[:-1], + losses_dir[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i + loss_dict[f'd{num_dec_layer}.loss_pts'] = loss_pts_i + 
loss_dict[f'd{num_dec_layer}.loss_dir'] = loss_dir_i + num_dec_layer += 1 + return loss_dict + + @force_fp32(apply_to=('preds_dicts')) + def get_bboxes(self, preds_dicts, img_metas, rescale=False): + """Generate bboxes from bbox head predictions. + Args: + preds_dicts (tuple[list[dict]]): Prediction results. + img_metas (list[dict]): Point cloud and image's meta info. + Returns: + list[dict]: Decoded bbox, scores and labels after nms. + """ + # bboxes: xmin, ymin, xmax, ymax + preds_dicts = self.bbox_coder.decode(preds_dicts) + + num_samples = len(preds_dicts) + ret_list = [] + for i in range(num_samples): + preds = preds_dicts[i] + bboxes = preds['bboxes'] + # bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + + # code_size = bboxes.shape[-1] + # bboxes = img_metas[i]['box_type_3d'](bboxes, code_size) + scores = preds['scores'] + labels = preds['labels'] + pts = preds['pts'] + + ret_list.append([bboxes, scores, labels, pts]) + + return ret_list + diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/detectors/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..eade3c210c6bcb2d3bc3da78a684ced2be285b0a --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/detectors/__init__.py @@ -0,0 +1 @@ +from .maptr import MapTR \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/detectors/maptr.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/detectors/maptr.py new file mode 100644 index 0000000000000000000000000000000000000000..9c328bb8898f27ded186b044152be22f60271bac --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/detectors/maptr.py @@ -0,0 +1,445 @@ +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.models import DETECTORS +from mmdet3d.core import bbox3d2result +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector 
+from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask +from mmcv.runner import force_fp32, auto_fp16 +from mmdet3d.ops import Voxelization, DynamicScatter +from mmdet3d.models import builder +@DETECTORS.register_module() +class MapTR(MVXTwoStageDetector): + """MapTR. + Args: + video_test_mode (bool): Decide whether to use temporal information during inference. + """ + + def __init__(self, + use_grid_mask=False, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + video_test_mode=False, + modality='vision', + lidar_encoder=None, + ): + + super(MapTR, + self).__init__(pts_voxel_layer, pts_voxel_encoder, + pts_middle_encoder, pts_fusion_layer, + img_backbone, pts_backbone, img_neck, pts_neck, + pts_bbox_head, img_roi_head, img_rpn_head, + train_cfg, test_cfg, pretrained) + self.grid_mask = GridMask( + True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) + self.use_grid_mask = use_grid_mask + self.fp16_enabled = False + + # temporal + self.video_test_mode = video_test_mode + self.prev_frame_info = { + 'prev_bev': None, + 'scene_token': None, + 'prev_pos': 0, + 'prev_angle': 0, + } + self.modality = modality + if self.modality == 'fusion' and lidar_encoder is not None : + if lidar_encoder["voxelize"].get("max_num_points", -1) > 0: + voxelize_module = Voxelization(**lidar_encoder["voxelize"]) + else: + voxelize_module = DynamicScatter(**lidar_encoder["voxelize"]) + self.lidar_modal_extractor = nn.ModuleDict( + { + "voxelize": voxelize_module, + "backbone": builder.build_middle_encoder(lidar_encoder["backbone"]), + } + ) + self.voxelize_reduce = lidar_encoder.get("voxelize_reduce", True) + + + def extract_img_feat(self, img, img_metas, len_queue=None): + """Extract features of images.""" + B = img.size(0) 
+ if img is not None: + + # input_shape = img.shape[-2:] + # # update real input shape of each single img + # for img_meta in img_metas: + # img_meta.update(input_shape=input_shape) + + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.reshape(B * N, C, H, W) + if self.use_grid_mask: + img = self.grid_mask(img) + + img_feats = self.img_backbone(img) + if isinstance(img_feats, dict): + img_feats = list(img_feats.values()) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + img_feats_reshaped = [] + for img_feat in img_feats: + BN, C, H, W = img_feat.size() + if len_queue is not None: + img_feats_reshaped.append(img_feat.view(int(B/len_queue), len_queue, int(BN / B), C, H, W)) + else: + img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W)) + return img_feats_reshaped + + @auto_fp16(apply_to=('img'), out_fp32=True) + def extract_feat(self, img, img_metas=None, len_queue=None): + """Extract features from images and points.""" + + img_feats = self.extract_img_feat(img, img_metas, len_queue=len_queue) + + return img_feats + + + def forward_pts_train(self, + pts_feats, + lidar_feat, + gt_bboxes_3d, + gt_labels_3d, + img_metas, + gt_bboxes_ignore=None, + prev_bev=None): + """Forward function' + Args: + pts_feats (list[torch.Tensor]): Features of point cloud branch + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + img_metas (list[dict]): Meta information of samples. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + prev_bev (torch.Tensor, optional): BEV features of previous frame. + Returns: + dict: Losses of each branch. 
+ """ + + outs = self.pts_bbox_head( + pts_feats, lidar_feat, img_metas, prev_bev) + loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] + losses = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas) + return losses + + def forward_dummy(self, img): + dummy_metas = None + return self.forward_test(img=img, img_metas=[[dummy_metas]]) + + def forward(self, return_loss=True, **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + Note this setting will change the expected inputs. When + `return_loss=True`, img and img_metas are single-nested (i.e. + torch.Tensor and list[dict]), and when `resturn_loss=False`, img and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def obtain_history_bev(self, imgs_queue, img_metas_list): + """Obtain history BEV features iteratively. To save GPU memory, gradients are not calculated. 
+ """ + self.eval() + + with torch.no_grad(): + prev_bev = None + bs, len_queue, num_cams, C, H, W = imgs_queue.shape + imgs_queue = imgs_queue.reshape(bs*len_queue, num_cams, C, H, W) + img_feats_list = self.extract_feat(img=imgs_queue, len_queue=len_queue) + for i in range(len_queue): + img_metas = [each[i] for each in img_metas_list] + if not img_metas[0]['prev_bev_exists']: + prev_bev = None + img_feats = [each_scale[:, i] for each_scale in img_feats_list] + prev_bev = self.pts_bbox_head( + img_feats, None, img_metas, prev_bev, only_bev=True) + self.train() + return prev_bev + + @torch.no_grad() + @force_fp32() + def voxelize(self, points): + feats, coords, sizes = [], [], [] + for k, res in enumerate(points): + ret = self.lidar_modal_extractor["voxelize"](res) + if len(ret) == 3: + # hard voxelize + f, c, n = ret + else: + assert len(ret) == 2 + f, c = ret + n = None + feats.append(f) + coords.append(F.pad(c, (1, 0), mode="constant", value=k)) + if n is not None: + sizes.append(n) + + feats = torch.cat(feats, dim=0) + coords = torch.cat(coords, dim=0) + if len(sizes) > 0: + sizes = torch.cat(sizes, dim=0) + if self.voxelize_reduce: + feats = feats.sum(dim=1, keepdim=False) / sizes.type_as(feats).view( + -1, 1 + ) + feats = feats.contiguous() + + return feats, coords, sizes + @auto_fp16(apply_to=('points'), out_fp32=True) + def extract_lidar_feat(self,points): + feats, coords, sizes = self.voxelize(points) + # voxel_features = self.lidar_modal_extractor["voxel_encoder"](feats, sizes, coords) + batch_size = coords[-1, 0] + 1 + lidar_feat = self.lidar_modal_extractor["backbone"](feats, coords, batch_size, sizes=sizes) + + return lidar_feat + + # @auto_fp16(apply_to=('img', 'points')) + @force_fp32(apply_to=('img','points','prev_bev')) + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + img_depth=None, + img_mask=None, + ): 
+ """Forward training function. + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + Returns: + dict: Losses of different branches. + """ + lidar_feat = None + if self.modality == 'fusion': + lidar_feat = self.extract_lidar_feat(points) + + len_queue = img.size(1) + prev_img = img[:, :-1, ...] + img = img[:, -1, ...] 
+ + prev_img_metas = copy.deepcopy(img_metas) + # prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) + # import pdb;pdb.set_trace() + prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) if len_queue>1 else None + + img_metas = [each[len_queue-1] for each in img_metas] + if not img_metas[0]['prev_bev_exists']: + prev_bev = None + img_feats = self.extract_feat(img=img, img_metas=img_metas) + losses = dict() + losses_pts = self.forward_pts_train(img_feats, lidar_feat, gt_bboxes_3d, + gt_labels_3d, img_metas, + gt_bboxes_ignore, prev_bev) + + losses.update(losses_pts) + return losses + + def forward_test(self, img_metas, img=None,points=None, **kwargs): + for var, name in [(img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + img = [img] if img is None else img + points = [points] if points is None else points + if img_metas[0][0]['scene_token'] != self.prev_frame_info['scene_token']: + # the first sample of each scene is truncated + self.prev_frame_info['prev_bev'] = None + # update idx + self.prev_frame_info['scene_token'] = img_metas[0][0]['scene_token'] + + # do not use temporal information + if not self.video_test_mode: + self.prev_frame_info['prev_bev'] = None + + # Get the delta of ego position and angle between two timestamps. + tmp_pos = copy.deepcopy(img_metas[0][0]['can_bus'][:3]) + tmp_angle = copy.deepcopy(img_metas[0][0]['can_bus'][-1]) + if self.prev_frame_info['prev_bev'] is not None: + img_metas[0][0]['can_bus'][:3] -= self.prev_frame_info['prev_pos'] + img_metas[0][0]['can_bus'][-1] -= self.prev_frame_info['prev_angle'] + else: + img_metas[0][0]['can_bus'][-1] = 0 + img_metas[0][0]['can_bus'][:3] = 0 + + new_prev_bev, bbox_results = self.simple_test( + img_metas[0], img[0], points[0], prev_bev=self.prev_frame_info['prev_bev'], **kwargs) + # During inference, we save the BEV features and ego motion of each timestamp. 
def pred2result(self, bboxes, scores, labels, pts, attrs=None):
    """Pack the predictions of one sample into a dict of CPU-resident values.

    Args:
        bboxes: Predicted boxes; only needs to support ``.to('cpu')``.
        scores (torch.Tensor): Prediction scores.
        labels (torch.Tensor): Predicted labels.
        pts: Predicted points; only needs to support ``.to('cpu')``.
        attrs (torch.Tensor, optional): Extra attributes. Defaults to None.

    Returns:
        dict: Keys ``boxes_3d``, ``scores_3d``, ``labels_3d`` and ``pts_3d``,
        plus ``attrs_3d`` when ``attrs`` is given. Every value has been moved
        to the CPU.
    """
    packed = {
        'boxes_3d': bboxes.to('cpu'),
        'scores_3d': scores.cpu(),
        'labels_3d': labels.cpu(),
        'pts_3d': pts.to('cpu'),
    }
    if attrs is None:
        return packed

    packed['attrs_3d'] = attrs.cpu()
    return packed
@DETECTORS.register_module()
class MapTR_fp16(MapTR):
    """FP16-friendly variant of :class:`MapTR`.

    The default BEVFormer-style detector cannot run in FP16, so this subclass
    forces ``img``, ``points`` and ``prev_bev`` to FP32 in ``forward_train``
    and exposes ``val_step`` as a way to compute ``prev_bev`` only.
    """

    @force_fp32(apply_to=('img', 'points', 'prev_bev'))
    def forward_train(self,
                      points=None,
                      img_metas=None,
                      gt_bboxes_3d=None,
                      gt_labels_3d=None,
                      gt_labels=None,
                      gt_bboxes=None,
                      img=None,
                      proposals=None,
                      gt_bboxes_ignore=None,
                      img_depth=None,
                      img_mask=None,
                      prev_bev=None,
                      ):
        """Forward training function.

        Args:
            points (list[torch.Tensor], optional): Points of each sample.
                Only used when ``self.modality == 'fusion'``.
                Defaults to None.
            img_metas (list[dict], optional): Meta information of each sample.
                Defaults to None.
            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
                Ground truth 3D boxes. Defaults to None.
            gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
                of 3D boxes. Defaults to None.
            gt_labels (list[torch.Tensor], optional): Unused here.
            gt_bboxes (list[torch.Tensor], optional): Unused here.
            img (torch.Tensor, optional): Images of each sample.
                Defaults to None.
            proposals (list[torch.Tensor], optional): Unused here.
            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
                boxes to be ignored. Defaults to None.
            img_depth: Unused here.
            img_mask: Unused here.
            prev_bev (torch.Tensor, optional): BEV features of the previous
                frame, forwarded to the head. Defaults to None.

        Returns:
            dict: Losses of different branches.
        """
        # Mirror MapTR.forward_train: the head expects a ``lidar_feat`` slot
        # right after the image features. Omitting it shifts every following
        # positional argument by one.
        lidar_feat = None
        if self.modality == 'fusion':
            lidar_feat = self.extract_lidar_feat(points)

        img_feats = self.extract_feat(img=img, img_metas=img_metas)
        losses = dict()
        losses_pts = self.forward_pts_train(img_feats, lidar_feat,
                                            gt_bboxes_3d, gt_labels_3d,
                                            img_metas, gt_bboxes_ignore,
                                            prev_bev=prev_bev)
        losses.update(losses_pts)
        return losses

    def val_step(self, data, optimizer):
        """Compute only the BEV features (``prev_bev``) for one batch.

        This is not the standard ``val_step``: it runs the head with
        ``only_bev=True`` and returns whatever the head produces.

        Args:
            data (dict): Must contain ``img`` and ``img_metas``; may contain
                ``prev_bev`` and, in fusion mode, ``points``.
            optimizer: Unused; kept for interface compatibility.

        Returns:
            The head output for ``only_bev=True`` (presumably the BEV
            embedding; confirm against the head implementation).
        """
        img = data['img']
        img_metas = data['img_metas']

        # Same calling convention as simple_test_pts:
        # pts_bbox_head(feats, lidar_feat, img_metas, prev_bev=...).
        lidar_feat = None
        if self.modality == 'fusion':
            lidar_feat = self.extract_lidar_feat(data.get('points', None))

        img_feats = self.extract_feat(img=img, img_metas=img_metas)
        prev_bev = data.get('prev_bev', None)
        prev_bev = self.pts_bbox_head(img_feats, lidar_feat, img_metas,
                                      prev_bev=prev_bev, only_bev=True)
        return prev_bev
def reduce_loss(loss, reduction):
    """Collapse an element-wise loss tensor according to ``reduction``.

    Args:
        loss (Tensor): Element-wise loss tensor.
        reduction (str): One of "none", "mean" or "sum".

    Return:
        Tensor: ``loss`` unchanged for "none", its mean for "mean", its sum
        for "sum".
    """
    # get_enum maps: none -> 0, mean -> 1, sum -> 2
    mode = F._Reduction.get_enum(reduction)
    if mode == 1:
        return loss.mean()
    if mode == 2:
        return loss.sum()
    if mode == 0:
        return loss
    return None
@mmcv.jit(derivate=True, coderize=True)
def custom_weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None):
    """Weight an element-wise loss and reduce it separately per order.

    Args:
        loss (Tensor): Element-wise loss, documented by the file as
            ``(num_sample, num_order, num_pts, num_coords)``. The "mean"
            path permutes four axes, so it requires a 4-D tensor.
        weight (Tensor, optional): Element-wise weights multiplied into
            ``loss``.
        reduction (str): "mean" or "none". Any other value raises.
        avg_factor (float): Normaliser for the "mean" reduction; required.

    Returns:
        Tensor: For "mean", one value per order (sum over the remaining
        axes divided by ``avg_factor``). For "none", the weighted
        element-wise loss.

    Raises:
        ValueError: If ``avg_factor`` is None, or if ``reduction`` is
            neither "mean" nor "none".
    """
    weighted = loss if weight is None else loss * weight

    if avg_factor is None:
        raise ValueError('avg_factor should not be none for OrderedPtsL1Loss')

    if reduction == 'none':
        return weighted
    if reduction != 'mean':
        raise ValueError('avg_factor can not be used with reduction="sum"')

    # Bring the order axis to the front, then sum everything else.
    per_order = weighted.permute(1, 0, 2, 3).contiguous().sum((1, 2, 3))
    return per_order / avg_factor
def custom_weighted_dir_loss(loss_func):
    """Decorate an element-wise loss with weighting and reduction.

    ``loss_func`` must have the signature ``loss_func(pred, target,
    **kwargs)`` and return the unreduced loss. The decorated function has
    the signature ``loss_func(pred, target, weight=None, reduction='mean',
    avg_factor=None, **kwargs)``. It passes the element-wise result to
    ``custom_weight_dir_reduce_loss``, which applies the weight and the
    reduction.

    Args:
        loss_func (callable): Element-wise loss function to wrap.

    Returns:
        callable: The wrapped function, carrying ``loss_func``'s metadata
        via ``functools.wraps``.
    """

    @functools.wraps(loss_func)
    def wrapper(pred,
                target,
                weight=None,
                reduction='mean',
                avg_factor=None,
                **kwargs):
        # Element-wise loss first, then weighting and reduction.
        elementwise = loss_func(pred, target, **kwargs)
        return custom_weight_dir_reduce_loss(
            elementwise, weight, reduction, avg_factor)

    return wrapper
@mmcv.jit(derivate=True, coderize=True)
@custom_weighted_dir_loss
def pts_dir_cos_loss(pred, target):
    """Direction cosine-similarity loss.

    Every (sample, direction) pair is scored with a cosine embedding loss
    against a target label of +1, i.e. predicted and target directions are
    pushed to align.

    Args:
        pred (torch.Tensor): shape [num_samples, num_dir, num_coords]
        target (torch.Tensor): shape [num_samples, num_dir, num_coords]

    Returns:
        torch.Tensor: Element-wise loss of shape [num_samples, num_dir], or
        a zero scalar derived from ``pred`` when ``target`` is empty.
    """
    if target.numel() == 0:
        # Keep the graph connected to pred while contributing nothing.
        return pred.sum() * 0

    num_samples, num_dir, _ = pred.shape
    # All-ones labels: every pair is treated as "should be similar".
    positive = target.new_ones((num_samples, num_dir)).flatten(0)
    flat_loss = F.cosine_embedding_loss(
        pred.flatten(0, 1), target.flatten(0, 1), positive, reduction='none')
    return flat_loss.view(num_samples, num_dir)
@LOSSES.register_module()
class PtsDirCosLoss(nn.Module):
    """Direction cosine-similarity loss module.

    Thin ``nn.Module`` wrapper around ``pts_dir_cos_loss`` that stores the
    default reduction and a scalar loss weight.

    Args:
        reduction (str, optional): The method to reduce the loss.
            Options are "none", "mean" and "sum".
        loss_weight (float, optional): The weight of loss.
    """

    def __init__(self, reduction='mean', loss_weight=1.0):
        super().__init__()
        self.loss_weight = loss_weight
        self.reduction = reduction

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function.

        Args:
            pred (torch.Tensor): The prediction.
            target (torch.Tensor): The learning target of the prediction.
            weight (torch.Tensor, optional): The weight of loss for each
                prediction. Defaults to None.
            avg_factor (int, optional): Average factor that is used to
                average the loss. Defaults to None.
            reduction_override (str, optional): Reduction used instead of
                the one configured at construction. Defaults to None.

        Returns:
            torch.Tensor: ``loss_weight`` times the reduced direction loss.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = reduction_override or self.reduction
        raw_loss = pts_dir_cos_loss(
            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
        return self.loss_weight * raw_loss
+ loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(PtsL1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # import pdb;pdb.set_trace() + loss_bbox = self.loss_weight * pts_l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox + +@LOSSES.register_module() +class OrderedPtsL1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(OrderedPtsL1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. 
@MATCH_COST.register_module()
class OrderedPtsSmoothL1Cost(object):
    """Smooth-L1 matching cost between predictions and every GT ordering.

    Args:
        weight (int | float, optional): loss_weight
    """

    def __init__(self, weight=1.):
        self.weight = weight

    def __call__(self, bbox_pred, gt_bboxes):
        """Compute the weighted pairwise smooth-L1 cost.

        Args:
            bbox_pred (Tensor): Predicted points with normalized
                coordinates (x, y). Shape [num_query, num_pts, 2].
            gt_bboxes (Tensor): Ground truth points with normalized
                coordinates (x, y).
                Shape [num_gt, num_ordered, num_pts, 2].

        Returns:
            torch.Tensor: Cost of shape [num_query, num_gt * num_ordered],
            already multiplied by ``weight``.
        """
        num_gts, num_orders, _, _ = gt_bboxes.shape
        num_targets = num_gts * num_orders

        # Flatten each polyline to one vector and tile both sides so that
        # every query is paired with every (gt, order) candidate.
        pred_flat = bbox_pred.view(bbox_pred.size(0), -1)
        pred_tiled = pred_flat.unsqueeze(1).repeat(1, num_targets, 1)
        num_query = pred_tiled.size(0)

        gt_flat = gt_bboxes.flatten(2).view(num_targets, -1)
        gt_tiled = gt_flat.unsqueeze(0).repeat(num_query, 1, 1)

        pair_cost = smooth_l1_loss(pred_tiled, gt_tiled, reduction='none')
        return pair_cost.sum(-1) * self.weight
@MATCH_COST.register_module()
class OrderedPtsL1Cost(object):
    """L1 matching cost between predictions and every GT ordering.

    Args:
        weight (int | float, optional): loss_weight
    """

    def __init__(self, weight=1.):
        self.weight = weight

    def __call__(self, bbox_pred, gt_bboxes):
        """Compute the weighted pairwise L1 cost.

        Args:
            bbox_pred (Tensor): Predicted points with normalized
                coordinates (x, y). Shape [num_query, num_pts, 2].
            gt_bboxes (Tensor): Ground truth points with normalized
                coordinates (x, y).
                Shape [num_gt, num_ordered, num_pts, 2].

        Returns:
            torch.Tensor: Cost of shape [num_query, num_gt * num_ordered],
            already multiplied by ``weight``.
        """
        num_gts, num_orders, _, _ = gt_bboxes.shape

        # One flat coordinate vector per query / per (gt, order) candidate.
        pred_flat = bbox_pred.view(bbox_pred.size(0), -1)
        gt_flat = gt_bboxes.flatten(2).view(num_gts * num_orders, -1)

        l1_cost = torch.cdist(pred_flat, gt_flat, p=1)
        return l1_cost * self.weight
@mmcv.jit(derivate=True, coderize=True)
def chamfer_distance(src,
                     dst,
                     src_weight=1.0,
                     dst_weight=1.0,
                     reduction='mean',
                     avg_factor=None):
    """Calculate Chamfer Distance of two sets.

    Pairwise distances come from ``torch.cdist`` with its default norm.

    Args:
        src (torch.Tensor): Source set with shape [B, N, C].
        dst (torch.Tensor): Destination set with shape [B, M, C].
        src_weight (torch.Tensor or float): Weight of source loss.
        dst_weight (torch.Tensor or float): Weight of destination loss.
        reduction (str): 'none', 'sum' or 'mean'. Without ``avg_factor``,
            'none' is rejected. With ``avg_factor``, 'sum' is rejected and
            'none' returns the weighted per-point distances.
        avg_factor (float, optional): Normaliser used with 'mean'.

    Returns:
        tuple: ``(loss_src, loss_dst, indices1, indices2)``.

            - loss_src: min distance from each source point to destination.
            - loss_dst: min distance from each destination point to source.
            - indices1: index of the nearest destination point per source
              point, shape (B, N).
            - indices2: index of the nearest source point per destination
              point, shape (B, M).

    Raises:
        ValueError: For the rejected reduction / avg_factor combinations.
    """
    pairwise = torch.cdist(src, dst)
    nearest_dst, indices1 = torch.min(pairwise, dim=2)  # (B, N)
    nearest_src, indices2 = torch.min(pairwise, dim=1)  # (B, M)

    # TODO: this may be wrong for misaligned src_weight, now [N, fixed_num];
    # it should be [N], then view.
    loss_src = nearest_dst * src_weight
    loss_dst = nearest_src * dst_weight

    if avg_factor is not None:
        if reduction == 'mean':
            denom = avg_factor + torch.finfo(torch.float32).eps
            loss_src = loss_src.mean(-1).sum() / denom
            loss_dst = loss_dst.mean(-1).sum() / denom
        elif reduction != 'none':
            raise ValueError('avg_factor can not be used with reduction="sum"')
        return loss_src, loss_dst, indices1, indices2

    reduction_enum = F._Reduction.get_enum(reduction)
    if reduction_enum == 0:
        raise ValueError('MyCDLoss can not be used with reduction=`none`')
    if reduction_enum == 1:
        loss_src = loss_src.mean(-1).mean()
        loss_dst = loss_dst.mean(-1).mean()
    elif reduction_enum == 2:
        loss_src = loss_src.mean(-1).sum()
        loss_dst = loss_dst.mean(-1).sum()
    else:
        raise NotImplementedError

    return loss_src, loss_dst, indices1, indices2
+ """ + + def __init__(self, + # mode='l1', + reduction='mean', + loss_src_weight=1.0, + loss_dst_weight=1.0): + super(MyChamferDistance, self).__init__() + + # assert mode in ['smooth_l1', 'l1', 'l2'] + assert reduction in ['none', 'sum', 'mean'] + # self.mode = mode + self.reduction = reduction + self.loss_src_weight = loss_src_weight + self.loss_dst_weight = loss_dst_weight + + def forward(self, + source, + target, + src_weight=1.0, + dst_weight=1.0, + avg_factor=None, + reduction_override=None, + return_indices=False, + **kwargs): + """Forward function of loss calculation. + + Args: + source (torch.Tensor): Source set with shape [B, N, C] to + calculate Chamfer Distance. + target (torch.Tensor): Destination set with shape [B, M, C] to + calculate Chamfer Distance. + src_weight (torch.Tensor | float, optional): + Weight of source loss. Defaults to 1.0. + dst_weight (torch.Tensor | float, optional): + Weight of destination loss. Defaults to 1.0. + reduction_override (str, optional): Method to reduce losses. + The valid reduction method are 'none', 'sum' or 'mean'. + Defaults to None. + return_indices (bool, optional): Whether to return indices. + Defaults to False. + + Returns: + tuple[torch.Tensor]: If ``return_indices=True``, return losses of \ + source and target with their corresponding indices in the \ + order of ``(loss_source, loss_target, indices1, indices2)``. \ + If ``return_indices=False``, return \ + ``(loss_source, loss_target)``. 
def build_fuser(cfg):
    """Build a fuser module from ``cfg`` through the ``FUSERS`` registry.

    Args:
        cfg: Config passed unchanged to ``FUSERS.build``.

    Returns:
        Whatever ``FUSERS.build`` constructs for ``cfg``.
    """
    fuser = FUSERS.build(cfg)
    return fuser
b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/decoder.py @@ -0,0 +1,84 @@ +import torch +from mmcv.cnn.bricks.registry import TRANSFORMER_LAYER_SEQUENCE +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +from mmdet.models.utils.transformer import inverse_sigmoid + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class MapTRDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(MapTRDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + *args, + reference_points=None, + reg_branches=None, + key_padding_mask=None, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
+ """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + + reference_points_input = reference_points[..., :2].unsqueeze( + 2) # BS NUM_QUERY NUM_LEVEL 2 + output = layer( + output, + *args, + reference_points=reference_points_input, + key_padding_mask=key_padding_mask, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + + assert reference_points.shape[-1] == 2 + + new_reference_points = torch.zeros_like(reference_points) + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points[..., :2]) + # new_reference_points[..., 2:3] = tmp[ + # ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) + + new_reference_points = new_reference_points.sigmoid() + + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/encoder.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..37dca1da112d72dc7a6c13eb00211d64e2c1e9da --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/encoder.py @@ -0,0 +1,355 @@ +import torch +import numpy as np +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +import torch.nn as nn +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmdet3d.ops import bev_pool +from mmcv.runner import force_fp32, auto_fp16 + +def gen_dx_bx(xbound, ybound, zbound): + dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]]) + bx = torch.Tensor([row[0] + 
def gen_dx_bx(xbound, ybound, zbound):
    """Derive voxel-grid descriptors from per-axis ``[lower, upper, step]``.

    Returns:
        tuple: ``(dx, bx, nx)`` tensors of length 3 holding, per axis, the
        step, the centre of the first cell (``lower + step / 2``) and the
        number of cells (``int((upper - lower) / step)``).
    """
    bounds = (xbound, ybound, zbound)
    dx = torch.Tensor([row[2] for row in bounds])
    bx = torch.Tensor([row[0] + row[2] / 2.0 for row in bounds])
    nx = torch.Tensor([int((row[1] - row[0]) / row[2]) for row in bounds])
    return dx, bx, nx


@TRANSFORMER_LAYER_SEQUENCE.register_module()
class BaseTransform(BaseModule):
    """Base class that lifts per-camera features onto a pooled voxel grid.

    Subclasses must implement ``get_cam_feats``.

    Args:
        in_channels: Stored as ``self.in_channels``.
        out_channels: Stored as ``self.C``.
        feat_down_sample (int): Ratio between image height and feature-map
            height (checked in ``create_frustum``).
        pc_range (Sequence[float]): ``[x_min, y_min, z_min, x_max, y_max,
            z_max]`` as indexed by the constructor.
        voxel_size (Sequence[float]): Per-axis step ``[x, y, z]``.
        dbound (Sequence[float]): ``[start, stop, step]`` handed to
            ``torch.arange`` to generate depth bins.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        feat_down_sample,
        pc_range,
        voxel_size,
        dbound,
    ):
        super(BaseTransform, self).__init__()
        self.in_channels = in_channels
        self.feat_down_sample = feat_down_sample
        self.xbound = [pc_range[0], pc_range[3], voxel_size[0]]
        self.ybound = [pc_range[1], pc_range[4], voxel_size[1]]
        self.zbound = [pc_range[2], pc_range[5], voxel_size[2]]
        self.dbound = dbound

        dx, bx, nx = gen_dx_bx(self.xbound, self.ybound, self.zbound)
        # Frozen parameters so they follow the module across devices.
        self.dx = nn.Parameter(dx, requires_grad=False)
        self.bx = nn.Parameter(bx, requires_grad=False)
        self.nx = nn.Parameter(nx, requires_grad=False)

        self.C = out_channels
        # Built lazily on first use, once the feature-map size is known.
        self.frustum = None
        self.D = int((dbound[1] - dbound[0]) / dbound[2])
        self.fp16_enabled = False

    @force_fp32()
    def create_frustum(self, fH, fW, img_metas):
        """Build a ``(D, fH, fW, 3)`` grid of ``(x_pixel, y_pixel, depth)``.

        The image size is read from ``img_metas[0]['img_shape'][0]``; pixel
        coordinates are spread evenly over ``[0, size - 1]``.
        """
        iH = img_metas[0]['img_shape'][0][0]
        iW = img_metas[0]['img_shape'][0][1]
        assert iH // self.feat_down_sample == fH

        ds = (
            torch.arange(*self.dbound, dtype=torch.float)
            .view(-1, 1, 1)
            .expand(-1, fH, fW)
        )
        D, _, _ = ds.shape

        xs = (
            torch.linspace(0, iW - 1, fW, dtype=torch.float)
            .view(1, 1, fW)
            .expand(D, fH, fW)
        )
        ys = (
            torch.linspace(0, iH - 1, fH, dtype=torch.float)
            .view(1, fH, 1)
            .expand(D, fH, fW)
        )

        return torch.stack((xs, ys, ds), -1)

    def _get_frustum(self, fH, fW, img_metas, device):
        """Return the cached frustum on ``device``, (re)building if needed."""
        # Rebuild when nothing is cached yet or the feature-map size
        # changed; a stale grid would no longer match the camera features.
        if (self.frustum is None
                or tuple(self.frustum.shape[1:3]) != (fH, fW)):
            self.frustum = self.create_frustum(fH, fW, img_metas)
        self.frustum = self.frustum.to(device)
        return self.frustum

    @force_fp32()
    def get_geometry_v1(
        self,
        fH,
        fW,
        rots,
        trans,
        intrins,
        post_rots,
        post_trans,
        lidar2ego_rots,
        lidar2ego_trans,
        img_metas,
        **kwargs,
    ):
        """Map frustum points through the camera chain into the lidar frame.

        Steps: undo the image augmentation (``post_rots``/``post_trans``),
        un-project with the inverse intrinsics, apply camera-to-ego
        (``rots``/``trans``), then invert lidar-to-ego. Optional
        ``extra_rots`` / ``extra_trans`` keyword arguments are applied last.

        Returns:
            Tensor: Points of shape ``(B, N, D, fH, fW, 3)``.
        """
        B, N, _ = trans.shape
        frustum = self._get_frustum(fH, fW, img_metas, trans.device)

        # undo post-transformation
        # B x N x D x H x W x 3
        points = frustum - post_trans.view(B, N, 1, 1, 1, 3)
        points = (
            torch.inverse(post_rots)
            .view(B, N, 1, 1, 1, 3, 3)
            .matmul(points.unsqueeze(-1))
        )
        # cam_to_ego: scale (u, v) by depth before applying K^-1
        points = torch.cat(
            (
                points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3],
                points[:, :, :, :, :, 2:3],
            ),
            5,
        )
        combine = rots.matmul(torch.inverse(intrins))
        points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1)
        points += trans.view(B, N, 1, 1, 1, 3)
        # ego_to_lidar
        points -= lidar2ego_trans.view(B, 1, 1, 1, 1, 3)
        points = (
            torch.inverse(lidar2ego_rots)
            .view(B, 1, 1, 1, 1, 3, 3)
            .matmul(points.unsqueeze(-1))
            .squeeze(-1)
        )

        if "extra_rots" in kwargs:
            extra_rots = kwargs["extra_rots"]
            points = (
                extra_rots.view(B, 1, 1, 1, 1, 3, 3)
                .repeat(1, N, 1, 1, 1, 1, 1)
                .matmul(points.unsqueeze(-1))
                .squeeze(-1)
            )
        if "extra_trans" in kwargs:
            extra_trans = kwargs["extra_trans"]
            points += extra_trans.view(B, 1, 1, 1, 1, 3).repeat(
                1, N, 1, 1, 1, 1)

        return points

    @force_fp32()
    def get_geometry(
        self,
        fH,
        fW,
        lidar2img,
        img_metas,
    ):
        """Map frustum points to the lidar frame by solving with ``lidar2img``.

        Args:
            lidar2img (Tensor): ``(B, N, 4, 4)`` matrices.

        Returns:
            Tensor: Points of shape ``(B, N, D, fH, fW, 3)``.
        """
        B, N, _, _ = lidar2img.shape
        frustum = self._get_frustum(fH, fW, img_metas, lidar2img.device)

        points = frustum.view(1, 1, self.D, fH, fW, 3) \
            .repeat(B, N, 1, 1, 1, 1)
        lidar2img = lidar2img.view(B, N, 1, 1, 1, 4, 4)
        # Homogeneous coordinates; solving avoids an explicit inverse.
        points = torch.cat(
            (points, torch.ones_like(points[..., :1])), -1)
        points = torch.linalg.solve(
            lidar2img.to(torch.float32),
            points.unsqueeze(-1).to(torch.float32)).squeeze(-1)
        # Clamp the homogeneous divisor away from zero.
        eps = 1e-5
        points = points[..., 0:3] / torch.maximum(
            points[..., 3:4], torch.ones_like(points[..., 3:4]) * eps)

        return points

    def get_cam_feats(self, x):
        """Produce ``(B, N, D, H, W, C)`` camera features; subclass hook."""
        raise NotImplementedError

    @force_fp32()
    def bev_pool(self, geom_feats, x):
        """Scatter camera features into the voxel grid and collapse Z.

        Args:
            geom_feats (Tensor): Point coordinates, last dimension 3.
            x (Tensor): Features of shape ``(B, N, D, H, W, C)``.
        """
        B, N, D, H, W, C = x.shape
        Nprime = B * N * D * H * W

        # flatten x
        x = x.reshape(Nprime, C)

        # metric coordinates -> integer voxel indices
        geom_feats = (
            (geom_feats - (self.bx - self.dx / 2.0)) / self.dx).long()
        geom_feats = geom_feats.view(Nprime, 3)
        batch_ix = torch.cat(
            [
                torch.full([Nprime // B, 1], ix,
                           device=x.device, dtype=torch.long)
                for ix in range(B)
            ]
        )
        geom_feats = torch.cat((geom_feats, batch_ix), 1)

        # filter out points that are outside box
        kept = (
            (geom_feats[:, 0] >= 0)
            & (geom_feats[:, 0] < self.nx[0])
            & (geom_feats[:, 1] >= 0)
            & (geom_feats[:, 1] < self.nx[1])
            & (geom_feats[:, 2] >= 0)
            & (geom_feats[:, 2] < self.nx[2])
        )
        x = x[kept]
        geom_feats = geom_feats[kept]

        x = bev_pool(x, geom_feats, B, self.nx[2], self.nx[0], self.nx[1])

        # collapse Z
        final = torch.cat(x.unbind(dim=2), 1)

        return final

    @force_fp32()
    def forward(
        self,
        images,
        img_metas
    ):
        """Lift ``(B, N, C, fH, fW)`` image features into the pooled grid.

        Reads ``camera2ego``, ``camera_intrinsics``, ``img_aug_matrix`` and
        ``lidar2ego`` from every entry of ``img_metas``. ``lidar2ego`` is
        reshaped per batch item (not per camera) downstream, so it is
        presumably ``(B, 4, 4)`` - TODO confirm against the data pipeline.
        """
        B, N, C, fH, fW = images.shape

        def stack_meta(key):
            # One array per sample -> a single tensor matching ``images``.
            return images.new_tensor(
                np.asarray([img_meta[key] for img_meta in img_metas]))

        camera2ego = stack_meta('camera2ego')
        camera_intrinsics = stack_meta('camera_intrinsics')
        img_aug_matrix = stack_meta('img_aug_matrix')
        lidar2ego = stack_meta('lidar2ego')

        rots = camera2ego[..., :3, :3]
        trans = camera2ego[..., :3, 3]
        intrins = camera_intrinsics[..., :3, :3]
        post_rots = img_aug_matrix[..., :3, :3]
        post_trans = img_aug_matrix[..., :3, 3]
        lidar2ego_rots = lidar2ego[..., :3, :3]
        lidar2ego_trans = lidar2ego[..., :3, 3]

        geom = self.get_geometry_v1(
            fH,
            fW,
            rots,
            trans,
            intrins,
            post_rots,
            post_trans,
            lidar2ego_rots,
            lidar2ego_trans,
            img_metas
        )
        x = self.get_cam_feats(images)
        x = self.bev_pool(geom, x)
        x = x.permute(0, 1, 3, 2).contiguous()

        return x
@TRANSFORMER_LAYER_SEQUENCE.register_module()
class LSSTransform(BaseTransform):
    """View transform that predicts a per-pixel depth distribution.

    A 1x1 convolution emits ``D`` depth logits plus ``C`` context channels;
    the outer product of the softmaxed depth and the context gives the
    lifted features consumed by ``BaseTransform``.

    Args:
        downsample (int): 1 keeps the pooled output as is; 2 appends a
            three-stage conv/BN/ReLU stack whose middle conv has stride 2.
            Other values greater than 1 are rejected by an assertion.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        feat_down_sample,
        pc_range,
        voxel_size,
        dbound,
        downsample=1,
    ):
        super().__init__(
            in_channels=in_channels,
            out_channels=out_channels,
            feat_down_sample=feat_down_sample,
            pc_range=pc_range,
            voxel_size=voxel_size,
            dbound=dbound,
        )
        self.depthnet = nn.Conv2d(in_channels, int(self.D + self.C), 1)

        if not downsample > 1:
            self.downsample = nn.Identity()
            return

        assert downsample == 2, downsample
        stages = []
        # Same layer order as a hand-written Sequential, so parameter names
        # (indices 0..8) are unchanged.
        for stride in (1, downsample, 1):
            stages.append(
                nn.Conv2d(
                    out_channels,
                    out_channels,
                    3,
                    stride=stride,
                    padding=1,
                    bias=False,
                ))
            stages.append(nn.BatchNorm2d(out_channels))
            stages.append(nn.ReLU(True))
        self.downsample = nn.Sequential(*stages)

    @force_fp32()
    def get_cam_feats(self, x):
        """Turn ``(B, N, C, fH, fW)`` features into ``(B, N, D, fH, fW, C)``."""
        B, N, C, fH, fW = x.shape

        logits = self.depthnet(x.view(B * N, C, fH, fW))
        # First D channels: depth distribution; next C channels: context.
        depth_prob = logits[:, : self.D].softmax(dim=1)
        context = logits[:, self.D : (self.D + self.C)]
        lifted = depth_prob.unsqueeze(1) * context.unsqueeze(2)

        lifted = lifted.view(B, N, self.C, self.D, fH, fW)
        return lifted.permute(0, 1, 3, 4, 5, 2)

    def forward(self, images, img_metas):
        """Run the base transform, then the optional downsampling stack."""
        pooled = super().forward(images, img_metas)
        return self.downsample(pooled)
@ATTENTION.register_module()
class GeometrySptialCrossAttention(BaseModule):
    """Cross attention between BEV queries and per-camera features.

    Each camera only attends with the queries that project into it (per
    ``bev_mask``); the per-camera results are averaged per query, projected
    and added to the residual.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_cams (int): The number of cameras. Default: 6.
        pc_range: Stored on the module; not used by ``forward``.
        dropout (float): Dropout applied to the projected output before the
            residual is added. Default: 0.1.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): Stored on the module; ``forward`` always treats
            ``query`` as ``(bs, num_query, embed_dims)``.
        attention (dict): Config handed to ``build_attention`` for the inner
            attention module.
    """

    def __init__(self,
                 embed_dims=256,
                 num_cams=6,
                 pc_range=None,
                 dropout=0.1,
                 init_cfg=None,
                 batch_first=False,
                 attention=dict(
                     type='MSDeformableAttention3D',
                     embed_dims=256,
                     num_levels=4),
                 **kwargs
                 ):
        super(GeometrySptialCrossAttention, self).__init__(init_cfg)

        self.init_cfg = init_cfg
        self.dropout = nn.Dropout(dropout)
        self.pc_range = pc_range
        self.fp16_enabled = False
        self.attention = build_attention(attention)
        self.embed_dims = embed_dims
        self.num_cams = num_cams
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.batch_first = batch_first
        self.init_weight()

    def init_weight(self):
        """Default initialization for Parameters of Module."""
        xavier_init(self.output_proj, distribution='uniform', bias=0.)

    @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam'))
    def forward(self,
                query,
                key,
                value,
                residual=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                reference_points_cam=None,
                bev_mask=None,
                level_start_index=None,
                flag='encoder',
                **kwargs):
        """Attend from BEV queries to the camera features they project into.

        Args:
            query (Tensor): ``(bs, num_query, embed_dims)``.
            key (Tensor): ``(num_cams, num_key, bs, embed_dims)``; defaults
                to ``query`` when None.
            value (Tensor): Same layout as ``key``; defaults to ``key``.
            residual (Tensor | None): Tensor added to the output. When None
                the un-positioned ``query`` is used.
            query_pos (Tensor | None): Positional encoding added to
                ``query``.
            reference_points_cam (Tensor): Per-camera projected points,
                indexed as ``(num_cams, bs, num_query, D, 2)``.
            bev_mask (Tensor): Visibility mask, indexed as
                ``(num_cams, bs, num_query, D)``. Only batch item 0 is used
                to pick the visible queries of each camera.
            spatial_shapes (Tensor): Forwarded to the inner attention.
            level_start_index (Tensor): Forwarded to the inner attention.

        ``key_padding_mask``, ``reference_points`` and ``flag`` are accepted
        for interface compatibility and not used here.

        Returns:
            Tensor: ``(bs, num_query, embed_dims)``.
        """
        if key is None:
            key = query
        if value is None:
            value = key

        # Bind both unconditionally: previously they were only set when
        # ``residual`` was None, so passing a residual crashed below.
        inp_residual = query if residual is None else residual
        slots = torch.zeros_like(query)
        if query_pos is not None:
            query = query + query_pos

        bs, num_query, _ = query.size()

        D = reference_points_cam.size(3)
        # Query indices visible in each camera (taken from batch item 0).
        indexes = [
            mask_per_img[0].sum(-1).nonzero().squeeze(-1)
            for mask_per_img in bev_mask
        ]
        max_len = max(len(each) for each in indexes)

        # each camera only interacts with its corresponding BEV queries.
        # This step can greatly save GPU memory.
        queries_rebatch = query.new_zeros(
            [bs, self.num_cams, max_len, self.embed_dims])
        reference_points_rebatch = reference_points_cam.new_zeros(
            [bs, self.num_cams, max_len, D, 2])

        for j in range(bs):
            for i, reference_points_per_img in enumerate(reference_points_cam):
                index_query_per_img = indexes[i]
                num_visible = len(index_query_per_img)
                queries_rebatch[j, i, :num_visible] = \
                    query[j, index_query_per_img]
                reference_points_rebatch[j, i, :num_visible] = \
                    reference_points_per_img[j, index_query_per_img]

        num_cams, num_key, bs, embed_dims = key.shape

        key = key.permute(2, 0, 1, 3).reshape(
            bs * self.num_cams, num_key, self.embed_dims)
        value = value.permute(2, 0, 1, 3).reshape(
            bs * self.num_cams, num_key, self.embed_dims)

        queries = self.attention(
            query=queries_rebatch.view(
                bs * self.num_cams, max_len, self.embed_dims),
            key=key,
            value=value,
            reference_points=reference_points_rebatch.view(
                bs * self.num_cams, max_len, D, 2),
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
        ).view(bs, self.num_cams, max_len, self.embed_dims)

        # Scatter per-camera results back onto the full query set.
        for j in range(bs):
            for i, index_query_per_img in enumerate(indexes):
                slots[j, index_query_per_img] += queries[
                    j, i, :len(index_query_per_img)]

        # Average over the cameras that actually see each query.
        count = bev_mask.sum(-1) > 0
        count = count.permute(1, 2, 0).sum(-1)
        count = torch.clamp(count, min=1.0)
        slots = slots / count[..., None]
        slots = self.output_proj(slots)

        return self.dropout(slots) + inp_residual
@ATTENTION.register_module()
class GeometryKernelAttention(BaseModule):
    """Attention that samples a fixed integer kernel around reference points.

    Unlike deformable attention there are no learned offsets: every
    reference point is scaled to pixel coordinates and a fixed
    ``kernel_size`` grid of integer offsets is added. Only the attention
    weights over the sampled locations are learned.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
        num_levels (int): The number of feature maps used in
            Attention. Default: 4.
        num_points (int): Accepted for config compatibility; the effective
            number of points is ``kernel_size[0] * kernel_size[1]``.
        kernel_size (tuple[int, int]): Sampling kernel ``(h, w)``.
            Default: (3, 3).
        dilation (int): Spacing multiplier of the kernel offsets.
            Default: 1.
        im2col_step (int): Passed through to the compiled op. Default: 64.
        dropout (float): Accepted for config compatibility; unused here.
        batch_first (bool): Whether query/value are
            ``(batch, n, embed_dim)`` (True) or ``(n, batch, embed_dim)``.
            Default: True.
        norm_cfg (dict): Stored on the module. Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=4,
                 kernel_size=(3, 3),
                 dilation=1,
                 im2col_step=64,
                 dropout=0.1,
                 batch_first=True,
                 norm_cfg=None,
                 init_cfg=None):
        super().__init__(init_cfg)
        if embed_dims % num_heads != 0:
            raise ValueError(f'embed_dims must be divisible by num_heads, '
                             f'but got {embed_dims} and {num_heads}')
        dim_per_head = embed_dims // num_heads
        self.norm_cfg = norm_cfg
        self.batch_first = batch_first
        # This module has no output projection of its own.
        self.output_proj = None
        self.fp16_enabled = False

        # you'd better set dim_per_head to a power of 2
        # which is more efficient in the CUDA implementation
        def _is_power_of_2(n):
            if (not isinstance(n, int)) or (n < 0):
                raise ValueError(
                    'invalid input for _is_power_of_2: {} (type: {})'.format(
                        n, type(n)))
            return (n & (n - 1) == 0) and n != 0

        if not _is_power_of_2(dim_per_head):
            warnings.warn(
                "You'd better set embed_dims in "
                'MultiScaleDeformAttention to make '
                'the dimension of each attention head a power of 2 '
                'which is more efficient in our CUDA implementation.')

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.kernel_size = kernel_size
        self.num_points = kernel_size[0] * kernel_size[1]

        self.attention_weights = nn.Linear(
            embed_dims, num_levels * self.num_points * self.num_heads)
        self.value_proj = nn.Linear(embed_dims, embed_dims)

        # Fixed integer offsets of the kernel, centred on zero.
        grid_h, grid_w = kernel_size
        y = (torch.arange(grid_h) - grid_h // 2) * dilation
        x = (torch.arange(grid_w) - grid_w // 2) * dilation
        offsets = torch.stack(
            torch.meshgrid(x, y)).permute(1, 2, 0).reshape(grid_h * grid_w, 2)
        self.register_buffer("grid_offsets", offsets, persistent=False)
        self.init_weights()

    def init_weights(self):
        """Default initialization for Parameters of Module."""
        constant_init(self.attention_weights, val=0., bias=0.)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        # ``output_proj`` is None unless a subclass installs one.
        if self.output_proj is not None:
            xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True

    def forward_kernel_multihead_attention(self, value, spatial_shapes,
                                           sampling_locations,
                                           attention_weights):
        """Pure-PyTorch gather version of the kernel attention.

        Args:
            value (Tensor): ``(bs, num_keys, num_heads, dim)``.
            spatial_shapes (Tensor): ``(num_levels, 2)`` with ``(h, w)`` per
                level.
            sampling_locations (Tensor): Integer pixel locations of shape
                ``(bs, num_queries, num_heads, num_levels, num_points, 2)``
                with ``(x, y)`` last. NOTE: clamped in place to the bounds
                of each level.
            attention_weights (Tensor): ``(bs, num_queries, num_heads,
                num_levels, num_points)``.

        Returns:
            Tensor: ``(bs, num_queries, num_heads * dim)``.
        """
        bs, num_keys, num_heads, dim = value.shape
        # Flatten to (bs * num_heads * num_keys, dim) for a single gather.
        value = value.transpose(1, 2).contiguous().view(
            bs * num_heads * num_keys, dim)
        _, num_queries, num_heads, num_levels, num_points, _ = \
            sampling_locations.shape
        with torch.no_grad():
            sampling_index = sampling_locations.new_zeros(
                (bs, num_queries, num_heads, num_levels, num_points)
            ).to(value.device)
            start_index = 0
            for level, (H_, W_) in enumerate(spatial_shapes):
                sampling_locations[:, :, :, level,
                                   :, 0].clamp_(min=0, max=W_-1)
                sampling_locations[:, :, :, level,
                                   :, 1].clamp_(min=0, max=H_-1)
                # Row-major index inside the level plus the level's offset.
                sampling_index[:, :, :, level] = start_index \
                    + sampling_locations[:, :, :, level, :, 0] \
                    + sampling_locations[:, :, :, level, :, 1] * W_
                start_index += H_ * W_
            # head offset: (bs, head, num_queries * levels * points)
            sampling_index = sampling_index.transpose(
                1, 2).reshape(bs, num_heads, -1)
            sampling_index = sampling_index + \
                (torch.arange(num_heads).to(sampling_index)
                 * num_keys).view(1, num_heads, 1)
            # batch offset
            sampling_index = sampling_index.reshape(bs, -1) + (
                torch.arange(bs).to(sampling_index)
                * num_keys * num_heads).view(bs, 1)

        sampling_value = value[sampling_index].view(
            bs, num_heads, num_queries, num_levels * num_points, dim)
        attention_weights = attention_weights.transpose(1, 2).contiguous().view(
            bs, num_heads, num_queries, num_levels * num_points, 1)
        # Weighted sum over all sampled points of every level.
        output = (sampling_value *
                  attention_weights).sum(-2).transpose(1, 2).contiguous()
        return output.view(bs, num_queries, -1)

    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                level_start_index=None,
                **kwargs):
        """Apply kernel attention around the given reference points.

        Args:
            query (Tensor): ``(bs, num_query, embed_dims)`` when
                ``batch_first`` else ``(num_query, bs, embed_dims)``.
            key (Tensor): Unused.
            value (Tensor): Same layout as ``query``; defaults to ``query``.
            identity (Tensor): Accepted for interface compatibility; no
                residual is added by this module.
            query_pos (Tensor): Positional encoding added to ``query``.
            key_padding_mask (Tensor): ``[bs, num_key]`` mask; masked value
                entries are zeroed.
            reference_points (Tensor): 4-D tensor unpacked as
                ``(bs, num_query, num_Z_anchors, 2)``; it is multiplied by
                the per-level ``(w, h)``, so values are presumably
                normalised to [0, 1] - TODO confirm with the caller.
            spatial_shapes (Tensor): ``(num_levels, 2)`` with ``(h, w)``.
            level_start_index (Tensor): Start index of each level in the
                flattened value, passed to the compiled op.

        Returns:
            Tensor: Output of the compiled op, with the first two axes
            swapped back when ``batch_first`` is False.

        Raises:
            NotImplementedError: If the last dim of ``reference_points``
                is 4 (box-style references are not supported).
            ValueError: If the last dim is neither 2 nor 4.
        """
        if value is None:
            value = query
        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos

        if not self.batch_first:
            # change to (bs, num_query ,embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        value = self.value_proj(value)
        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)
        value = value.view(bs, num_value, self.num_heads, -1)

        # Softmax jointly over all levels and kernel points of each head.
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_levels * self.num_points)
        attention_weights = attention_weights.softmax(-1)
        attention_weights = attention_weights.view(bs, num_query,
                                                   self.num_heads,
                                                   self.num_levels,
                                                   self.num_points)

        if reference_points.shape[-1] == 2:
            # Each query owns ``num_Z_anchors`` reference points (this axis
            # lines up with the head axis of the attention weights); every
            # one of them is expanded with the fixed kernel offsets.
            with torch.no_grad():
                offset_normalizer = torch.stack(
                    [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)

                # Rank check: reference_points must be 4-D.
                _bs, _num_query, _num_Z_anchors, _xy = reference_points.shape
                # (K, 2) -> (1, 1, 1, 1, K, 2)
                offsets = self.grid_offsets[None, None, None, None]
                # (bs, q, nz, 1, xy) * (l, 2) -> (bs, q, nz, l, 2)
                reference_points = reference_points[:, :, :, None, :] \
                    * offset_normalizer
                # (bs, q, nz, l, K, xy), integer pixel coordinates
                sampling_locations = (
                    reference_points[:, :, :, :, None, :] + offsets
                ).round().long()
        elif reference_points.shape[-1] == 4:
            # Explicit error instead of ``assert False``, which disappears
            # under ``python -O``.
            raise NotImplementedError(
                'reference_points with last dim 4 are not supported by '
                'GeometryKernelAttention.')
        else:
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')

        # sampling_locations: bs, num_query, num_heads, num_levels, K, 2
        # attention_weights:  bs, num_query, num_heads, num_levels, K
        output = GeometricKernelAttentionFunc.apply(
            value, spatial_shapes, level_start_index,
            sampling_locations.contiguous(), attention_weights,
            self.im2col_step
        )
        if not self.batch_first:
            output = output.permute(1, 0, 2)
        return output
class GeometricKernelAttentionFunc(Function):
    """Autograd bridge to the compiled ``GeometricKernelAttention`` kernels.

    Gradients are produced for ``value`` and ``attention_weights`` only;
    every other input receives ``None``.
    """

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
        """Call the forward kernel and stash the tensors backward needs."""
        ctx.im2col_step = im2col_step
        tensors = (
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
        )
        output = GKA.geometric_kernel_attn_cuda_forward(
            *tensors, ctx.im2col_step)
        ctx.save_for_backward(*tensors)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Call the backward kernel; one return slot per forward input."""
        (value, spatial_shapes, level_start_index,
         sampling_locations, attention_weights) = ctx.saved_tensors
        grads = GKA.geometric_kernel_attn_cuda_backward(
            value, spatial_shapes, level_start_index,
            sampling_locations, attention_weights,
            grad_output, ctx.im2col_step)
        grad_value, grad_attn_weight = grads

        return grad_value, None, None, None, grad_attn_weight, None
def get_extensions():
    """Describe the CUDA extension module built by this ``setup.py``.

    All ``*.cpp`` and ``*.cu`` files in the ``src`` directory next to this
    file are compiled into one ``GeometricKernelAttention`` extension.

    Returns:
        list: A single-element list holding the ``CUDAExtension``.

    Raises:
        NotImplementedError: If CUDA is unavailable at build time (no
            usable device or ``CUDA_HOME`` unset); there is no CPU build.
    """
    # Fail first: nothing below is useful without a CUDA toolchain.
    if not torch.cuda.is_available() or CUDA_HOME is None:
        raise NotImplementedError('Cuda is not available')

    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "src")

    # glob already yields paths under the absolute ``extensions_dir``, so
    # no further joining is needed; sort for a reproducible build order.
    main_file = sorted(glob.glob(os.path.join(extensions_dir, "*.cpp")))
    source_cuda = sorted(glob.glob(os.path.join(extensions_dir, "*.cu")))
    sources = main_file + source_cuda

    extra_compile_args = {
        "cxx": [],
        "nvcc": [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ],
    }

    return [
        CUDAExtension(
            "GeometricKernelAttention",
            sources,
            include_dirs=[extensions_dir],
            define_macros=[("WITH_CUDA", None)],
            extra_compile_args=extra_compile_args,
        )
    ]
packages=find_packages(exclude=("configs", "tests",)), + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/src/version.cpp b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/src/version.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c53ff56a9654d23aecd1761546ce4f823bfea21d --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/src/version.cpp @@ -0,0 +1,7 @@ +#include "geometric_kernel_attn.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("geometric_kernel_attn_cuda_forward", &geometric_kernel_attn_cuda_forward, "geometric_kernel_attn_cuda_forward"); + m.def("geometric_kernel_attn_cuda_backward", &geometric_kernel_attn_cuda_backward, "geometric_kernel_attn_cuda_backward"); +} diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/test.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/test.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/transformer.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..6dc31c7fb4ed65113483b20a2662d2a2382bc8c8 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/transformer.py @@ -0,0 +1,355 @@ +import copy +import torch +import torch.nn as nn +import numpy as np +from torch.nn.init import normal_ +import torch.nn.functional as F +from mmdet.models.utils.builder import TRANSFORMER +from mmcv.cnn import Linear, bias_init_with_prob, xavier_init, constant_init +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from 
mmcv.cnn.bricks.transformer import build_transformer_layer_sequence +from torchvision.transforms.functional import rotate +from projects.mmdet3d_plugin.bevformer.modules.temporal_self_attention import TemporalSelfAttention +from projects.mmdet3d_plugin.bevformer.modules.spatial_cross_attention import MSDeformableAttention3D +from projects.mmdet3d_plugin.bevformer.modules.decoder import CustomMSDeformableAttention +from .builder import build_fuser, FUSERS +from typing import List + +@FUSERS.register_module() +class ConvFuser(nn.Sequential): + def __init__(self, in_channels: int, out_channels: int) -> None: + self.in_channels = in_channels + self.out_channels = out_channels + super().__init__( + nn.Conv2d(sum(in_channels), out_channels, 3, padding=1, bias=False), + nn.BatchNorm2d(out_channels), + nn.ReLU(True), + ) + + def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: + return super().forward(torch.cat(inputs, dim=1)) + + + +@TRANSFORMER.register_module() +class MapTRPerceptionTransformer(BaseModule): + """Implements the Detr3D transformer. + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. 
+ """ + + def __init__(self, + num_feature_levels=4, + num_cams=6, + two_stage_num_proposals=300, + fuser=None, + encoder=None, + decoder=None, + embed_dims=256, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + len_can_bus=18, + can_bus_norm=True, + use_cams_embeds=True, + rotate_center=[100, 100], + modality='vision', + **kwargs): + super(MapTRPerceptionTransformer, self).__init__(**kwargs) + if modality == 'fusion': + self.fuser = build_fuser(fuser) #TODO + self.use_attn_bev = encoder['type'] == 'BEVFormerEncoder' + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = embed_dims + self.num_feature_levels = num_feature_levels + self.num_cams = num_cams + self.fp16_enabled = False + + self.rotate_prev_bev = rotate_prev_bev + self.use_shift = use_shift + self.use_can_bus = use_can_bus + self.len_can_bus = len_can_bus + self.can_bus_norm = can_bus_norm + self.use_cams_embeds = use_cams_embeds + + self.two_stage_num_proposals = two_stage_num_proposals + self.init_layers() + self.rotate_center = rotate_center + + def init_layers(self): + """Initialize layers of the Detr3DTransformer.""" + self.level_embeds = nn.Parameter(torch.Tensor( + self.num_feature_levels, self.embed_dims)) + self.cams_embeds = nn.Parameter( + torch.Tensor(self.num_cams, self.embed_dims)) + self.reference_points = nn.Linear(self.embed_dims, 2) # TODO, this is a hack + self.can_bus_mlp = nn.Sequential( + nn.Linear(self.len_can_bus, self.embed_dims // 2), + nn.ReLU(inplace=True), + nn.Linear(self.embed_dims // 2, self.embed_dims), + nn.ReLU(inplace=True), + ) + if self.can_bus_norm: + self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims)) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention3D) or isinstance(m, 
TemporalSelfAttention) \ + or isinstance(m, CustomMSDeformableAttention): + try: + m.init_weight() + except AttributeError: + m.init_weights() + normal_(self.level_embeds) + normal_(self.cams_embeds) + xavier_init(self.reference_points, distribution='uniform', bias=0.) + xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.) + # TODO apply fp16 to this module cause grad_norm NAN + # @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos'), out_fp32=True) + def attn_bev_encode( + self, + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=(0.512, 0.512), + bev_pos=None, + prev_bev=None, + **kwargs): + bs = mlvl_feats[0].size(0) + bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) + bev_pos = bev_pos.flatten(2).permute(2, 0, 1) + + # obtain rotation angle and shift with ego motion + delta_x = np.array([each['can_bus'][0] + for each in kwargs['img_metas']]) + delta_y = np.array([each['can_bus'][1] + for each in kwargs['img_metas']]) + ego_angle = np.array( + [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']]) + grid_length_y = grid_length[0] + grid_length_x = grid_length[1] + translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2) + translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180 + bev_angle = ego_angle - translation_angle + shift_y = translation_length * \ + np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h + shift_x = translation_length * \ + np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w + shift_y = shift_y * self.use_shift + shift_x = shift_x * self.use_shift + shift = bev_queries.new_tensor( + [shift_x, shift_y]).permute(1, 0) # xy, bs -> bs, xy + + if prev_bev is not None: + if prev_bev.shape[1] == bev_h * bev_w: + prev_bev = prev_bev.permute(1, 0, 2) + if self.rotate_prev_bev: + for i in range(bs): + # num_prev_bev = prev_bev.size(1) + rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] + tmp_prev_bev = prev_bev[:, i].reshape( + bev_h, bev_w, -1).permute(2, 0, 1) + 
tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, + center=self.rotate_center) + tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( + bev_h * bev_w, 1, -1) + prev_bev[:, i] = tmp_prev_bev[:, 0] + + # add can bus signals + can_bus = bev_queries.new_tensor( + [each['can_bus'] for each in kwargs['img_metas']]) # [:, :] + can_bus = self.can_bus_mlp(can_bus[:, :self.len_can_bus])[None, :, :] + bev_queries = bev_queries + can_bus * self.use_can_bus + + feat_flatten = [] + spatial_shapes = [] + for lvl, feat in enumerate(mlvl_feats): + bs, num_cam, c, h, w = feat.shape + spatial_shape = (h, w) + feat = feat.flatten(3).permute(1, 0, 3, 2) + if self.use_cams_embeds: + feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) + feat = feat + self.level_embeds[None, + None, lvl:lvl + 1, :].to(feat.dtype) + spatial_shapes.append(spatial_shape) + feat_flatten.append(feat) + + feat_flatten = torch.cat(feat_flatten, 2) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=bev_pos.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + feat_flatten = feat_flatten.permute( + 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) + + bev_embed = self.encoder( + bev_queries, + feat_flatten, + feat_flatten, + bev_h=bev_h, + bev_w=bev_w, + bev_pos=bev_pos, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + prev_bev=prev_bev, + shift=shift, + **kwargs + ) + return bev_embed + + def lss_bev_encode( + self, + mlvl_feats, + prev_bev=None, + **kwargs): + assert len(mlvl_feats) == 1, 'Currently we only support single level feat in LSS' + images = mlvl_feats[0] + img_metas = kwargs['img_metas'] + bev_embed = self.encoder(images,img_metas) + bs, c, _,_ = bev_embed.shape + bev_embed = bev_embed.view(bs,c,-1).permute(0,2,1).contiguous() + + return bev_embed + + def get_bev_features( + self, + mlvl_feats, + lidar_feat, + bev_queries, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + 
bev_pos=None, + prev_bev=None, + **kwargs): + """ + obtain bev features. + """ + if self.use_attn_bev: + bev_embed = self.attn_bev_encode( + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=grid_length, + bev_pos=bev_pos, + prev_bev=prev_bev, + **kwargs) + else: + bev_embed = self.lss_bev_encode( + mlvl_feats, + prev_bev=prev_bev, + **kwargs) + if lidar_feat is not None: + bs = mlvl_feats[0].size(0) + bev_embed = bev_embed.view(bs, bev_h, bev_w, -1).permute(0,3,1,2).contiguous() + lidar_feat = lidar_feat.permute(0,1,3,2).contiguous() # B C H W + lidar_feat = nn.functional.interpolate(lidar_feat, size=(bev_h,bev_w), mode='bicubic', align_corners=False) + fused_bev = self.fuser([bev_embed, lidar_feat]) + fused_bev = fused_bev.flatten(2).permute(0,2,1).contiguous() + bev_embed = fused_bev + + return bev_embed + # TODO apply fp16 to this module cause grad_norm NAN + # @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos')) + def forward(self, + mlvl_feats, + lidar_feat, + bev_queries, + object_query_embed, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + reg_branches=None, + cls_branches=None, + prev_bev=None, + **kwargs): + """Forward function for `Detr3DTransformer`. + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, num_cams, embed_dims, h, w]. + bev_queries (Tensor): (bev_h*bev_w, c) + bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) + object_query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. + reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when `with_box_refine` is True. Default to None. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - bev_embed: BEV features + - inter_states: Outputs from decoder. 
If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. + """ + + bev_embed = self.get_bev_features( + mlvl_feats, + lidar_feat, + bev_queries, + bev_h, + bev_w, + grid_length=grid_length, + bev_pos=bev_pos, + prev_bev=prev_bev, + **kwargs) # bev_embed shape: bs, bev_h*bev_w, embed_dims + + bs = mlvl_feats[0].size(0) + query_pos, query = torch.split( + object_query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos) + reference_points = reference_points.sigmoid() + init_reference_out = reference_points + + query = query.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + bev_embed = bev_embed.permute(1, 0, 2) + + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=bev_embed, + query_pos=query_pos, + reference_points=reference_points, + reg_branches=reg_branches, + cls_branches=cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + + inter_references_out = inter_references + + return bev_embed, inter_states, init_reference_out, 
inter_references_out diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4a9169d02a0cfb21e91c61431109b4d9ae304f1d --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/__init__.py @@ -0,0 +1,4 @@ +from .vovnet import VoVNet +from .efficientnet import EfficientNet +from .swin import SwinTransformer +__all__ = ['VoVNet'] \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/efficientnet.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/efficientnet.py new file mode 100644 index 0000000000000000000000000000000000000000..82556ece1cd064f8bb65ffa2cb8ca4902eb6a108 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/efficientnet.py @@ -0,0 +1,415 @@ +import copy +import math +from functools import partial + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn.bricks import ConvModule, DropPath +from mmcv.runner import BaseModule, Sequential + +from mmdet.models.builder import BACKBONES +from ..utils import InvertedResidual, SELayer, make_divisible + + +class EdgeResidual(BaseModule): + """Edge Residual Block. + Args: + in_channels (int): The input channels of this module. + out_channels (int): The output channels of this module. + mid_channels (int): The input channels of the second convolution. + kernel_size (int): The kernel size of the first convolution. + Defaults to 3. + stride (int): The stride of the first convolution. Defaults to 1. + se_cfg (dict, optional): Config dict for se layer. Defaults to None, + which means no se layer. + with_residual (bool): Use residual connection. Defaults to True. + conv_cfg (dict, optional): Config dict for convolution layer. + Defaults to None, which means using conv2d. 
+ norm_cfg (dict): Config dict for normalization layer. + Defaults to ``dict(type='BN')``. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='ReLU')``. + drop_path_rate (float): stochastic depth rate. Defaults to 0. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + init_cfg (dict | list[dict], optional): Initialization config dict. + """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + stride=1, + se_cfg=None, + with_residual=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + drop_path_rate=0., + with_cp=False, + init_cfg=None, + **kwargs): + super(EdgeResidual, self).__init__(init_cfg=init_cfg) + assert stride in [1, 2] + self.with_cp = with_cp + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0 else nn.Identity() + self.with_se = se_cfg is not None + self.with_residual = ( + stride == 1 and in_channels == out_channels and with_residual) + + if self.with_se: + assert isinstance(se_cfg, dict) + + self.conv1 = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=1, + padding=kernel_size // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if self.with_se: + self.se = SELayer(**se_cfg) + + self.conv2 = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x): + + def _inner_forward(x): + out = x + out = self.conv1(out) + + if self.with_se: + out = self.se(out) + + out = self.conv2(out) + + if self.with_residual: + return x + self.drop_path(out) + else: + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +def model_scaling(layer_setting, arch_setting): + 
"""Scaling operation to the layer's parameters according to the + arch_setting.""" + # scale width + new_layer_setting = copy.deepcopy(layer_setting) + for layer_cfg in new_layer_setting: + for block_cfg in layer_cfg: + block_cfg[1] = make_divisible(block_cfg[1] * arch_setting[0], 8) + + # scale depth + split_layer_setting = [new_layer_setting[0]] + for layer_cfg in new_layer_setting[1:-1]: + tmp_index = [0] + for i in range(len(layer_cfg) - 1): + if layer_cfg[i + 1][1] != layer_cfg[i][1]: + tmp_index.append(i + 1) + tmp_index.append(len(layer_cfg)) + for i in range(len(tmp_index) - 1): + split_layer_setting.append(layer_cfg[tmp_index[i]:tmp_index[i + + 1]]) + split_layer_setting.append(new_layer_setting[-1]) + + num_of_layers = [len(layer_cfg) for layer_cfg in split_layer_setting[1:-1]] + new_layers = [ + int(math.ceil(arch_setting[1] * num)) for num in num_of_layers + ] + + merge_layer_setting = [split_layer_setting[0]] + for i, layer_cfg in enumerate(split_layer_setting[1:-1]): + if new_layers[i] <= num_of_layers[i]: + tmp_layer_cfg = layer_cfg[:new_layers[i]] + else: + tmp_layer_cfg = copy.deepcopy(layer_cfg) + [layer_cfg[-1]] * ( + new_layers[i] - num_of_layers[i]) + if tmp_layer_cfg[0][3] == 1 and i != 0: + merge_layer_setting[-1] += tmp_layer_cfg.copy() + else: + merge_layer_setting.append(tmp_layer_cfg.copy()) + merge_layer_setting.append(split_layer_setting[-1]) + + return merge_layer_setting + + +@BACKBONES.register_module() +class EfficientNet(BaseModule): + """EfficientNet backbone. + Args: + arch (str): Architecture of efficientnet. Defaults to b0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (6, ). + frozen_stages (int): Stages to be frozen (all param fixed). + Defaults to 0, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). 
+ act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + """ + + # Parameters to build layers. + # 'b' represents the architecture of normal EfficientNet family includes + # 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8'. + # 'e' represents the architecture of EfficientNet-EdgeTPU including 'es', + # 'em', 'el'. + # 6 parameters are needed to construct a layer, From left to right: + # - kernel_size: The kernel size of the block + # - out_channel: The number of out_channels of the block + # - se_ratio: The sequeeze ratio of SELayer. + # - stride: The stride of the block + # - expand_ratio: The expand_ratio of the mid_channels + # - block_type: -1: Not a block, 0: InvertedResidual, 1: EdgeResidual + layer_settings = { + 'b': [[[3, 32, 0, 2, 0, -1]], + [[3, 16, 4, 1, 1, 0]], + [[3, 24, 4, 2, 6, 0], + [3, 24, 4, 1, 6, 0]], + [[5, 40, 4, 2, 6, 0], + [5, 40, 4, 1, 6, 0]], + [[3, 80, 4, 2, 6, 0], + [3, 80, 4, 1, 6, 0], + [3, 80, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0]], + [[5, 192, 4, 2, 6, 0], + [5, 192, 4, 1, 6, 0], + [5, 192, 4, 1, 6, 0], + [5, 192, 4, 1, 6, 0], + [3, 320, 4, 1, 6, 0]], + [[1, 1280, 0, 1, 0, -1]] + ], + 'e': [[[3, 32, 0, 2, 0, -1]], + [[3, 24, 0, 1, 3, 1]], + [[3, 32, 0, 2, 8, 1], + [3, 32, 0, 1, 8, 1]], + [[3, 48, 0, 2, 8, 1], + [3, 48, 0, 1, 8, 1], + [3, 48, 0, 1, 8, 1], + [3, 48, 0, 1, 8, 1]], + [[5, 96, 0, 2, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0]], + [[5, 192, 0, 2, 8, 0], + [5, 192, 0, 1, 8, 0]], + 
[[1, 1280, 0, 1, 0, -1]] + ] + } # yapf: disable + + # Parameters to build different kinds of architecture. + # From left to right: scaling factor for width, scaling factor for depth, + # resolution. + arch_settings = { + 'b0': (1.0, 1.0, 224), + 'b1': (1.0, 1.1, 240), + 'b2': (1.1, 1.2, 260), + 'b3': (1.2, 1.4, 300), + 'b4': (1.4, 1.8, 380), + 'b5': (1.6, 2.2, 456), + 'b6': (1.8, 2.6, 528), + 'b7': (2.0, 3.1, 600), + 'b8': (2.2, 3.6, 672), + 'es': (1.0, 1.0, 224), + 'em': (1.0, 1.1, 240), + 'el': (1.2, 1.4, 300) + } + + def __init__(self, + arch='b0', + drop_path_rate=0., + out_indices=(6, ), + frozen_stages=0, + conv_cfg=dict(type='Conv2dAdaptivePadding'), + norm_cfg=dict(type='BN', eps=1e-3), + act_cfg=dict(type='Swish'), + norm_eval=False, + with_cp=False, + init_cfg=[ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + layer=['_BatchNorm', 'GroupNorm'], + val=1) + ]): + super(EfficientNet, self).__init__(init_cfg) + assert arch in self.arch_settings, \ + f'"{arch}" is not one of the arch_settings ' \ + f'({", ".join(self.arch_settings.keys())})' + self.arch_setting = self.arch_settings[arch] + self.layer_setting = self.layer_settings[arch[:1]] + for index in out_indices: + if index not in range(0, len(self.layer_setting)): + raise ValueError('the item in out_indices must in ' + f'range(0, {len(self.layer_setting)}). ' + f'But received {index}') + + if frozen_stages not in range(len(self.layer_setting) + 1): + raise ValueError('frozen_stages must be in range(0, ' + f'{len(self.layer_setting) + 1}). 
' + f'But received {frozen_stages}') + self.drop_path_rate = drop_path_rate + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.layer_setting = model_scaling(self.layer_setting, + self.arch_setting) + block_cfg_0 = self.layer_setting[0][0] + block_cfg_last = self.layer_setting[-1][0] + self.in_channels = make_divisible(block_cfg_0[1], 8) + self.out_channels = block_cfg_last[1] + self.layers = nn.ModuleList() + self.layers.append( + ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=block_cfg_0[0], + stride=block_cfg_0[3], + padding=block_cfg_0[0] // 2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.make_layer() + # Avoid building unused layers in mmdetection. + if len(self.layers) < max(self.out_indices) + 1: + self.layers.append( + ConvModule( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=block_cfg_last[0], + stride=block_cfg_last[3], + padding=block_cfg_last[0] // 2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def make_layer(self): + # Without the first and the final conv block. + layer_setting = self.layer_setting[1:-1] + + total_num_blocks = sum([len(x) for x in layer_setting]) + block_idx = 0 + dpr = [ + x.item() + for x in torch.linspace(0, self.drop_path_rate, total_num_blocks) + ] # stochastic depth decay rule + + for i, layer_cfg in enumerate(layer_setting): + # Avoid building unused layers in mmdetection. 
+ if i > max(self.out_indices) - 1: + break + layer = [] + for i, block_cfg in enumerate(layer_cfg): + (kernel_size, out_channels, se_ratio, stride, expand_ratio, + block_type) = block_cfg + + mid_channels = int(self.in_channels * expand_ratio) + out_channels = make_divisible(out_channels, 8) + if se_ratio <= 0: + se_cfg = None + else: + # In mmdetection, the `divisor` is deleted to align + # the logic of SELayer with mmcls. + se_cfg = dict( + channels=mid_channels, + ratio=expand_ratio * se_ratio, + act_cfg=(self.act_cfg, dict(type='Sigmoid'))) + if block_type == 1: # edge tpu + if i > 0 and expand_ratio == 3: + with_residual = False + expand_ratio = 4 + else: + with_residual = True + mid_channels = int(self.in_channels * expand_ratio) + if se_cfg is not None: + # In mmdetection, the `divisor` is deleted to align + # the logic of SELayer with mmcls. + se_cfg = dict( + channels=mid_channels, + ratio=se_ratio * expand_ratio, + act_cfg=(self.act_cfg, dict(type='Sigmoid'))) + block = partial(EdgeResidual, with_residual=with_residual) + else: + block = InvertedResidual + layer.append( + block( + in_channels=self.in_channels, + out_channels=out_channels, + mid_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + se_cfg=se_cfg, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + drop_path_rate=dpr[block_idx], + with_cp=self.with_cp, + # In mmdetection, `with_expand_conv` is set to align + # the logic of InvertedResidual with mmcls. 
+ with_expand_conv=(mid_channels != self.in_channels))) + self.in_channels = out_channels + block_idx += 1 + self.layers.append(Sequential(*layer)) + + def forward(self, x): + outs = [] + # import pdb;pdb.set_trace() + for i, layer in enumerate(self.layers): + x = layer(x) + if i in self.out_indices: + outs.append(x) + + return tuple(outs) + + def _freeze_stages(self): + for i in range(self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(EfficientNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/swin.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/swin.py new file mode 100644 index 0000000000000000000000000000000000000000..166950599a35a5ea92cfb1f3aeaacea527929e0f --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/swin.py @@ -0,0 +1,825 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import warnings +from copy import deepcopy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_norm_layer, trunc_normal_init +from mmcv.cnn.bricks.transformer import FFN, build_dropout +from mmcv.cnn.utils.weight_init import constant_init +from mmcv.runner import _load_checkpoint +from mmcv.runner.base_module import BaseModule, ModuleList +from torch.nn.modules.linear import Linear +from torch.nn.modules.normalization import LayerNorm +from torch.nn.modules.utils import _pair as to_2tuple +import torch.utils.checkpoint as checkpoint + +from mmseg.ops import resize +from mmdet3d.utils import get_root_logger +from mmdet.models.builder import BACKBONES +from mmcv.cnn.bricks.registry import ATTENTION +from ..utils import PatchEmbed, swin_convert + + +class PatchMerging(BaseModule): + """Merge patch feature map. + + This layer use nn.Unfold to group feature map by kernel_size, and use norm + and linear layer to embed grouped feature map. + Args: + in_channels (int): The num of input channels. + out_channels (int): The num of output channels. + stride (int | tuple): the stride of the sliding length in the + unfold layer. Defaults: 2. (Default to be equal with kernel_size). + bias (bool, optional): Whether to add bias in linear layer or not. + Defaults: False. + norm_cfg (dict, optional): Config dict for normalization layer. + Defaults: dict(type='LN'). + init_cfg (dict, optional): The extra config for initialization. + Defaults: None. 
+ """ + + def __init__(self, + in_channels, + out_channels, + stride=2, + bias=False, + norm_cfg=dict(type='LN'), + init_cfg=None): + super().__init__(init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + + self.sampler = nn.Unfold( + kernel_size=stride, dilation=1, padding=0, stride=stride) + + sample_dim = stride**2 * in_channels + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, sample_dim)[1] + else: + self.norm = None + + self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) + + def forward(self, x, hw_shape): + """ + x: x.shape -> [B, H*W, C] + hw_shape: (H, W) + """ + B, L, C = x.shape + H, W = hw_shape + assert L == H * W, 'input feature has wrong size' + + x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W + + # stride is fixed to be equal to kernel_size. + if (H % self.stride != 0) or (W % self.stride != 0): + x = F.pad(x, (0, W % self.stride, 0, H % self.stride)) + + # Use nn.Unfold to merge patch. About 25% faster than original method, + # but need to modify pretrained model for compatibility + x = self.sampler(x) # B, 4*C, H/2*W/2 + x = x.transpose(1, 2) # B, H/2*W/2, 4*C + + x = self.norm(x) if self.norm else x + x = self.reduction(x) + + down_hw_shape = (H + 1) // 2, (W + 1) // 2 + return x, down_hw_shape + + +@ATTENTION.register_module() +class WindowMSA(BaseModule): + """Window based multi-head self-attention (W-MSA) module with relative + position bias. + + Args: + embed_dims (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. 
Default: 0.0 + init_cfg (dict | None, optional): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + init_cfg=None): + + super().__init__() + self.embed_dims = embed_dims + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_embed_dims = embed_dims // num_heads + self.scale = qk_scale or head_embed_dims**-0.5 + self.init_cfg = init_cfg + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # About 2x faster than original impl + Wh, Ww = self.window_size + rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) + rel_position_index = rel_index_coords + rel_index_coords.T + rel_position_index = rel_position_index.flip(1).contiguous() + self.register_buffer('relative_position_index', rel_position_index) + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop_rate) + + self.softmax = nn.Softmax(dim=-1) + + def init_weights(self): + trunc_normal_init(self.relative_position_bias_table, std=0.02) + + def forward(self, x, mask=None): + """ + Args: + + x (tensor): input features with shape of (num_windows*B, N, C) + mask (tensor | None, Optional): mask with shape of (num_windows, + Wh*Ww, Wh*Ww), value should be between (-inf, 0]. 
+ """ + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + @staticmethod + def double_step_seq(step1, len1, step2, len2): + seq1 = torch.arange(0, step1 * len1, step1) + seq2 = torch.arange(0, step2 * len2, step2) + return (seq1[:, None] + seq2[None, :]).reshape(1, -1) + + +@ATTENTION.register_module() +class ShiftWindowMSA(BaseModule): + """Shift Window Multihead Self-Attention Module. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): The height and width of the window. + shift_size (int, optional): The shift step of each window towards + right-bottom. If zero, act as regular window-msa. Defaults to 0. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Defaults: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Defaults: 0. 
+ proj_drop_rate (float, optional): Dropout ratio of output. + Defaults: 0. + dropout_layer (dict, optional): The dropout_layer used before output. + Defaults: dict(type='DropPath', drop_prob=0.). + init_cfg (dict, optional): The extra config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + shift_size=0, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0, + proj_drop_rate=0, + dropout_layer=dict(type='DropPath', drop_prob=0.), + init_cfg=None): + super().__init__(init_cfg) + + self.window_size = window_size + self.shift_size = shift_size + assert 0 <= self.shift_size < self.window_size + + self.w_msa = WindowMSA( + embed_dims=embed_dims, + num_heads=num_heads, + window_size=to_2tuple(window_size), + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=proj_drop_rate, + init_cfg=None) + + self.drop = build_dropout(dropout_layer) + + def forward(self, query, hw_shape): + B, L, C = query.shape + H, W = hw_shape + assert L == H * W, 'input feature has wrong size' + query = query.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b)) + H_pad, W_pad = query.shape[1], query.shape[2] + + # cyclic shift + if self.shift_size > 0: + shifted_query = torch.roll( + query, + shifts=(-self.shift_size, -self.shift_size), + dims=(1, 2)) + + # calculate attention mask for SW-MSA + img_mask = torch.zeros((1, H_pad, W_pad, 1), + device=query.device) # 1 H W 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + # w_slices = (slice(0, -self.window_size), + # slice(-self.window_size, None)) + cnt = 
0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + # nW, window_size, window_size, 1 + mask_windows = self.window_partition(img_mask) + mask_windows = mask_windows.view( + -1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + else: + shifted_query = query + attn_mask = None + + # nW*B, window_size, window_size, C + query_windows = self.window_partition(shifted_query) + # nW*B, window_size*window_size, C + query_windows = query_windows.view(-1, self.window_size**2, C) + + # W-MSA/SW-MSA (nW*B, window_size*window_size, C) + attn_windows = self.w_msa(query_windows, mask=attn_mask) + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, + self.window_size, C) + + # B H' W' C + shifted_x = self.window_reverse(attn_windows, H_pad, W_pad) + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + x = self.drop(x) + return x + + def window_reverse(self, windows, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + window_size = self.window_size + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, + window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + def window_partition(self, x): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + window_size = self.window_size + x = 
class SwinBlock(BaseModule):
    """One Swin Transformer block: (shifted) window attention plus an FFN.

    Args:
        embed_dims (int): The feature dimension.
        num_heads (int): Parallel attention heads.
        feedforward_channels (int): The hidden dimension for FFNs.
        window_size (int, optional): The local window scale. Default: 7.
        shift (bool): Whether to shift the window by ``window_size // 2``.
            Default: False.
        qkv_bias (bool, optional): Enable bias for qkv if True. Default: True.
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        drop_rate (float, optional): Dropout rate. Default: 0.
        attn_drop_rate (float, optional): Attention dropout rate. Default: 0.
        drop_path_rate (float, optional): Stochastic depth rate. Default: 0.
        act_cfg (dict, optional): The config dict of activation function.
            Default: dict(type='GELU').
        norm_cfg (dict, optional): The config dict of normalization.
            Default: dict(type='LN').
        init_cfg (dict | list | None, optional): The init config.
            Default: None.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 feedforward_channels,
                 window_size=7,
                 shift=False,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 act_cfg=dict(type='GELU'),
                 norm_cfg=dict(type='LN'),
                 init_cfg=None):

        super(SwinBlock, self).__init__()

        self.init_cfg = init_cfg

        # Attention branch: pre-norm followed by (shifted) window attention.
        shift_size = window_size // 2 if shift else 0
        self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
        self.attn = ShiftWindowMSA(
            embed_dims=embed_dims,
            num_heads=num_heads,
            window_size=window_size,
            shift_size=shift_size,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop_rate=attn_drop_rate,
            proj_drop_rate=drop_rate,
            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
            init_cfg=None)

        # Feed-forward branch: pre-norm followed by a two-layer FFN.
        self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
        self.ffn = FFN(
            embed_dims=embed_dims,
            feedforward_channels=feedforward_channels,
            num_fcs=2,
            ffn_drop=drop_rate,
            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
            act_cfg=act_cfg,
            add_identity=True,
            init_cfg=None)

        # The spatial size is injected by the caller before every forward
        # pass so that ``forward`` can keep a single tensor argument.
        self.hw_shape = None

    def forward(self, x):
        """Apply attention and FFN, each with a residual connection.

        ``self.hw_shape`` must have been set to the (H, W) of ``x``.
        """
        shortcut = x
        x = self.attn(self.norm1(x), self.hw_shape) + shortcut

        # The FFN adds ``identity`` itself (add_identity=True).
        return self.ffn(self.norm2(x), identity=x)
+ drop_path_rate (float, optional): Stochastic depth rate. Default: 0.2. + downsample (BaseModule | None, optional): The downsample operation + module. Default: None. + act_cfg (dict, optional): The config dict of activation function. + Default: dict(type='GELU'). + norm_cfg (dict, optional): The config dict of nomalization. + Default: dict(type='LN'). + init_cfg (dict | list | None, optional): The init config. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + depth, + window_size=7, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + downsample=None, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + init_cfg=None, + with_cp=True): + super().__init__() + + self.init_cfg = init_cfg + + drop_path_rate = drop_path_rate if isinstance( + drop_path_rate, + list) else [deepcopy(drop_path_rate) for _ in range(depth)] + + self.blocks = ModuleList() + for i in range(depth): + block = SwinBlock( + embed_dims=embed_dims, + num_heads=num_heads, + feedforward_channels=feedforward_channels, + window_size=window_size, + shift=False if i % 2 == 0 else True, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rate[i], + act_cfg=act_cfg, + norm_cfg=norm_cfg, + init_cfg=None) + self.blocks.append(block) + + self.downsample = downsample + self.with_cp = with_cp + + def forward(self, x, hw_shape): + for block in self.blocks: + block.hw_shape=hw_shape + if self.with_cp: + x = checkpoint.checkpoint(block, x) + else: + x = block(x) + + if self.downsample: + x_down, down_hw_shape = self.downsample(x, hw_shape) + return x_down, down_hw_shape, x, hw_shape + else: + return x, hw_shape, x, hw_shape + + +@BACKBONES.register_module(force=True) +class SwinTransformer(BaseModule): + """ Swin Transformer + A PyTorch implement of : `Swin Transformer: + Hierarchical Vision Transformer using Shifted Windows` - + 
https://arxiv.org/abs/2103.14030 + + Inspiration from + https://github.com/microsoft/Swin-Transformer + + Args: + pretrain_img_size (int | tuple[int]): The size of input image when + pretrain. Defaults: 224. + in_channels (int): The num of input channels. + Defaults: 3. + embed_dims (int): The feature dimension. Default: 96. + patch_size (int | tuple[int]): Patch size. Default: 4. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + Default: 4. + depths (tuple[int]): Depths of each Swin Transformer stage. + Default: (2, 2, 6, 2). + num_heads (tuple[int]): Parallel attention heads of each Swin + Transformer stage. Default: (3, 6, 12, 24). + strides (tuple[int]): The patch merging or patch embedding stride of + each Swin Transformer stage. (In swin, we set kernel size equal to + stride.) Default: (4, 2, 2, 2). + out_indices (tuple[int]): Output from which stages. + Default: (0, 1, 2, 3). + qkv_bias (bool, optional): If True, add a learnable bias to query, key, + value. Default: True + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + patch_norm (bool): If add a norm layer for patch embed and patch + merging. Default: True. + drop_rate (float): Dropout rate. Defaults: 0. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Defaults: 0.1. + use_abs_pos_embed (bool): If True, add absolute position embedding to + the patch embedding. Defaults: False. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LN'). + norm_cfg (dict): Config dict for normalization layer at + output of backone. Defaults: dict(type='LN'). + pretrain_style (str): Choose to use official or mmcls pretrain weights. + Default: official. + pretrained (str, optional): model pretrained path. Default: None. + init_cfg (dict, optional): The Config for initialization. + Defaults to None. 
+ """ + + def __init__(self, + pretrain_img_size=224, + in_channels=3, + embed_dims=96, + patch_size=4, + window_size=7, + mlp_ratio=4, + depths=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), + strides=(4, 2, 2, 2), + out_indices=(0, 1, 2, 3), + qkv_bias=True, + qk_scale=None, + patch_norm=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + use_abs_pos_embed=False, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + pretrain_style='official', + pretrained=None, + init_cfg=None, + with_cp=True, + output_missing_index_as_none=False, + frozen_stages=-1): + super(SwinTransformer, self).__init__() + + if isinstance(pretrain_img_size, int): + pretrain_img_size = to_2tuple(pretrain_img_size) + elif isinstance(pretrain_img_size, tuple): + if len(pretrain_img_size) == 1: + pretrain_img_size = to_2tuple(pretrain_img_size[0]) + assert len(pretrain_img_size) == 2, \ + f'The size of image should have length 1 or 2, ' \ + f'but got {len(pretrain_img_size)}' + + assert pretrain_style in ['official', 'mmcls'], 'We only support load ' + 'official ckpt and mmcls ckpt.' + + if isinstance(pretrained, str) or pretrained is None: + warnings.warn('DeprecationWarning: pretrained is a deprecated, ' + 'please use "init_cfg" instead') + else: + raise TypeError('pretrained must be a str or None') + + num_layers = len(depths) + self.out_indices = out_indices + self.use_abs_pos_embed = use_abs_pos_embed + self.pretrain_style = pretrain_style + self.pretrained = pretrained + self.init_cfg = init_cfg + + self.frozen_stages = frozen_stages + + assert strides[0] == patch_size, 'Use non-overlapping patch embed.' 
+ + self.patch_embed = PatchEmbed( + in_channels=in_channels, + embed_dims=embed_dims, + conv_type='Conv2d', + kernel_size=patch_size, + stride=strides[0], + pad_to_patch_size=True, + norm_cfg=norm_cfg if patch_norm else None, + init_cfg=None) + + if self.use_abs_pos_embed: + patch_row = pretrain_img_size[0] // patch_size + patch_col = pretrain_img_size[1] // patch_size + num_patches = patch_row * patch_col + self.absolute_pos_embed = nn.Parameter( + torch.zeros((1, num_patches, embed_dims))) + + self.drop_after_pos = nn.Dropout(p=drop_rate) + + # stochastic depth + total_depth = sum(depths) + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, total_depth) + ] # stochastic depth decay rule + + self.stages = ModuleList() + in_channels = embed_dims + for i in range(num_layers): + if i < num_layers - 1: + downsample = PatchMerging( + in_channels=in_channels, + out_channels=2 * in_channels, + stride=strides[i + 1], + norm_cfg=norm_cfg if patch_norm else None, + init_cfg=None) + else: + downsample = None + + stage = SwinBlockSequence( + embed_dims=in_channels, + num_heads=num_heads[i], + feedforward_channels=mlp_ratio * in_channels, + depth=depths[i], + window_size=window_size, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[:depths[i]], + downsample=downsample, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + init_cfg=None, + with_cp=with_cp) + self.stages.append(stage) + + dpr = dpr[depths[i]:] + if downsample: + in_channels = downsample.out_channels + + self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)] + # Add a norm layer for each output + for i in out_indices: + layer = build_norm_layer(norm_cfg, self.num_features[i])[1] + layer_name = f'norm{i}' + self.add_module(layer_name, layer) + self.output_missing_index_as_none = output_missing_index_as_none + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param 
in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1 and self.use_abs_pos_embed: + self.absolute_pos_embed.requires_grad = False + + if self.frozen_stages >= 2: + self.drop_after_pos.eval() + for i in range(0, self.frozen_stages - 1): + m = self.stages[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self): + if self.pretrained is None: + super().init_weights() + if self.use_abs_pos_embed: + trunc_normal_init(self.absolute_pos_embed, std=0.02) + for m in self.modules(): + if isinstance(m, Linear): + trunc_normal_init(m.weight, std=.02) + if m.bias is not None: + constant_init(m.bias, 0) + elif isinstance(m, LayerNorm): + constant_init(m.bias, 0) + constant_init(m.weight, 1.0) + elif isinstance(self.pretrained, str): + logger = get_root_logger() + ckpt = _load_checkpoint( + self.pretrained, logger=logger, map_location='cpu') + if 'state_dict' in ckpt: + state_dict = ckpt['state_dict'] + elif 'model' in ckpt: + state_dict = ckpt['model'] + else: + state_dict = ckpt + + if self.pretrain_style == 'official': + state_dict = swin_convert(state_dict) + + # strip prefix of state_dict + if list(state_dict.keys())[0].startswith('module.'): + state_dict = {k[7:]: v for k, v in state_dict.items()} + + # reshape absolute position embedding + if state_dict.get('absolute_pos_embed') is not None: + absolute_pos_embed = state_dict['absolute_pos_embed'] + N1, L, C1 = absolute_pos_embed.size() + N2, C2, H, W = self.absolute_pos_embed.size() + if N1 != N2 or C1 != C2 or L != H * W: + logger.warning('Error in loading absolute_pos_embed, pass') + else: + state_dict['absolute_pos_embed'] = absolute_pos_embed.view( + N2, H, W, C2).permute(0, 3, 1, 2).contiguous() + + # interpolate position bias table if needed + relative_position_bias_table_keys = [ + k for k in state_dict.keys() + if 'relative_position_bias_table' in k + ] + for table_key in relative_position_bias_table_keys: + table_pretrained = 
state_dict[table_key] + table_current = self.state_dict()[table_key] + L1, nH1 = table_pretrained.size() + L2, nH2 = table_current.size() + if nH1 != nH2: + logger.warning(f'Error in loading {table_key}, pass') + else: + if L1 != L2: + S1 = int(L1**0.5) + S2 = int(L2**0.5) + table_pretrained_resized = resize( + table_pretrained.permute(1, 0).reshape( + 1, nH1, S1, S1), + size=(S2, S2), + mode='bicubic') + state_dict[table_key] = table_pretrained_resized.view( + nH2, L2).permute(1, 0).contiguous() + + # load state_dict + self.load_state_dict(state_dict, False) + + def forward(self, x): + x = self.patch_embed(x) + + hw_shape = (self.patch_embed.DH, self.patch_embed.DW) + if self.use_abs_pos_embed: + x = x + self.absolute_pos_embed + x = self.drop_after_pos(x) + + outs = [] + for i, stage in enumerate(self.stages): + x, hw_shape, out, out_hw_shape = stage(x, hw_shape) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + out = norm_layer(out) + out = out.view(-1, *out_hw_shape, + self.num_features[i]).permute(0, 3, 1, + 2).contiguous() + outs.append(out) + elif self.output_missing_index_as_none: + outs.append(None) + return outs + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + freezed.""" + super(SwinTransformer, self).train(mode) + self._freeze_stages() \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/vovnet.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/vovnet.py new file mode 100644 index 0000000000000000000000000000000000000000..879d186a37b49addaf27362cc6ae1e5465b2168e --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/vovnet.py @@ -0,0 +1,375 @@ + +from collections import OrderedDict +from mmcv.runner import BaseModule +from mmdet.models.builder import BACKBONES +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.batchnorm import _BatchNorm + + 
+VoVNet19_slim_dw_eSE = { + 'stem': [64, 64, 64], + 'stage_conv_ch': [64, 80, 96, 112], + 'stage_out_ch': [112, 256, 384, 512], + "layer_per_block": 3, + "block_per_stage": [1, 1, 1, 1], + "eSE": True, + "dw": True +} + +VoVNet19_dw_eSE = { + 'stem': [64, 64, 64], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 3, + "block_per_stage": [1, 1, 1, 1], + "eSE": True, + "dw": True +} + +VoVNet19_slim_eSE = { + 'stem': [64, 64, 128], + 'stage_conv_ch': [64, 80, 96, 112], + 'stage_out_ch': [112, 256, 384, 512], + 'layer_per_block': 3, + 'block_per_stage': [1, 1, 1, 1], + 'eSE': True, + "dw": False +} + +VoVNet19_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 3, + "block_per_stage": [1, 1, 1, 1], + "eSE": True, + "dw": False +} + +VoVNet39_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 5, + "block_per_stage": [1, 1, 2, 2], + "eSE": True, + "dw": False +} + +VoVNet57_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 5, + "block_per_stage": [1, 1, 4, 3], + "eSE": True, + "dw": False +} + +VoVNet99_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 5, + "block_per_stage": [1, 3, 9, 3], + "eSE": True, + "dw": False +} + +_STAGE_SPECS = { + "V-19-slim-dw-eSE": VoVNet19_slim_dw_eSE, + "V-19-dw-eSE": VoVNet19_dw_eSE, + "V-19-slim-eSE": VoVNet19_slim_eSE, + "V-19-eSE": VoVNet19_eSE, + "V-39-eSE": VoVNet39_eSE, + "V-57-eSE": VoVNet57_eSE, + "V-99-eSE": VoVNet99_eSE, +} + + +def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1): + """3x3 convolution with padding""" + return [ + ( + '{}_{}/dw_conv3x3'.format(module_name, postfix), + 
nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=out_channels, + bias=False + ) + ), + ( + '{}_{}/pw_conv1x1'.format(module_name, postfix), + nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False) + ), + ('{}_{}/pw_norm'.format(module_name, postfix), nn.BatchNorm2d(out_channels)), + ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)), + ] + + +def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1): + """3x3 convolution with padding""" + return [ + ( + f"{module_name}_{postfix}/conv", + nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + ), + ), + (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), + (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), + ] + + +def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0): + """1x1 convolution with padding""" + return [ + ( + f"{module_name}_{postfix}/conv", + nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + ), + ), + (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), + (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), + ] + + +class Hsigmoid(nn.Module): + def __init__(self, inplace=True): + super(Hsigmoid, self).__init__() + self.inplace = inplace + + def forward(self, x): + return F.relu6(x + 3.0, inplace=self.inplace) / 6.0 + + +class eSEModule(nn.Module): + def __init__(self, channel, reduction=4): + super(eSEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0) + self.hsigmoid = Hsigmoid() + + def forward(self, x): + input = x + x = self.avg_pool(x) + x = self.fc(x) + x = self.hsigmoid(x) + return input * x 
class _OSA_module(nn.Module):
    """One-shot aggregation block.

    A chain of 3x3 conv layers is applied to the input; the input and every
    intermediate output are concatenated, fused by a 1x1 conv and gated by
    an eSE module. ``SE`` is accepted but not consulted here: the eSE gate
    is always built and applied.
    """

    def __init__(
        self, in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE=False, identity=False, depthwise=False
    ):

        super(_OSA_module, self).__init__()

        self.identity = identity
        self.depthwise = depthwise
        self.isReduced = False
        self.layers = nn.ModuleList()

        # Depthwise layers need matching channel counts, so reduce first.
        if depthwise and in_ch != stage_ch:
            self.isReduced = True
            reduction_name = "{}_reduction".format(module_name)
            self.conv_reduction = nn.Sequential(
                OrderedDict(conv1x1(in_ch, stage_ch, reduction_name, "0"))
            )

        for idx in range(layer_per_block):
            if depthwise:
                named_layers = dw_conv3x3(stage_ch, stage_ch, module_name, idx)
            else:
                # Only the first plain conv sees the block input width.
                layer_in = in_ch if idx == 0 else stage_ch
                named_layers = conv3x3(layer_in, stage_ch, module_name, idx)
            self.layers.append(nn.Sequential(OrderedDict(named_layers)))

        # feature aggregation: block input plus every layer output
        aggregated_ch = in_ch + layer_per_block * stage_ch
        self.concat = nn.Sequential(OrderedDict(conv1x1(aggregated_ch, concat_ch, module_name, "concat")))

        self.ese = eSEModule(concat_ch)

    def forward(self, x):
        """Aggregate all intermediate features; add the input if ``identity``."""
        shortcut = x
        collected = [x]

        if self.depthwise and self.isReduced:
            x = self.conv_reduction(x)
        for layer in self.layers:
            x = layer(x)
            collected.append(x)

        fused = self.concat(torch.cat(collected, dim=1))
        fused = self.ese(fused)

        return fused + shortcut if self.identity else fused
@BACKBONES.register_module()
class VoVNet(BaseModule):
    """VoVNet backbone built from a stem and four OSA stages.

    ``forward`` returns a dict that maps each requested feature name
    ("stem", "stage2" ... "stage5") to its feature map.
    """

    def __init__(self, spec_name, input_ch=3, out_features=None,
                 frozen_stages=-1, norm_eval=True, pretrained=None, init_cfg=None):
        """
        Args:
            spec_name (str): Key into ``_STAGE_SPECS``, e.g. "V-99-eSE".
            input_ch(int) : the number of input channel
            out_features (list[str]): name of the layers whose outputs should
                be returned in forward. Can be anything in "stem", "stage2" ...
                NOTE(review): ``forward`` tests membership in this value, so
                leaving the default ``None`` fails there - pass a list.
            frozen_stages (int): Stem is frozen when >= 0; stage ``i + 1`` is
                frozen for every ``i`` in ``1..frozen_stages``.
            norm_eval (bool): Keep BatchNorm layers in eval mode while
                training.
            pretrained (str, optional): Deprecated checkpoint path; converted
                into a 'Pretrained' ``init_cfg``.
            init_cfg (dict, optional): Initialization config.
        """
        super(VoVNet, self).__init__(init_cfg)
        self.frozen_stages = frozen_stages
        self.norm_eval = norm_eval

        if isinstance(pretrained, str):
            # ``warnings`` is not imported at module level in this file, so
            # import it here; otherwise this branch raises NameError.
            import warnings
            warnings.warn('DeprecationWarning: pretrained is deprecated, '
                          'please use "init_cfg" instead')
            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
        stage_specs = _STAGE_SPECS[spec_name]

        stem_ch = stage_specs["stem"]
        config_stage_ch = stage_specs["stage_conv_ch"]
        config_concat_ch = stage_specs["stage_out_ch"]
        block_per_stage = stage_specs["block_per_stage"]
        layer_per_block = stage_specs["layer_per_block"]
        SE = stage_specs["eSE"]
        depthwise = stage_specs["dw"]

        self._out_features = out_features

        # Stem module: three convs, the first and last with stride 2.
        conv_type = dw_conv3x3 if depthwise else conv3x3
        stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2)
        stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1)
        stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2)
        self.add_module("stem", nn.Sequential((OrderedDict(stem))))
        current_stride = 4
        self._out_feature_strides = {"stem": current_stride, "stage2": current_stride}
        self._out_feature_channels = {"stem": stem_ch[2]}

        stem_out_ch = [stem_ch[2]]
        in_ch_list = stem_out_ch + config_concat_ch[:-1]
        # OSA stages
        self.stage_names = []
        for i in range(4):  # num_stages
            name = "stage%d" % (i + 2)  # stage 2 ... stage 5
            self.stage_names.append(name)
            self.add_module(
                name,
                _OSA_stage(
                    in_ch_list[i],
                    config_stage_ch[i],
                    config_concat_ch[i],
                    block_per_stage[i],
                    layer_per_block,
                    i + 2,
                    SE,
                    depthwise,
                ),
            )

            self._out_feature_channels[name] = config_concat_ch[i]
            # stage2 keeps the stem stride; every later stage doubles it.
            if i != 0:
                current_stride = int(current_stride * 2)
                self._out_feature_strides[name] = current_stride

        # initialize weights
        # self._initialize_weights()

    def _initialize_weights(self):
        """Apply Kaiming-normal init to every Conv2d weight."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)

    def forward(self, x):
        """Return ``{feature_name: tensor}`` for the requested features."""
        outputs = {}
        x = self.stem(x)
        if "stem" in self._out_features:
            outputs["stem"] = x
        for name in self.stage_names:
            x = getattr(self, name)(x)
            if name in self._out_features:
                outputs[name] = x

        return outputs

    def _freeze_stages(self):
        """Put frozen parts in eval mode and disable their gradients."""
        if self.frozen_stages >= 0:
            m = getattr(self, 'stem')
            m.eval()
            for param in m.parameters():
                param.requires_grad = False

        for i in range(1, self.frozen_stages + 1):
            m = getattr(self, f'stage{i+1}')
            m.eval()
            for param in m.parameters():
                param.requires_grad = False

    def train(self, mode=True):
        """Convert the model into training mode while keep normalization layer
        freezed."""
        super(VoVNet, self).train(mode)
        self._freeze_stages()
        if mode and self.norm_eval:
            for m in self.modules():
                # trick: eval have effect on BatchNorm only
                if isinstance(m, _BatchNorm):
                    m.eval()
@HOOKS.register_module()
class GradChecker(Hook):
    """Debug hook that reports trainable parameters left without a gradient.

    After every training iteration each parameter of ``runner.model`` that
    has ``requires_grad`` set but no ``.grad`` is printed.
    """

    def after_train_iter(self, runner):
        for key, val in runner.model.named_parameters():
            # Test the cheap flag first and use an identity check: ``==``
            # against None on a tensor goes through tensor comparison
            # dispatch instead of simply testing for a missing gradient.
            if val.requires_grad and val.grad is None:
                print('WARNNING: {key}\'s parameters are not be used!!!!'.format(key=key))
+ + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay coefficient (default: 1e-2) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _Decoupled Weight Decay Regularization: + https://arxiv.org/abs/1711.05101 + .. _On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=1e-2, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + super(AdamW2, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW2, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + state_sums = [] + max_exp_avg_sqs = [] + state_steps = [] + amsgrad = group['amsgrad'] + + # put this line here for solving bug + beta1, beta2 = group['betas'] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError('AdamW does not support sparse gradients') + grads.append(p.grad) + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + + if amsgrad: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step']) + + F.adamw(params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad, + beta1, + beta2, + group['lr'], + group['weight_decay'], + group['eps']) + + return loss \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..921aee7c215a2e2546e943589a85949c2eee7b05 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/__init__.py @@ -0,0 +1,10 @@ + +from .bricks 
import run_time +from .grid_mask import GridMask +from .position_embedding import RelPositionEmbedding +from .visual import save_tensor +from .inverted_residual import InvertedResidual +from .se_layer import DyReLU, SELayer +from .make_divisible import make_divisible +from .ckpt_convert import swin_convert, vit_convert +from .embed import PatchEmbed \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/bricks.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/bricks.py new file mode 100644 index 0000000000000000000000000000000000000000..fd458813d9ffced23b79799daa84150ba887774e --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/bricks.py @@ -0,0 +1,20 @@ +import functools +import time +from collections import defaultdict +import torch +time_maps = defaultdict(lambda :0.) +count_maps = defaultdict(lambda :0.) +def run_time(name): + def middle(fn): + def wrapper(*args, **kwargs): + torch.cuda.synchronize() + start = time.time() + res = fn(*args, **kwargs) + torch.cuda.synchronize() + time_maps['%s : %s'%(name, fn.__name__) ] += time.time()-start + count_maps['%s : %s'%(name, fn.__name__) ] +=1 + print("%s : %s takes up %f "% (name, fn.__name__,time_maps['%s : %s'%(name, fn.__name__) ] /count_maps['%s : %s'%(name, fn.__name__) ] )) + return res + return wrapper + return middle + \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/ckpt_convert.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/ckpt_convert.py new file mode 100644 index 0000000000000000000000000000000000000000..fd4632065ce2109376bebef36e0532c5115125f8 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/ckpt_convert.py @@ -0,0 +1,91 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from collections import OrderedDict + + +def swin_convert(ckpt): + new_ckpt = OrderedDict() + + def correct_unfold_reduction_order(x): + out_channel, in_channel = x.shape + x = x.reshape(out_channel, 4, in_channel // 4) + x = x[:, [0, 2, 1, 3], :].transpose(1, + 2).reshape(out_channel, in_channel) + return x + + def correct_unfold_norm_order(x): + in_channel = x.shape[0] + x = x.reshape(4, in_channel // 4) + x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel) + return x + + for k, v in ckpt.items(): + if k.startswith('head'): + continue + elif k.startswith('layers'): + new_v = v + if 'attn.' in k: + new_k = k.replace('attn.', 'attn.w_msa.') + elif 'mlp.' in k: + if 'mlp.fc1.' in k: + new_k = k.replace('mlp.fc1.', 'ffn.layers.0.0.') + elif 'mlp.fc2.' in k: + new_k = k.replace('mlp.fc2.', 'ffn.layers.1.') + else: + new_k = k.replace('mlp.', 'ffn.') + elif 'downsample' in k: + new_k = k + if 'reduction.' in k: + new_v = correct_unfold_reduction_order(v) + elif 'norm.' in k: + new_v = correct_unfold_norm_order(v) + else: + new_k = k + new_k = new_k.replace('layers', 'stages', 1) + elif k.startswith('patch_embed'): + new_v = v + if 'proj' in k: + new_k = k.replace('proj', 'projection') + else: + new_k = k + else: + new_v = v + new_k = k + + new_ckpt[new_k] = new_v + + return new_ckpt + + +def vit_convert(ckpt): + + new_ckpt = OrderedDict() + + for k, v in ckpt.items(): + if k.startswith('head'): + continue + if k.startswith('norm'): + new_k = k.replace('norm.', 'ln1.') + elif k.startswith('patch_embed'): + if 'proj' in k: + new_k = k.replace('proj', 'projection') + else: + new_k = k + elif k.startswith('blocks'): + if 'norm' in k: + new_k = k.replace('norm', 'ln') + elif 'mlp.fc1' in k: + new_k = k.replace('mlp.fc1', 'ffn.layers.0.0') + elif 'mlp.fc2' in k: + new_k = k.replace('mlp.fc2', 'ffn.layers.1') + elif 'attn.qkv' in k: + new_k = k.replace('attn.qkv.', 'attn.attn.in_proj_') + elif 'attn.proj' in k: + new_k = k.replace('attn.proj', 'attn.attn.out_proj') + 
else: + new_k = k + new_k = new_k.replace('blocks.', 'layers.') + else: + new_k = k + new_ckpt[new_k] = v + + return new_ckpt diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/embed.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/embed.py new file mode 100644 index 0000000000000000000000000000000000000000..c0cf143488eafcd1dcd9e80f824807aa47de34fd --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/embed.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn.functional as F +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmcv.runner.base_module import BaseModule +from torch.nn.modules.utils import _pair as to_2tuple + + +# Modified from Pytorch-Image-Models +class PatchEmbed(BaseModule): + """Image to Patch Embedding V2. + + We use a conv layer to implement PatchEmbed. + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (dict, optional): The config dict for conv layers type + selection. Default: None. + kernel_size (int): The kernel_size of embedding conv. Default: 16. + stride (int): The slide stride of embedding conv. + Default: None (Default to be equal with kernel_size). + padding (int): The padding length of embedding conv. Default: 0. + dilation (int): The dilation rate of embedding conv. Default: 1. + pad_to_patch_size (bool, optional): Whether to pad feature map shape + to multiple patch size. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + in_channels=3, + embed_dims=768, + conv_type=None, + kernel_size=16, + stride=16, + padding=0, + dilation=1, + pad_to_patch_size=True, + norm_cfg=None, + init_cfg=None): + super(PatchEmbed, self).__init__() + + self.embed_dims = embed_dims + self.init_cfg = init_cfg + + if stride is None: + stride = kernel_size + + self.pad_to_patch_size = pad_to_patch_size + + # The default setting of patch size is equal to kernel size. + patch_size = kernel_size + if isinstance(patch_size, int): + patch_size = to_2tuple(patch_size) + elif isinstance(patch_size, tuple): + if len(patch_size) == 1: + patch_size = to_2tuple(patch_size[0]) + assert len(patch_size) == 2, \ + f'The size of patch should have length 1 or 2, ' \ + f'but got {len(patch_size)}' + + self.patch_size = patch_size + + # Use conv layer to embed + conv_type = conv_type or 'Conv2d' + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + def forward(self, x): + H, W = x.shape[2], x.shape[3] + + # TODO: Process overlapping op + if self.pad_to_patch_size: + # Modify H, W to multiple of patch size. 
+ if H % self.patch_size[0] != 0: + x = F.pad( + x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + if W % self.patch_size[1] != 0: + x = F.pad( + x, (0, self.patch_size[1] - W % self.patch_size[1], 0, 0)) + + x = self.projection(x) + self.DH, self.DW = x.shape[2], x.shape[3] + x = x.flatten(2).transpose(1, 2) + + if self.norm is not None: + x = self.norm(x) + + return x diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/grid_mask.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/grid_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..3d04b2c470a24b55fd5a60ca6c679fa9710bc1a7 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/grid_mask.py @@ -0,0 +1,124 @@ +import torch +import torch.nn as nn +import numpy as np +from PIL import Image +from mmcv.runner import force_fp32, auto_fp16 + +class Grid(object): + def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode=mode + self.st_prob = prob + self.prob = prob + + def set_prob(self, epoch, max_epoch): + self.prob = self.st_prob * epoch / max_epoch + + def __call__(self, img, label): + if np.random.rand() > self.prob: + return img, label + h = img.size(1) + w = img.size(2) + self.d1 = 2 + self.d2 = min(h, w) + hh = int(1.5*h) + ww = int(1.5*w) + d = np.random.randint(self.d1, self.d2) + if self.ratio == 1: + self.l = np.random.randint(1, d) + else: + self.l = min(max(int(d*self.ratio+0.5),1),d-1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh//d): + s = d*i + st_h + t = min(s+self.l, hh) + mask[s:t,:] *= 0 + if self.use_w: + for i in range(ww//d): + s = d*i + st_w + t = min(s+self.l, ww) + mask[:,s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + 
mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + + mask = torch.from_numpy(mask).float() + if self.mode == 1: + mask = 1-mask + + mask = mask.expand_as(img) + if self.offset: + offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float() + offset = (1 - mask) * offset + img = img * mask + offset + else: + img = img * mask + + return img, label + + +class GridMask(nn.Module): + def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + super(GridMask, self).__init__() + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode = mode + self.st_prob = prob + self.prob = prob + self.fp16_enable = False + def set_prob(self, epoch, max_epoch): + self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5 + @auto_fp16() + def forward(self, x): + if np.random.rand() > self.prob or not self.training: + return x + n,c,h,w = x.size() + x = x.view(-1,h,w) + hh = int(1.5*h) + ww = int(1.5*w) + d = np.random.randint(2, h) + self.l = min(max(int(d*self.ratio+0.5),1),d-1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh//d): + s = d*i + st_h + t = min(s+self.l, hh) + mask[s:t,:] *= 0 + if self.use_w: + for i in range(ww//d): + s = d*i + st_w + t = min(s+self.l, ww) + mask[:,s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + + mask = torch.from_numpy(mask).to(x.dtype).cuda() + if self.mode == 1: + mask = 1-mask + mask = mask.expand_as(x) + if self.offset: + offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).to(x.dtype).cuda() + x = x * mask + offset * (1 - mask) + else: + x = x * mask + + return x.view(n,c,h,w) \ No newline at end of file diff --git 
a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/inverted_residual.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/inverted_residual.py new file mode 100644 index 0000000000000000000000000000000000000000..093c8efe85a24ea26c9fd18d6b9cd14c7ce68779 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/inverted_residual.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule +from mmcv.cnn.bricks import DropPath +from mmcv.runner import BaseModule + +from .se_layer import SELayer + + +class InvertedResidual(BaseModule): + """Inverted Residual Block. + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + mid_channels (int): The input channels of the depthwise convolution. + kernel_size (int): The kernel size of the depthwise convolution. + Default: 3. + stride (int): The stride of the depthwise convolution. Default: 1. + se_cfg (dict): Config dict for se layer. Default: None, which means no + se layer. + with_expand_conv (bool): Use expand conv or not. If set False, + mid_channels must be the same with in_channels. + Default: True. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + drop_path_rate (float): stochastic depth rate. Defaults to 0. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + Returns: + Tensor: The output tensor. 
+ """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + stride=1, + se_cfg=None, + with_expand_conv=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + drop_path_rate=0., + with_cp=False, + init_cfg=None): + super(InvertedResidual, self).__init__(init_cfg) + self.with_res_shortcut = (stride == 1 and in_channels == out_channels) + assert stride in [1, 2], f'stride must in [1, 2]. ' \ + f'But received {stride}.' + self.with_cp = with_cp + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0 else nn.Identity() + self.with_se = se_cfg is not None + self.with_expand_conv = with_expand_conv + + if self.with_se: + assert isinstance(se_cfg, dict) + if not self.with_expand_conv: + assert mid_channels == in_channels + + if self.with_expand_conv: + self.expand_conv = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.depthwise_conv = ConvModule( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=mid_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if self.with_se: + self.se = SELayer(**se_cfg) + + self.linear_conv = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x): + + def _inner_forward(x): + out = x + + if self.with_expand_conv: + out = self.expand_conv(out) + + out = self.depthwise_conv(out) + + if self.with_se: + out = self.se(out) + + out = self.linear_conv(out) + + if self.with_res_shortcut: + return x + self.drop_path(out) + else: + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out \ No newline at end of file diff --git 
a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/make_divisible.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/make_divisible.py new file mode 100644 index 0000000000000000000000000000000000000000..5bbc0a6c5b7cef48a9171ef4870201f2aa6cad85 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/make_divisible.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +def make_divisible(value, divisor, min_value=None, min_ratio=0.9): + """Make divisible function. + This function rounds the channel number to the nearest value that can be + divisible by the divisor. It is taken from the original tf repo. It ensures + that all layers have a channel number that is divisible by divisor. It can + be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py # noqa + Args: + value (int): The original channel number. + divisor (int): The divisor to fully divide the channel number. + min_value (int): The minimum value of the output channel. + Default: None, means that the minimum value equal to the divisor. + min_ratio (float): The minimum ratio of the rounded channel number to + the original channel number. Default: 0.9. + Returns: + int: The modified output channel number. + """ + + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than (1-min_ratio). 
+ if new_value < min_ratio * value: + new_value += divisor + return new_value \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/position_embedding.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/position_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..290110fef7cb86c5edafb0b33da3bed794e6f7a9 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/position_embedding.py @@ -0,0 +1,34 @@ +import torch +import torch.nn as nn +import math + +class RelPositionEmbedding(nn.Module): + def __init__(self, num_pos_feats=64, pos_norm=True): + super().__init__() + self.num_pos_feats = num_pos_feats + self.fc = nn.Linear(4, self.num_pos_feats,bias=False) + #nn.init.orthogonal_(self.fc.weight) + #self.fc.weight.requires_grad = False + self.pos_norm = pos_norm + if self.pos_norm: + self.norm = nn.LayerNorm(self.num_pos_feats) + def forward(self, tensor): + #mask = nesttensor.mask + B,C,H,W = tensor.shape + #print('tensor.shape', tensor.shape) + y_range = (torch.arange(H) / float(H - 1)).to(tensor.device) + #y_axis = torch.stack((y_range, 1-y_range),dim=1) + y_axis = torch.stack((torch.cos(y_range * math.pi), torch.sin(y_range * math.pi)), dim=1) + y_axis = y_axis.reshape(H, 1, 2).repeat(1, W, 1).reshape(H * W, 2) + + x_range = (torch.arange(W) / float(W - 1)).to(tensor.device) + #x_axis =torch.stack((x_range,1-x_range),dim=1) + x_axis = torch.stack((torch.cos(x_range * math.pi), torch.sin(x_range * math.pi)), dim=1) + x_axis = x_axis.reshape(1, W, 2).repeat(H, 1, 1).reshape(H * W, 2) + x_pos = torch.cat((y_axis, x_axis), dim=1) + x_pos = self.fc(x_pos) + + if self.pos_norm: + x_pos = self.norm(x_pos) + #print('xpos,', x_pos.max(),x_pos.min()) + return x_pos \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/se_layer.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/se_layer.py new file mode 100644 index 
0000000000000000000000000000000000000000..86d10615f5a8606909af4e6af4163cf97ad73cc1 --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/se_layer.py @@ -0,0 +1,124 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule + + +class SELayer(BaseModule): + """Squeeze-and-Excitation Module. + Args: + channels (int): The input (and output) channels of the SE layer. + ratio (int): Squeeze ratio in SELayer, the intermediate channel will be + ``int(channels/ratio)``. Default: 16. + conv_cfg (None or dict): Config dict for convolution layer. + Default: None, which means using conv2d. + act_cfg (dict or Sequence[dict]): Config dict for activation layer. + If act_cfg is a dict, two activation layers will be configurated + by this dict. If act_cfg is a sequence of dicts, the first + activation layer will be configurated by the first dict and the + second activation layer will be configurated by the second dict. + Default: (dict(type='ReLU'), dict(type='Sigmoid')) + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + """ + + def __init__(self, + channels, + ratio=16, + conv_cfg=None, + act_cfg=(dict(type='ReLU'), dict(type='Sigmoid')), + init_cfg=None): + super(SELayer, self).__init__(init_cfg) + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + out = self.global_avgpool(x) + out = self.conv1(out) + out = self.conv2(out) + return x * out + + +class DyReLU(BaseModule): + """Dynamic ReLU (DyReLU) module. + See `Dynamic ReLU `_ for details. + Current implementation is specialized for task-aware attention in DyHead. + HSigmoid arguments in default act_cfg follow DyHead official code. + https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py + Args: + channels (int): The input (and output) channels of DyReLU module. + ratio (int): Squeeze ratio in Squeeze-and-Excitation-like module, + the intermediate channel will be ``int(channels/ratio)``. + Default: 4. + conv_cfg (None or dict): Config dict for convolution layer. + Default: None, which means using conv2d. + act_cfg (dict or Sequence[dict]): Config dict for activation layer. + If act_cfg is a dict, two activation layers will be configurated + by this dict. If act_cfg is a sequence of dicts, the first + activation layer will be configurated by the first dict and the + second activation layer will be configurated by the second dict. + Default: (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0, + divisor=6.0)) + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + """ + + def __init__(self, + channels, + ratio=4, + conv_cfg=None, + act_cfg=(dict(type='ReLU'), + dict(type='HSigmoid', bias=3.0, divisor=6.0)), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.channels = channels + self.expansion = 4 # for a1, b1, a2, b2 + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels * self.expansion, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + """Forward function.""" + coeffs = self.global_avgpool(x) + coeffs = self.conv1(coeffs) + coeffs = self.conv2(coeffs) - 0.5 # value range: [-0.5, 0.5] + a1, b1, a2, b2 = torch.split(coeffs, self.channels, dim=1) + a1 = a1 * 2.0 + 1.0 # [-1.0, 1.0] + 1.0 + a2 = a2 * 2.0 # [-1.0, 1.0] + out = torch.max(x * a1 + b1, x * a2 + b2) + return \ No newline at end of file diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/visual.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/visual.py new file mode 100644 index 0000000000000000000000000000000000000000..f9718afea9e67199c77da8ecf33249a28197082a --- /dev/null +++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/visual.py @@ -0,0 +1,24 @@ +import torch +from torchvision.utils import make_grid +import torchvision +import matplotlib.pyplot as plt +import cv2 + + +def convert_color(img_path): + plt.figure() + img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) + plt.imsave(img_path, img, cmap=plt.get_cmap('viridis')) + plt.close() + + +def save_tensor(tensor, path, pad_value=254.0,): + print('save_tensor', path) + tensor = tensor.to(torch.float).detach().cpu() + if 
tensor.type() == 'torch.BoolTensor': + tensor = tensor*255 + if len(tensor.shape) == 3: + tensor = tensor.unsqueeze(1) + tensor = make_grid(tensor, pad_value=pad_value, normalize=False).permute(1, 2, 0).numpy().copy() + torchvision.utils.save_image(torch.tensor(tensor).permute(2, 0, 1), path) + convert_color(path) diff --git a/model_examples/MapTR/replace_patch.sh b/model_examples/MapTR/replace_patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..b55c28482d67d5a4f3fe6546932ad14e7be0ed0c --- /dev/null +++ b/model_examples/MapTR/replace_patch.sh @@ -0,0 +1,16 @@ +#!/bin/bash +for para in $* +do + if [[ $para == --packages_path* ]];then + packages_path=`echo ${para#*=}` + fi +done + +cp -f patch/mmcv/distributed.py mmcv/mmcv/parallel/distributed.py +cp -f patch/mmcv/_functions.py mmcv/mmcv/parallel/_functions.py +cp -f patch/mmcv/deform_conv.py mmcv/mmcv/ops/deform_conv.py +cp -f patch/mmdet3d/__init__.py mmdetection3d/mmdet3d/__init__.py +cp -f patch/mmdet/__init__.py ${packages_path}/mmdet/__init__.py +cp -f patch/mmdet/resnet.py ${packages_path}/mmdet/models/backbones/resnet.py +cp -f patch/mmseg/__init__.py ${packages_path}/mmseg/__init__.py +cp -f patch/nuscenes/data_classes.py ${packages_path}/nuscenes/eval/detection/data_classes.py \ No newline at end of file diff --git a/model_examples/MapTR/requirement.txt b/model_examples/MapTR/requirement.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfe7951695fd01b89fdd4d94ab6d5aa886a5f6de --- /dev/null +++ b/model_examples/MapTR/requirement.txt @@ -0,0 +1,10 @@ +torchaudio==0.10.0 +torchvision==0.10.0 +mmdet==2.14.0 +mmsegmentation==0.14.1 +timm +shapely==1.8.5.post1 +av2 +numba==0.58.1 +numpy==1.23.0 +ipython==8.12.3 \ No newline at end of file diff --git a/model_examples/MapTR/test/env_npu.sh b/model_examples/MapTR/test/env_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..d4195f46ca9e8de1dd2387a8fe618b48eaaf8498 --- /dev/null +++ 
b/model_examples/MapTR/test/env_npu.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+# Source the CANN set_env.sh, export NPU runtime tuning variables, lower the
+# device-side log level, and compute a library path list in path_lib.
+CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info'
+
+if [ -f "$CANN_INSTALL_PATH_CONF" ]; then
+    CANN_INSTALL_PATH=$(cat $CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2)
+else
+    CANN_INSTALL_PATH="/usr/local/Ascend"
+fi
+
+if [ -d "${CANN_INSTALL_PATH}/ascend-toolkit/latest" ]; then
+    source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh
+else
+    source ${CANN_INSTALL_PATH}/nnae/set_env.sh
+fi
+
+# Bind processes to CPU cores
+export CPU_AFFINITY_CONF=1
+# Whether to enable the task queue
+export TASK_QUEUE_ENABLE=2
+# Set the shape data cache
+export HOST_CACHE_CAPACITY=20
+# Enable the combined flag
+export COMBINED_ENABLE=1
+# Enable the expandable-segments memory allocation strategy
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+
+# Set the device-side log level to error
+msnpureport -g error -d 0
+msnpureport -g error -d 1
+msnpureport -g error -d 2
+msnpureport -g error -d 3
+msnpureport -g error -d 4
+msnpureport -g error -d 5
+msnpureport -g error -d 6
+msnpureport -g error -d 7
+# Disable device-side Event logs
+msnpureport -e disable
+
+path_lib=$(python3 -c """
+import sys
+import re
+result=''
+for index in range(len(sys.path)):
+    match_sit = re.search('-packages', sys.path[index])
+    if match_sit is not None:
+        match_lib = re.search('lib', sys.path[index])
+
+        if match_lib is not None:
+            end=match_lib.span()[1]
+            result += sys.path[index][0:end] + ':'
+
+        result+=sys.path[index] + '/torch/lib:'
+print(result)"""
+)
diff --git a/model_examples/MapTR/test/train_8p.sh b/model_examples/MapTR/test/train_8p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..defe40e8a9d99010e673694a95a88f34ed875455
--- /dev/null
+++ b/model_examples/MapTR/test/train_8p.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Launch 8-device MapTR training in the background, wait for it, then parse the
+# training log into ${CaseName}.log and ${CaseName}_perf_report.log.
+################ Basic configuration; review and adjust per model ##################
+# Network name, same as the directory name
+Network="MapTR"
+WORLD_SIZE=8
+WORK_DIR=""
+LOAD_FROM=""
+
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+
+############### Set the directory the training script runs from ###############
+# cd to the directory that contains the test folder before running, for compatibility; test_path_dir points at the test folder itself
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ]; then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=$(pwd)
+else
+    test_path_dir=${cur_path}/test
+fi
+
+ASCEND_DEVICE_ID=0
+
+if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ]; then
+    rm -rf ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+else
+    mkdir -p ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+fi
+
+start_time=$(date +%s)
+# Source the environment variables unless running on the platform (etp_running_flag=true)
+check_etp_flag=$(env | grep etp_running_flag)
+etp_flag=$(echo ${check_etp_flag#*=})
+if [ x"${etp_flag}" != x"true" ]; then
+    source ${test_path_dir}/env_npu.sh
+fi
+
+bash ./tools/dist_train.sh ./projects/configs/maptr/maptr_tiny_r50_24e_bevformer.py 8 \
+    >$cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+wait
+
+
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Training case info; no need to modify
+BatchSize=8
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${WORLD_SIZE}'p'_'acc'
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Performance metric (average iteration time parsed from the log); review per model
+avg_time=`grep -a 'mmdet - INFO - Epoch ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "time: " '{print $2}' | awk -F ", " '{print $1}' | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'`
+Iteration_time=$avg_time
+# Print; no need to modify
+echo "Iteration time : $Iteration_time"
+
+# Training accuracy (mAP) from the last validation log line; review per model
+mAP=$(grep -a "mmdet - INFO - Epoch(val)" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |tail -1|awk -F "mAP: " '{print $2}' |awk -F ", " '{print $1}'| awk -F ", " '{print $1}')
+
+# Print; no need to modify
+echo "mAP : ${mAP}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Total training time
+TrainingTime=`grep -a 'Time' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "Time: " '{print $2}'|awk -F "," '{print $1}'| awk '{a+=$1} END {printf("%.3f",a)}'`
+
+# Write key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" >${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${WORLD_SIZE}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainmAP = ${mAP}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "Iteration time = ${Iteration_time}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "TrainingTime = ${TrainingTime}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
\ No newline at end of file
diff --git a/model_examples/MapTR/test/train_8p_performance.sh b/model_examples/MapTR/test/train_8p_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..defe40e8a9d99010e673694a95a88f34ed875455
--- /dev/null
+++ b/model_examples/MapTR/test/train_8p_performance.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Launch 8-device MapTR training in the background, wait for it, then parse the
+# training log into ${CaseName}.log and ${CaseName}_perf_report.log.
+################ Basic configuration; review and adjust per model ##################
+# Network name, same as the directory name
+Network="MapTR"
+WORLD_SIZE=8
+WORK_DIR=""
+LOAD_FROM=""
+
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+
+############### Set the directory the training script runs from ###############
+# cd to the directory that contains the test folder before running, for compatibility; test_path_dir points at the test folder itself
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ]; then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=$(pwd)
+else
+    test_path_dir=${cur_path}/test
+fi
+
+ASCEND_DEVICE_ID=0
+
+if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ]; then
+    rm -rf ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+else
+    mkdir -p ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+fi
+
+start_time=$(date +%s)
+# Source the environment variables unless running on the platform (etp_running_flag=true)
+check_etp_flag=$(env | grep etp_running_flag)
+etp_flag=$(echo ${check_etp_flag#*=})
+if [ x"${etp_flag}" != x"true" ]; then
+    source ${test_path_dir}/env_npu.sh
+fi
+
+bash ./tools/dist_train.sh ./projects/configs/maptr/maptr_tiny_r50_24e_bevformer.py 8 \
+    >$cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+wait
+
+
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Training case info; no need to modify
+BatchSize=8
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${WORLD_SIZE}'p'_'acc'
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Performance metric (average iteration time parsed from the log); review per model
+avg_time=`grep -a 'mmdet - INFO - Epoch ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "time: " '{print $2}' | awk -F ", " '{print $1}' | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'`
+Iteration_time=$avg_time
+# Print; no need to modify
+echo "Iteration time : $Iteration_time"
+
+# Training accuracy (mAP) from the last validation log line; review per model
+mAP=$(grep -a "mmdet - INFO - Epoch(val)" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |tail -1|awk -F "mAP: " '{print $2}' |awk -F ", " '{print $1}'| awk -F ", " '{print $1}')
+
+# Print; no need to modify
+echo "mAP : ${mAP}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Total training time
+TrainingTime=`grep -a 'Time' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "Time: " '{print $2}'|awk -F "," '{print $1}'| awk '{a+=$1} END {printf("%.3f",a)}'`
+
+# Write key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" >${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${WORLD_SIZE}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainmAP = ${mAP}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "Iteration time = ${Iteration_time}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "TrainingTime = ${TrainingTime}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
\ No newline at end of file
diff --git a/model_examples/MapTR/tools/analysis_tools/__init__.py b/model_examples/MapTR/tools/analysis_tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/model_examples/MapTR/tools/analysis_tools/analyze_logs.py b/model_examples/MapTR/tools/analysis_tools/analyze_logs.py
new file mode 100644
index 0000000000000000000000000000000000000000..806175f34c0ce6c535167cc7db8470c69a6e243d
--- /dev/null
+++ b/model_examples/MapTR/tools/analysis_tools/analyze_logs.py
@@ -0,0 +1,201 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+import numpy as np
+import seaborn as sns
+from collections import defaultdict
+from matplotlib import pyplot as plt
+
+
+def cal_train_time(log_dicts, args):
+    for i, log_dict in enumerate(log_dicts):
+        print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}')
+        all_times = []
+        for epoch in log_dict.keys():
+            if args.include_outliers:
+                all_times.append(log_dict[epoch]['time'])
+            else:
+                all_times.append(log_dict[epoch]['time'][1:])
+        all_times = np.array(all_times)
+        epoch_ave_time = all_times.mean(-1)
+        slowest_epoch = epoch_ave_time.argmax()
+        fastest_epoch = epoch_ave_time.argmin()
+        std_over_epoch = epoch_ave_time.std()
+        print(f'slowest epoch {slowest_epoch + 1}, '
+              f'average time is {epoch_ave_time[slowest_epoch]:.4f}')
+        print(f'fastest epoch {fastest_epoch + 1}, '
+              f'average time is {epoch_ave_time[fastest_epoch]:.4f}')
+        print(f'time std over epochs is {std_over_epoch:.4f}')
+        print(f'average iter time: {np.mean(all_times):.4f} s/iter')
+        print()
+
+
+def plot_curve(log_dicts, args):
+    if args.backend is not None:
+        plt.switch_backend(args.backend)
+    sns.set_style(args.style)
+    # if legend is None, use {filename}_{key} as legend
+    legend = args.legend
+    if legend is None:
+        legend = []
+        for json_log in args.json_logs:
+            for metric in args.keys:
+                legend.append(f'{json_log}_{metric}')
+    assert len(legend) == (len(args.json_logs) * len(args.keys))
+    metrics = args.keys
+
+    num_metrics = len(metrics)
+    for i, log_dict in enumerate(log_dicts):
+        epochs = list(log_dict.keys())
+        for j, metric in enumerate(metrics):
+            print(f'plot curve of {args.json_logs[i]}, metric is {metric}')
+            if metric not in log_dict[epochs[args.interval - 1]]:
+                raise KeyError(
+                    f'{args.json_logs[i]} does not contain metric {metric}')
+
+            if args.mode == 'eval':
+                if min(epochs) == args.interval:
+                    x0 = args.interval
+                else:
+                    # if current training is resumed from previous checkpoint
+                    # we lost information in early epochs
+                    # `xs` should start according to `min(epochs)`
+                    if min(epochs) % args.interval == 0:
+                        x0 = min(epochs)
+                    else:
+                        # find the first epoch that do eval
+                        x0 = min(epochs) + args.interval - \
+                            min(epochs) % args.interval
+                xs = np.arange(x0, max(epochs) + 1, args.interval)
+                ys = []
+                for epoch in epochs[args.interval - 1::args.interval]:
+                    ys += log_dict[epoch][metric]
+
+                # if training is aborted before eval of the last epoch
+                # `xs` and `ys` will have different length and cause an error
+                # check if `ys[-1]` is empty here
+                if not log_dict[epoch][metric]:
+                    xs = xs[:-1]
+
+                ax = plt.gca()
+                ax.set_xticks(xs)
+                plt.xlabel('epoch')
+                plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o')
+            else:
+                xs = []
+                ys = []
+                num_iters_per_epoch = \
+                    log_dict[epochs[args.interval-1]]['iter'][-1]
+                for epoch in epochs[args.interval - 1::args.interval]:
+                    iters = log_dict[epoch]['iter']
+                    if log_dict[epoch]['mode'][-1] == 'val':
+                        iters = iters[:-1]
+                    xs.append(
+                        np.array(iters) + (epoch - 1) * num_iters_per_epoch)
+                    ys.append(np.array(log_dict[epoch][metric][:len(iters)]))
+                xs = np.concatenate(xs)
+                ys = np.concatenate(ys)
+                plt.xlabel('iter')
+                plt.plot(
+                    xs, ys, label=legend[i * num_metrics + j], linewidth=0.5)
+            plt.legend()
+        if args.title is not None:
+            plt.title(args.title)
+    if args.out is None:
+        plt.show()
+    else:
+        print(f'save curve to: {args.out}')
+        plt.savefig(args.out)
+        plt.cla()
+
+
+def add_plot_parser(subparsers):
+    parser_plt = subparsers.add_parser(
+        'plot_curve', help='parser for plotting curves')
+    parser_plt.add_argument(
+        'json_logs',
+        type=str,
+        nargs='+',
+        help='path of train log in json format')
+    parser_plt.add_argument(
+        '--keys',
+        type=str,
+        nargs='+',
+        default=['mAP_0.25'],
+        help='the metric that you want to plot')
+    parser_plt.add_argument('--title', type=str, help='title of figure')
+    parser_plt.add_argument(
+        '--legend',
+        type=str,
+        nargs='+',
+        default=None,
+        help='legend of each plot')
+    parser_plt.add_argument(
+        '--backend', type=str, default=None, help='backend of plt')
+    parser_plt.add_argument(
+        '--style', type=str, default='dark', help='style of plt')
+    parser_plt.add_argument('--out', type=str, default=None)
+    parser_plt.add_argument('--mode', type=str, default='train')
+    parser_plt.add_argument('--interval', type=int, default=1)
+
+
+def add_time_parser(subparsers):
+    parser_time = subparsers.add_parser(
+        'cal_train_time',
+        help='parser for computing the average time per training iteration')
+    parser_time.add_argument(
+        'json_logs',
+        type=str,
+        nargs='+',
+        help='path of train log in json format')
+    parser_time.add_argument(
+        '--include-outliers',
+        action='store_true',
+        help='include the first value of every epoch when computing '
+        'the average time')
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Analyze Json Log')
+    # currently only support plot curve and calculate average train time
+    subparsers = parser.add_subparsers(dest='task', help='task parser')
+    add_plot_parser(subparsers)
+    add_time_parser(subparsers)
+    args = parser.parse_args()
+    return args
+
+
+def load_json_logs(json_logs):
+    # load and convert json_logs to log_dict, key is epoch, value is a sub dict
+    # keys of sub dict is different metrics, e.g. memory, bbox_mAP
+    # value of sub dict is a list of corresponding values of all iterations
+    log_dicts = [dict() for _ in json_logs]
+    for json_log, log_dict in zip(json_logs, log_dicts):
+        with open(json_log, 'r') as log_file:
+            for line in log_file:
+                log = json.loads(line.strip())
+                # skip lines without `epoch` field
+                if 'epoch' not in log:
+                    continue
+                epoch = log.pop('epoch')
+                if epoch not in log_dict:
+                    log_dict[epoch] = defaultdict(list)
+                for k, v in log.items():
+                    log_dict[epoch][k].append(v)
+    return log_dicts
+
+
+def main():
+    args = parse_args()
+    # every positional log path must point at a ``.json`` file
+    json_logs = args.json_logs
+    for json_log in json_logs:
+        assert json_log.endswith('.json')
+    # dispatch through an explicit table instead of eval()-ing CLI text
+    log_dicts = load_json_logs(json_logs)
+    tasks = {'cal_train_time': cal_train_time, 'plot_curve': plot_curve}
+    tasks[args.task](log_dicts, args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/analysis_tools/benchmark.py b/model_examples/MapTR/tools/analysis_tools/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..487a348935e3c949a8cde2c90a1747db769964c9
--- /dev/null
+++ b/model_examples/MapTR/tools/analysis_tools/benchmark.py
@@ -0,0 +1,98 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import time
+import torch
+from mmcv import Config
+from mmcv.parallel import MMDataParallel
+from mmcv.runner import load_checkpoint, wrap_fp16_model
+import sys
+sys.path.append('.')
+from projects.mmdet3d_plugin.datasets.builder import build_dataloader
+from projects.mmdet3d_plugin.datasets import custom_build_dataset
+# from mmdet3d.datasets import build_dataloader, build_dataset
+from mmdet3d.models import build_detector
+#from tools.misc.fuse_conv_bn import fuse_module
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='MMDet benchmark a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('--checkpoint', default=None, help='checkpoint file')
+    parser.add_argument(
+        '--samples', type=int, default=2000, help='samples to benchmark')
+    parser.add_argument(
+        '--log-interval', type=int, default=50, help='interval of logging')
+    parser.add_argument(
+        '--fuse-conv-bn',
+        action='store_true',
+        help='Whether to fuse conv and bn, this will slightly increase '
+        'the inference speed')
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+    cfg.model.pretrained = None
+    cfg.data.test.test_mode = True
+
+    # build the dataloader
+    # TODO: support multiple images per gpu (only minor changes are needed)
+    print(cfg.data.test)
+    dataset = custom_build_dataset(cfg.data.test)
+    data_loader = build_dataloader(
+        dataset,
+        samples_per_gpu=1,
+        workers_per_gpu=cfg.data.workers_per_gpu,
+        dist=False,
+        shuffle=False)
+
+    # build the model and load checkpoint
+    cfg.model.train_cfg = None
+    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is not None:
+        wrap_fp16_model(model)
+    if args.checkpoint is not None:
+        load_checkpoint(model, args.checkpoint, map_location='cpu')
+    #if args.fuse_conv_bn:
+    #    model = fuse_module(model)
+
+    model = MMDataParallel(model, device_ids=[0])
+
+    model.eval()
+
+    # the first several iterations may be very slow so skip them
+    num_warmup = 5
+    pure_inf_time = 0
+
+    # benchmark with several samples and take the average
+    for i, data in enumerate(data_loader):
+        torch.cuda.synchronize()
+        start_time = time.perf_counter()
+        with torch.no_grad():
+            model(return_loss=False, rescale=True, **data)
+
+        torch.cuda.synchronize()
+        elapsed = time.perf_counter() - start_time
+
+        if i >= num_warmup:
+            pure_inf_time += elapsed
+            if (i + 1) % args.log_interval == 0:
+                fps = (i + 1 - num_warmup) / pure_inf_time
+                print(f'Done image [{i + 1:<3}/ {args.samples}], '
+                      f'fps: {fps:.1f} img / s')
+
+        if (i + 1) == args.samples:
+            pure_inf_time += elapsed
+            fps = (i + 1 - num_warmup) / pure_inf_time
+            print(f'Overall fps: {fps:.1f} img / s')
+            break
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/analysis_tools/get_params.py b/model_examples/MapTR/tools/analysis_tools/get_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..2818900b260f31faf6249b6862b9bb8d80a4ad02
--- /dev/null
+++ b/model_examples/MapTR/tools/analysis_tools/get_params.py
@@ -0,0 +1,10 @@
+import torch
+file_path = './ckpts/bevformer_v4.pth'
+model = torch.load(file_path, map_location='cpu')
+num_params = 0  # running element count; avoids shadowing the builtin `all`
+for tensor in model['state_dict'].values():
+    num_params += tensor.nelement()
+print(num_params)
+
+# smaller 63374123
+# v4 69140395
diff --git a/model_examples/MapTR/tools/analysis_tools/visual.py b/model_examples/MapTR/tools/analysis_tools/visual.py
new file mode 100644
index 0000000000000000000000000000000000000000..648129897cadb7692cb7fccf18bf59b73d085e3f
--- /dev/null
+++ b/model_examples/MapTR/tools/analysis_tools/visual.py
@@ -0,0 +1,477 @@
+# Based on https://github.com/nutonomy/nuscenes-devkit
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+
+import mmcv
+from nuscenes.nuscenes import NuScenes
+from PIL import Image
+from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix
+from typing import Tuple, List, Iterable
+import matplotlib.pyplot as plt
+import numpy as np
+from PIL import Image
+from matplotlib import rcParams
+from matplotlib.axes import Axes
+from pyquaternion import Quaternion
+from PIL import Image
+from matplotlib import rcParams
+from matplotlib.axes import Axes
+from pyquaternion import Quaternion
+from tqdm import tqdm
+from nuscenes.utils.data_classes import LidarPointCloud, RadarPointCloud, Box
+from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix
+from nuscenes.eval.common.data_classes import EvalBoxes, EvalBox
+from nuscenes.eval.detection.data_classes import DetectionBox
+from nuscenes.eval.detection.utils import category_to_detection_name
+from nuscenes.eval.detection.render import visualize_sample
+
+
+
+
+cams = ['CAM_FRONT',
+        'CAM_FRONT_RIGHT',
+        'CAM_BACK_RIGHT',
+        'CAM_BACK',
+        'CAM_BACK_LEFT',
+        'CAM_FRONT_LEFT']
+
+import numpy as np
+import matplotlib.pyplot as plt
+from nuscenes.utils.data_classes import LidarPointCloud, RadarPointCloud, Box
+from PIL import Image
+from matplotlib import rcParams
+
+
+def render_annotation(
+        anntoken: str,
+        margin: float = 10,
+        view: np.ndarray = np.eye(4),
+        box_vis_level: BoxVisibility = BoxVisibility.ANY,
+        out_path: str = 'render.png',
+        extra_info: bool = False) -> None:
+    """
+    Render selected annotation.
+    :param anntoken: Sample_annotation token.
+    :param margin: How many meters in each direction to include in LIDAR view.
+    :param view: LIDAR view point.
+    :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+    :param out_path: Optional path to save the rendered figure to disk.
+    :param extra_info: Whether to render extra information below camera view.
+    """
+    ann_record = nusc.get('sample_annotation', anntoken)
+    sample_record = nusc.get('sample', ann_record['sample_token'])
+    assert 'LIDAR_TOP' in sample_record['data'].keys(), 'Error: No LIDAR_TOP in data, unable to render.'
+
+    # Figure out which camera the object is fully visible in (this may return nothing).
+    boxes, cam = [], []
+    cams = [key for key in sample_record['data'].keys() if 'CAM' in key]
+    all_bboxes = []
+    select_cams = []
+    for cam in cams:
+        _, boxes, _ = nusc.get_sample_data(sample_record['data'][cam], box_vis_level=box_vis_level,
+                                           selected_anntokens=[anntoken])
+        if len(boxes) > 0:
+            all_bboxes.append(boxes)
+            select_cams.append(cam)
+            # We found an image that matches. Let's abort.
+    # assert len(boxes) > 0, 'Error: Could not find image where annotation is visible. ' \
+    #                      'Try using e.g. BoxVisibility.ANY.'
+    # assert len(boxes) < 2, 'Error: Found multiple annotations. Something is wrong!'
+
+    num_cam = len(all_bboxes)
+
+    fig, axes = plt.subplots(1, num_cam + 1, figsize=(18, 9))
+    select_cams = [sample_record['data'][cam] for cam in select_cams]
+    print('bbox in cams:', select_cams)
+    # Plot LIDAR view.
+    lidar = sample_record['data']['LIDAR_TOP']
+    data_path, boxes, camera_intrinsic = nusc.get_sample_data(lidar, selected_anntokens=[anntoken])
+    LidarPointCloud.from_file(data_path).render_height(axes[0], view=view)
+    for box in boxes:
+        c = np.array(get_color(box.name)) / 255.0
+        box.render(axes[0], view=view, colors=(c, c, c))
+        corners = view_points(boxes[0].corners(), view, False)[:2, :]
+        axes[0].set_xlim([np.min(corners[0, :]) - margin, np.max(corners[0, :]) + margin])
+        axes[0].set_ylim([np.min(corners[1, :]) - margin, np.max(corners[1, :]) + margin])
+        axes[0].axis('off')
+        axes[0].set_aspect('equal')
+
+    # Plot CAMERA view.
+    for i in range(1, num_cam + 1):
+        cam = select_cams[i - 1]
+        data_path, boxes, camera_intrinsic = nusc.get_sample_data(cam, selected_anntokens=[anntoken])
+        im = Image.open(data_path)
+        axes[i].imshow(im)
+        axes[i].set_title(nusc.get('sample_data', cam)['channel'])
+        axes[i].axis('off')
+        axes[i].set_aspect('equal')
+        for box in boxes:
+            c = np.array(get_color(box.name)) / 255.0
+            box.render(axes[i], view=camera_intrinsic, normalize=True, colors=(c, c, c))
+
+        # Print extra information about the annotation below the camera view.
+        axes[i].set_xlim(0, im.size[0])
+        axes[i].set_ylim(im.size[1], 0)
+
+    if extra_info:
+        rcParams['font.family'] = 'monospace'
+
+        w, l, h = ann_record['size']
+        category = ann_record['category_name']
+        lidar_points = ann_record['num_lidar_pts']
+        radar_points = ann_record['num_radar_pts']
+
+        sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP'])
+        pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token'])
+        dist = np.linalg.norm(np.array(pose_record['translation']) - np.array(ann_record['translation']))
+
+        information = ' \n'.join(['category: {}'.format(category),
+                                  '',
+                                  '# lidar points: {0:>4}'.format(lidar_points),
+                                  '# radar points: {0:>4}'.format(radar_points),
+                                  '',
+                                  'distance: {:>7.3f}m'.format(dist),
+                                  '',
+                                  'width: {:>7.3f}m'.format(w),
+                                  'length: {:>7.3f}m'.format(l),
+                                  'height: {:>7.3f}m'.format(h)])
+
+        plt.annotate(information, (0, 0), (0, -20), xycoords='axes fraction', textcoords='offset points', va='top')
+
+    if out_path is not None:
+        plt.savefig(out_path)
+
+
+
+def get_sample_data(sample_data_token: str,
+                    box_vis_level: BoxVisibility = BoxVisibility.ANY,
+                    selected_anntokens=None,
+                    use_flat_vehicle_coordinates: bool = False):
+    """
+    Returns the data path as well as all annotations related to that sample_data.
+    Note that the boxes are transformed into the current sensor's coordinate frame.
+    :param sample_data_token: Sample_data token.
+    :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+    :param selected_anntokens: If provided only return the selected annotation.
+    :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is
+                                         aligned to z-plane in the world.
+    :return: (data_path, boxes, camera_intrinsic )
+    """
+
+    # Retrieve sensor & pose records
+    sd_record = nusc.get('sample_data', sample_data_token)
+    cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+    sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+    pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+
+    data_path = nusc.get_sample_data_path(sample_data_token)
+
+    if sensor_record['modality'] == 'camera':
+        cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+        imsize = (sd_record['width'], sd_record['height'])
+    else:
+        cam_intrinsic = None
+        imsize = None
+
+    # Retrieve all sample annotations and map to sensor coordinate system.
+    if selected_anntokens is not None:
+        boxes = list(map(nusc.get_box, selected_anntokens))
+    else:
+        boxes = nusc.get_boxes(sample_data_token)
+
+    # Make list of Box objects including coord system transforms.
+    box_list = []
+    for box in boxes:
+        if use_flat_vehicle_coordinates:
+            # Move box to ego vehicle coord system parallel to world z plane.
+            yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0]
+            box.translate(-np.array(pose_record['translation']))
+            box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse)
+        else:
+            # Move box to ego vehicle coord system.
+            box.translate(-np.array(pose_record['translation']))
+            box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+            # Move box to sensor coord system.
+            box.translate(-np.array(cs_record['translation']))
+            box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+        if sensor_record['modality'] == 'camera' and not \
+                box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level):
+            continue
+
+        box_list.append(box)
+
+    return data_path, box_list, cam_intrinsic
+
+
+
+def get_predicted_data(sample_data_token: str,
+                       box_vis_level: BoxVisibility = BoxVisibility.ANY,
+                       selected_anntokens=None,
+                       use_flat_vehicle_coordinates: bool = False,
+                       pred_anns=None
+                       ):
+    """
+    Returns the data path as well as all annotations related to that sample_data.
+    Note that the boxes are transformed into the current sensor's coordinate frame.
+    :param sample_data_token: Sample_data token.
+    :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+    :param selected_anntokens: If provided only return the selected annotation.
+    :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is
+                                         aligned to z-plane in the world.
+    :return: (data_path, boxes, camera_intrinsic )
+    """
+
+    # Retrieve sensor & pose records
+    sd_record = nusc.get('sample_data', sample_data_token)
+    cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+    sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+    pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+
+    data_path = nusc.get_sample_data_path(sample_data_token)
+
+    if sensor_record['modality'] == 'camera':
+        cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+        imsize = (sd_record['width'], sd_record['height'])
+    else:
+        cam_intrinsic = None
+        imsize = None
+
+    # Retrieve all sample annotations and map to sensor coordinate system.
+    # if selected_anntokens is not None:
+    #    boxes = list(map(nusc.get_box, selected_anntokens))
+    # else:
+    #    boxes = nusc.get_boxes(sample_data_token)
+    boxes = pred_anns if pred_anns is not None else []  # default None means "no predictions"
+    # Make list of Box objects including coord system transforms.
+    box_list = []
+    for box in boxes:
+        if use_flat_vehicle_coordinates:
+            # Move box to ego vehicle coord system parallel to world z plane.
+            yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0]
+            box.translate(-np.array(pose_record['translation']))
+            box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse)
+        else:
+            # Move box to ego vehicle coord system.
+            box.translate(-np.array(pose_record['translation']))
+            box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+            # Move box to sensor coord system.
+            box.translate(-np.array(cs_record['translation']))
+            box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+        if sensor_record['modality'] == 'camera' and not \
+                box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level):
+            continue
+        box_list.append(box)
+
+    return data_path, box_list, cam_intrinsic
+
+
+
+
+def lidiar_render(sample_token, data,out_path=None):
+    bbox_gt_list = []
+    bbox_pred_list = []
+    anns = nusc.get('sample', sample_token)['anns']
+    for ann in anns:
+        content = nusc.get('sample_annotation', ann)
+        try:
+            bbox_gt_list.append(DetectionBox(
+                sample_token=content['sample_token'],
+                translation=tuple(content['translation']),
+                size=tuple(content['size']),
+                rotation=tuple(content['rotation']),
+                velocity=nusc.box_velocity(content['token'])[:2],
+                ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+                else tuple(content['ego_translation']),
+                num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+                detection_name=category_to_detection_name(content['category_name']),
+                detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+                attribute_name=''))
+        except Exception:
+            pass  # best effort: skip annotations that cannot become a DetectionBox
+
+    bbox_anns = data['results'][sample_token]
+    for content in bbox_anns:
+        bbox_pred_list.append(DetectionBox(
+            sample_token=content['sample_token'],
+            translation=tuple(content['translation']),
+            size=tuple(content['size']),
+            rotation=tuple(content['rotation']),
+            velocity=tuple(content['velocity']),
+            ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+            else tuple(content['ego_translation']),
+            num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+            detection_name=content['detection_name'],
+            detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+            attribute_name=content['attribute_name']))
+    gt_annotations = EvalBoxes()
+    pred_annotations = EvalBoxes()
+    gt_annotations.add_boxes(sample_token, bbox_gt_list)
+    pred_annotations.add_boxes(sample_token, bbox_pred_list)
+    print('green is ground truth')
+    print('blue is the predited result')
+    visualize_sample(nusc, sample_token, gt_annotations, pred_annotations, savepath=None if out_path is None else out_path + '_bev')
+
+
+def get_color(category_name: str):
+    """
+    Provides the default colors based on the category names.
+    This method works for the general nuScenes categories, as well as the nuScenes detection categories.
+    """
+    a = ['noise', 'animal', 'human.pedestrian.adult', 'human.pedestrian.child', 'human.pedestrian.construction_worker',
+         'human.pedestrian.personal_mobility', 'human.pedestrian.police_officer', 'human.pedestrian.stroller',
+         'human.pedestrian.wheelchair', 'movable_object.barrier', 'movable_object.debris',
+         'movable_object.pushable_pullable', 'movable_object.trafficcone', 'static_object.bicycle_rack', 'vehicle.bicycle',
+         'vehicle.bus.bendy', 'vehicle.bus.rigid', 'vehicle.car', 'vehicle.construction', 'vehicle.emergency.ambulance',
+         'vehicle.emergency.police', 'vehicle.motorcycle', 'vehicle.trailer', 'vehicle.truck', 'flat.driveable_surface',
+         'flat.other', 'flat.sidewalk', 'flat.terrain', 'static.manmade', 'static.other', 'static.vegetation',
+         'vehicle.ego']
+    class_names = [
+        'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+        'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+    ]
+    #print(category_name)
+    if category_name == 'bicycle':
+        return nusc.colormap['vehicle.bicycle']
+    elif category_name == 'construction_vehicle':
+        return nusc.colormap['vehicle.construction']
+    elif category_name == 'traffic_cone':
+        return nusc.colormap['movable_object.trafficcone']
+
+    for key in nusc.colormap.keys():
+        if category_name in key:
+            return nusc.colormap[key]
+    return [0, 0, 0]
+
+
+def render_sample_data(
+        sample_toekn: str,
+        with_anns: bool = True,
+        box_vis_level: BoxVisibility = BoxVisibility.ANY,
+        axes_limit: float = 40,
+        ax=None,
+        nsweeps: int = 1,
+        out_path: str = None,
+        underlay_map: bool = True,
+        use_flat_vehicle_coordinates: bool = True,
+        show_lidarseg: bool = False,
+        show_lidarseg_legend: bool = False,
+        filter_lidarseg_labels=None,
+        lidarseg_preds_bin_path: str = None,
+        verbose: bool = True,
+        show_panoptic: bool = False,
+        pred_data=None,
+        ) -> None:
+    """
+    Render sample data onto axis.
+    :param sample_toekn: Sample token (looked up with nusc.get('sample', ...)).
+    :param with_anns: Whether to draw box annotations.
+    :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+    :param axes_limit: Axes limit for lidar and radar (measured in meters).
+    :param ax: Axes onto which to render.
+    :param nsweeps: Number of sweeps for lidar and radar.
+    :param out_path: Optional path to save the rendered figure to disk.
+    :param underlay_map: When set to true, lidar data is plotted onto the map. This can be slow.
+    :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is
+        aligned to z-plane in the world. Note: Previously this method did not use flat vehicle coordinates, which
+        can lead to small errors when the vertical axis of the global frame and lidar are not aligned. The new
+        setting is more correct and rotates the plot by ~90 degrees.
+    :param show_lidarseg: When set to True, the lidar data is colored with the segmentation labels. When set
+        to False, the colors of the lidar data represent the distance from the center of the ego vehicle.
+    :param show_lidarseg_legend: Whether to display the legend for the lidarseg labels in the frame.
+    :param filter_lidarseg_labels: Only show lidar points which belong to the given list of classes. If None
+        or the list is empty, all classes will be displayed.
+    :param lidarseg_preds_bin_path: A path to the .bin file which contains the user's lidar segmentation
+        predictions for the sample.
+    :param verbose: Whether to display the image after it is rendered.
+    :param show_panoptic: When set to True, the lidar data is colored with the panoptic labels. When set
+        to False, the colors of the lidar data represent the distance from the center of the ego vehicle.
+        If show_lidarseg is True, show_panoptic will be set to False.
+    """
+    lidiar_render(sample_toekn, pred_data, out_path=out_path)
+    sample = nusc.get('sample', sample_toekn)
+    # sample = data['results'][sample_token_list[0]][0]
+    cams = [
+        'CAM_FRONT_LEFT',
+        'CAM_FRONT',
+        'CAM_FRONT_RIGHT',
+        'CAM_BACK_LEFT',
+        'CAM_BACK',
+        'CAM_BACK_RIGHT',
+    ]
+    if ax is None:
+        _, ax = plt.subplots(4, 3, figsize=(24, 18))
+    j = 0
+    for ind, cam in enumerate(cams):
+        sample_data_token = sample['data'][cam]
+
+        sd_record = nusc.get('sample_data', sample_data_token)
+        sensor_modality = sd_record['sensor_modality']
+
+        if sensor_modality in ['lidar', 'radar']:
+            assert False
+        elif sensor_modality == 'camera':
+            # Load boxes and image.
+            boxes = [Box(record['translation'], record['size'], Quaternion(record['rotation']),
+                         name=record['detection_name'], token='predicted') for record in
+                     pred_data['results'][sample_toekn] if record['detection_score'] > 0.2]
+
+            data_path, boxes_pred, camera_intrinsic = get_predicted_data(sample_data_token,
+                                                                         box_vis_level=box_vis_level, pred_anns=boxes)
+            _, boxes_gt, _ = nusc.get_sample_data(sample_data_token, box_vis_level=box_vis_level)
+            if ind == 3:
+                j += 1
+            ind = ind % 3
+            data = Image.open(data_path)
+            # mmcv.imwrite(np.array(data)[:,:,::-1], f'{cam}.png')
+            # Init axes.
+
+            # Show image.
+            ax[j, ind].imshow(data)
+            ax[j + 2, ind].imshow(data)
+
+            # Show boxes.
+            if with_anns:
+                for box in boxes_pred:
+                    c = np.array(get_color(box.name)) / 255.0
+                    box.render(ax[j, ind], view=camera_intrinsic, normalize=True, colors=(c, c, c))
+                for box in boxes_gt:
+                    c = np.array(get_color(box.name)) / 255.0
+                    box.render(ax[j + 2, ind], view=camera_intrinsic, normalize=True, colors=(c, c, c))
+
+            # Limit visible range.
+            ax[j, ind].set_xlim(0, data.size[0])
+            ax[j, ind].set_ylim(data.size[1], 0)
+            ax[j + 2, ind].set_xlim(0, data.size[0])
+            ax[j + 2, ind].set_ylim(data.size[1], 0)
+
+        else:
+            raise ValueError("Error: Unknown sensor modality!")
+
+        ax[j, ind].axis('off')
+        ax[j, ind].set_title('PRED: {} {labels_type}'.format(
+            sd_record['channel'], labels_type='(predictions)' if lidarseg_preds_bin_path else ''))
+        ax[j, ind].set_aspect('equal')
+
+        ax[j + 2, ind].axis('off')
+        ax[j + 2, ind].set_title('GT:{} {labels_type}'.format(
+            sd_record['channel'], labels_type='(predictions)' if lidarseg_preds_bin_path else ''))
+        ax[j + 2, ind].set_aspect('equal')
+
+    if out_path is not None:
+        plt.savefig(out_path+'_camera', bbox_inches='tight', pad_inches=0, dpi=200)
+    if verbose:
+        plt.show()
+    plt.close()
+
+if __name__ == '__main__':
+    nusc = NuScenes(version='v1.0-trainval', dataroot='./data/nuscenes', verbose=True)
+    # render_annotation('7603b030b42a4b1caa8c443ccc1a7d52')
+    bevformer_results = mmcv.load('test/bevformer_base/Thu_Jun__9_16_22_37_2022/pts_bbox/results_nusc.json')
+    sample_token_list = list(bevformer_results['results'].keys())
+    for sample_token in sample_token_list[:10]:
+        render_sample_data(sample_token, pred_data=bevformer_results, out_path=sample_token)
diff --git a/model_examples/MapTR/tools/create_data.py b/model_examples/MapTR/tools/create_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2b0cc10f1fafa77a39cd8fbd9c1ac9386d2af72
--- /dev/null
+++ b/model_examples/MapTR/tools/create_data.py
@@ -0,0 +1,305 @@
+# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +from data_converter.create_gt_database import create_groundtruth_database +from data_converter import nuscenes_converter as nuscenes_converter +from data_converter import lyft_converter as lyft_converter +from data_converter import kitti_converter as kitti +from data_converter import indoor_converter as indoor +import argparse +from os import path as osp +import sys +sys.path.append('.') + + +def kitti_data_prep(root_path, info_prefix, version, out_dir): + """Prepare data related to Kitti dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + out_dir (str): Output directory of the groundtruth database info. 
+ """ + kitti.create_kitti_info_file(root_path, info_prefix) + kitti.create_reduced_point_cloud(root_path, info_prefix) + + info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl') + info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl') + info_trainval_path = osp.join(root_path, + f'{info_prefix}_infos_trainval.pkl') + info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl') + kitti.export_2d_annotation(root_path, info_train_path) + kitti.export_2d_annotation(root_path, info_val_path) + kitti.export_2d_annotation(root_path, info_trainval_path) + kitti.export_2d_annotation(root_path, info_test_path) + + create_groundtruth_database( + 'KittiDataset', + root_path, + info_prefix, + f'{out_dir}/{info_prefix}_infos_train.pkl', + relative_path=False, + mask_anno_path='instances_train.json', + with_mask=(version == 'mask')) + + +def nuscenes_data_prep(root_path, + can_bus_root_path, + info_prefix, + version, + dataset_name, + out_dir, + max_sweeps=10): + """Prepare data related to nuScenes dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + dataset_name (str): The dataset class name. + out_dir (str): Output directory of the groundtruth database info. + max_sweeps (int): Number of input consecutive frames. 
Default: 10 + """ + nuscenes_converter.create_nuscenes_infos( + root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps) + + if version == 'v1.0-test': + info_test_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_test.pkl') + nuscenes_converter.export_2d_annotation( + root_path, info_test_path, version=version) + else: + info_train_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_train.pkl') + info_val_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_val.pkl') + nuscenes_converter.export_2d_annotation( + root_path, info_train_path, version=version) + nuscenes_converter.export_2d_annotation( + root_path, info_val_path, version=version) + # create_groundtruth_database(dataset_name, root_path, info_prefix, + # f'{out_dir}/{info_prefix}_infos_train.pkl') + + +def lyft_data_prep(root_path, info_prefix, version, max_sweeps=10): + """Prepare data related to Lyft dataset. + + Related data consists of '.pkl' files recording basic infos. + Although the ground truth database and 2D annotations are not used in + Lyft, it can also be generated like nuScenes. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + max_sweeps (int, optional): Number of input consecutive frames. + Defaults to 10. + """ + lyft_converter.create_lyft_infos( + root_path, info_prefix, version=version, max_sweeps=max_sweeps) + + +def scannet_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for scannet dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def s3dis_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for s3dis dataset. 
+ + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def sunrgbd_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for sunrgbd dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def waymo_data_prep(root_path, + info_prefix, + version, + out_dir, + workers, + max_sweeps=5): + """Prepare the info file for waymo dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + max_sweeps (int): Number of input consecutive frames. Default: 5 \ + Here we store pose information of these frames for later use. 
+ """ + from tools.data_converter import waymo_converter as waymo + + splits = ['training', 'validation', 'testing'] + + for i, split in enumerate(splits): + load_dir = osp.join(root_path, 'waymo_format', split) + if split == 'validation': + save_dir = osp.join(out_dir, 'kitti_format', 'training') + else: + save_dir = osp.join(out_dir, 'kitti_format', split) + converter = waymo.Waymo2KITTI( + load_dir, + save_dir, + prefix=str(i), + workers=workers, + test_mode=(split == 'test')) + converter.convert() + # Generate waymo infos + out_dir = osp.join(out_dir, 'kitti_format') + kitti.create_waymo_info_file(out_dir, info_prefix, max_sweeps=max_sweeps) + + create_groundtruth_database( + 'WaymoDataset', + out_dir, + info_prefix, + f'{out_dir}/{info_prefix}_infos_train.pkl', + relative_path=False, + with_mask=False) + + +parser = argparse.ArgumentParser(description='Data converter arg parser') +parser.add_argument('dataset', metavar='kitti', help='name of the dataset') +parser.add_argument( + '--root-path', + type=str, + default='./data/kitti', + help='specify the root path of dataset') +parser.add_argument( + '--canbus', + type=str, + default='./data', + help='specify the root path of nuScenes canbus') +parser.add_argument( + '--version', + type=str, + default='v1.0', + required=False, + help='specify the dataset version, no need for kitti') +parser.add_argument( + '--max-sweeps', + type=int, + default=10, + required=False, + help='specify sweeps of lidar per example') +parser.add_argument( + '--out-dir', + type=str, + default='./data/kitti', + required='False', + help='name of info pkl') +parser.add_argument('--extra-tag', type=str, default='kitti') +parser.add_argument( + '--workers', type=int, default=4, help='number of threads to be used') +args = parser.parse_args() + +if __name__ == '__main__': + if args.dataset == 'kitti': + kitti_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=args.version, + out_dir=args.out_dir) + elif args.dataset 
== 'nuscenes' and args.version != 'v1.0-mini': + train_version = f'{args.version}-trainval' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + test_version = f'{args.version}-test' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=test_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini': + train_version = f'{args.version}' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + elif args.dataset == 'lyft': + train_version = f'{args.version}-train' + lyft_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=train_version, + max_sweeps=args.max_sweeps) + test_version = f'{args.version}-test' + lyft_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=test_version, + max_sweeps=args.max_sweeps) + elif args.dataset == 'waymo': + waymo_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=args.version, + out_dir=args.out_dir, + workers=args.workers, + max_sweeps=args.max_sweeps) + elif args.dataset == 'scannet': + scannet_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) + elif args.dataset == 's3dis': + s3dis_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) + elif args.dataset == 'sunrgbd': + sunrgbd_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) diff --git 
a/model_examples/MapTR/tools/data_converter/__init__.py b/model_examples/MapTR/tools/data_converter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/model_examples/MapTR/tools/data_converter/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/model_examples/MapTR/tools/data_converter/av2_converter.py b/model_examples/MapTR/tools/data_converter/av2_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..024e33399bf460afeceeb52aa857de975505afc5 --- /dev/null +++ b/model_examples/MapTR/tools/data_converter/av2_converter.py @@ -0,0 +1,204 @@ +from functools import partial +from multiprocessing import Pool +import multiprocessing +from random import sample +import time +import mmcv +import logging +from pathlib import Path +from os import path as osp +import os +from av2.datasets.sensor.av2_sensor_dataloader import AV2SensorDataLoader +from av2.map.lane_segment import LaneMarkType, LaneSegment +from av2.map.map_api import ArgoverseStaticMap +from tqdm import tqdm +import argparse + +CAM_NAMES = ['ring_front_center', 'ring_front_right', 'ring_front_left', + 'ring_rear_right','ring_rear_left', 'ring_side_right', 'ring_side_left', + # 'stereo_front_left', 'stereo_front_right', + ] +# some fail logs as stated in av2 +# https://github.com/argoverse/av2-api/blob/05b7b661b7373adb5115cf13378d344d2ee43906/src/av2/map/README.md#training-online-map-inference-models +FAIL_LOGS = [ + '75e8adad-50a6-3245-8726-5e612db3d165', + '54bc6dbc-ebfb-3fba-b5b3-57f88b4b79ca', + 'af170aac-8465-3d7b-82c5-64147e94af7d', + '6e106cf8-f6dd-38f6-89c8-9be7a71e7275', + '01bb304d-7bd8-35f8-bbef-7086b688e35e', + '453e5558-6363-38e3-bf9b-42b5ba0a6f1d' +] + +def parse_args(): + parser = argparse.ArgumentParser(description='Data converter arg parser') + parser.add_argument( + '--data-root', + type=str, + help='specify the root path of dataset') + 
parser.add_argument( + '--nproc', + type=int, + default=64, + required=False, + help='workers to process data') + args_p = parser.parse_args() + return args_p + +def create_av2_infos_mp(root_path, + info_prefix, + dest_path=None, + split='train', + num_multithread=64): + """Create info file of av2 dataset. + + Given the raw data, generate its related info file in pkl format. + + Args: + root_path (str): Path of the data root. + info_prefix (str): Prefix of the info file to be generated. + dest_path (str): Path to store generated file, default to root_path + split (str): Split of the data. + Default: 'train' + """ + root_path = osp.join(root_path, split) + if dest_path is None: + dest_path = root_path + + loader = AV2SensorDataLoader(Path(root_path), Path(root_path)) + log_ids = list(loader.get_log_ids()) + # import pdb;pdb.set_trace() + for l in FAIL_LOGS: + if l in log_ids: + log_ids.remove(l) + + print('collecting samples...') + start_time = time.time() + print('num cpu:', multiprocessing.cpu_count()) + print(f'using {num_multithread} threads') + + # to supress logging from av2.utils.synchronization_database + sdb_logger = logging.getLogger('av2.utils.synchronization_database') + prev_level = sdb_logger.level + sdb_logger.setLevel(logging.CRITICAL) + + # FIXME: need to check the order + pool = Pool(num_multithread) + fn = partial(get_data_from_logid, loader=loader, data_root=root_path) + rt = pool.map_async(fn, log_ids) + pool.close() + pool.join() + results = rt.get() + + samples = [] + discarded = 0 + sample_idx = 0 + for _samples, _discarded in results: + for i in range(len(_samples)): + _samples[i]['sample_idx'] = sample_idx + sample_idx += 1 + samples += _samples + discarded += _discarded + + sdb_logger.setLevel(prev_level) + print(f'{len(samples)} available samples, {discarded} samples discarded') + + id2map = {} + for log_id in log_ids: + log_map_dirpath = Path(osp.join(root_path, log_id, "map")) + vector_data_fnames = 
sorted(log_map_dirpath.glob("log_map_archive_*.json")) + # vector_data_fnames = sorted(log_map_dirpath.glob("log_map_archive_*.json")) + if not len(vector_data_fnames) == 1: + raise RuntimeError(f"JSON file containing vector map data is missing (searched in {log_map_dirpath})") + vector_data_fname = vector_data_fnames[0] + vector_data_json_path = vector_data_fname + avm = ArgoverseStaticMap.from_json(vector_data_json_path) + # import pdb;pdb.set_trace() + map_elements = {} + map_elements['divider'] = get_divider(avm) + map_elements['ped_crossing'] = get_ped(avm) + map_elements['boundary'] = get_boundary(avm) + + # map_fname = osp.join(map_path_dir, map_fname) + id2map[log_id] = map_elements + + print('collected in {}s'.format(time.time()-start_time)) + infos = dict(samples=samples, id2map=id2map) + + info_path = osp.join(dest_path, + '{}_map_infos_{}.pkl'.format(info_prefix, split)) + print(f'saving results to {info_path}') + mmcv.dump(infos, info_path) + # mmcv.dump(samples, info_path) + +def get_divider(avm): + divider_list = [] + for ls in avm.get_scenario_lane_segments(): + for bound_type, bound_city in zip([ls.left_mark_type, ls.right_mark_type], [ls.left_lane_boundary, ls.right_lane_boundary]): + if bound_type not in [LaneMarkType.NONE,]: + divider_list.append(bound_city.xyz) + return divider_list + +def get_boundary(avm): + boundary_list = [] + for da in avm.get_scenario_vector_drivable_areas(): + boundary_list.append(da.xyz) + return boundary_list + +def get_ped(avm): + ped_list = [] + for pc in avm.get_scenario_ped_crossings(): + ped_list.append(pc.polygon) + return ped_list + +def get_data_from_logid(log_id, loader: AV2SensorDataLoader, data_root): + samples = [] + discarded = 0 + + # We use lidar timestamps to query all sensors. 
+ # The frequency is 10Hz + cam_timestamps = loader._sdb.per_log_lidar_timestamps_index[log_id] + for ts in cam_timestamps: + cam_ring_fpath = [loader.get_closest_img_fpath( + log_id, cam_name, ts + ) for cam_name in CAM_NAMES] + lidar_fpath = loader.get_closest_lidar_fpath(log_id, ts) + + # If bad sensor synchronization, discard the sample + if None in cam_ring_fpath or lidar_fpath is None: + discarded += 1 + continue + + cams = {} + for i, cam_name in enumerate(CAM_NAMES): + pinhole_cam = loader.get_log_pinhole_camera(log_id, cam_name) + cams[cam_name] = dict( + img_fpath=str(cam_ring_fpath[i]), + intrinsics=pinhole_cam.intrinsics.K, + extrinsics=pinhole_cam.extrinsics, + ) + + city_SE3_ego = loader.get_city_SE3_ego(log_id, int(ts)) + e2g_translation = city_SE3_ego.translation + e2g_rotation = city_SE3_ego.rotation + + samples.append(dict( + e2g_translation=e2g_translation, + e2g_rotation=e2g_rotation, + cams=cams, + lidar_fpath=str(lidar_fpath), + # map_fpath=map_fname, + timestamp=str(ts), + log_id=log_id, + token=str(log_id+'_'+str(ts)))) + + return samples, discarded + + +if __name__ == '__main__': + args = parse_args() + for name in ['train', 'val', 'test']: + create_av2_infos_mp( + root_path=args.data_root, + split=name, + info_prefix='av2', + dest_path=args.data_root,) \ No newline at end of file diff --git a/model_examples/MapTR/tools/data_converter/create_gt_database.py b/model_examples/MapTR/tools/data_converter/create_gt_database.py new file mode 100644 index 0000000000000000000000000000000000000000..7317cedd08377643018b7d4a72f7b5c96397b59c --- /dev/null +++ b/model_examples/MapTR/tools/data_converter/create_gt_database.py @@ -0,0 +1,338 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import mmcv +import numpy as np +import pickle +from mmcv import track_iter_progress +from mmcv.ops import roi_align +from os import path as osp +from pycocotools import mask as maskUtils +from pycocotools.coco import COCO + +from mmdet3d.core.bbox import box_np_ops as box_np_ops +from mmdet3d.datasets import build_dataset +from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps + + +def _poly2mask(mask_ann, img_h, img_w): + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + +def _parse_coco_ann_info(ann_info): + gt_bboxes = [] + gt_labels = [] + gt_bboxes_ignore = [] + gt_masks_ann = [] + + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + if ann['area'] <= 0: + continue + bbox = [x1, y1, x1 + w, y1 + h] + if ann.get('iscrowd', False): + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_masks_ann.append(ann['segmentation']) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + ann = dict( + bboxes=gt_bboxes, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann) + + return ann + + +def crop_image_patch_v2(pos_proposals, pos_assigned_gt_inds, gt_masks): + import torch + from torch.nn.modules.utils import _pair + device = pos_proposals.device + num_pos = pos_proposals.size(0) + fake_inds = ( + 
torch.arange(num_pos, + device=device).to(dtype=pos_proposals.dtype)[:, None]) + rois = torch.cat([fake_inds, pos_proposals], dim=1) # Nx5 + mask_size = _pair(28) + rois = rois.to(device=device) + gt_masks_th = ( + torch.from_numpy(gt_masks).to(device).index_select( + 0, pos_assigned_gt_inds).to(dtype=rois.dtype)) + # Use RoIAlign could apparently accelerate the training (~0.1s/iter) + targets = ( + roi_align(gt_masks_th, rois, mask_size[::-1], 1.0, 0, True).squeeze(1)) + return targets + + +def crop_image_patch(pos_proposals, gt_masks, pos_assigned_gt_inds, org_img): + num_pos = pos_proposals.shape[0] + masks = [] + img_patches = [] + for i in range(num_pos): + gt_mask = gt_masks[pos_assigned_gt_inds[i]] + bbox = pos_proposals[i, :].astype(np.int32) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1 + 1, 1) + h = np.maximum(y2 - y1 + 1, 1) + + mask_patch = gt_mask[y1:y1 + h, x1:x1 + w] + masked_img = gt_mask[..., None] * org_img + img_patch = masked_img[y1:y1 + h, x1:x1 + w] + + img_patches.append(img_patch) + masks.append(mask_patch) + return img_patches, masks + + +def create_groundtruth_database(dataset_class_name, + data_path, + info_prefix, + info_path=None, + mask_anno_path=None, + used_classes=None, + database_save_path=None, + db_info_save_path=None, + relative_path=True, + add_rgb=False, + lidar_only=False, + bev_only=False, + coors_range=None, + with_mask=False): + """Given the raw data, generate the ground truth database. + + Args: + dataset_class_name (str): Name of the input dataset. + data_path (str): Path of the data. + info_prefix (str): Prefix of the info file. + info_path (str): Path of the info file. + Default: None. + mask_anno_path (str): Path of the mask_anno. + Default: None. + used_classes (list[str]): Classes have been used. + Default: None. + database_save_path (str): Path to save database. + Default: None. + db_info_save_path (str): Path to save db_info. + Default: None. + relative_path (bool): Whether to use relative path. 
+ Default: True. + with_mask (bool): Whether to use mask. + Default: False. + """ + print(f'Create GT Database of {dataset_class_name}') + dataset_cfg = dict( + type=dataset_class_name, data_root=data_path, ann_file=info_path) + if dataset_class_name == 'KittiDataset': + file_client_args = dict(backend='disk') + dataset_cfg.update( + test_mode=False, + split='training', + modality=dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=with_mask, + ), + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args) + ]) + + elif dataset_class_name == 'NuScenesDataset': + dataset_cfg.update( + use_valid_flag=True, + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + use_dim=[0, 1, 2, 3, 4], + pad_empty_sweeps=True, + remove_close=True), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True) + ]) + + elif dataset_class_name == 'WaymoDataset': + file_client_args = dict(backend='disk') + dataset_cfg.update( + test_mode=False, + split='training', + modality=dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, + ), + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args) + ]) + + dataset = build_dataset(dataset_cfg) + + if database_save_path is None: + database_save_path = osp.join(data_path, f'{info_prefix}_gt_database') + if db_info_save_path is None: + db_info_save_path = osp.join(data_path, + f'{info_prefix}_dbinfos_train.pkl') + mmcv.mkdir_or_exist(database_save_path) + all_db_infos = dict() + 
if with_mask: + coco = COCO(osp.join(data_path, mask_anno_path)) + imgIds = coco.getImgIds() + file2id = dict() + for i in imgIds: + info = coco.loadImgs([i])[0] + file2id.update({info['file_name']: i}) + + group_counter = 0 + for j in track_iter_progress(list(range(len(dataset)))): + input_dict = dataset.get_data_info(j) + dataset.pre_pipeline(input_dict) + example = dataset.pipeline(input_dict) + annos = example['ann_info'] + image_idx = example['sample_idx'] + points = example['points'].tensor.numpy() + gt_boxes_3d = annos['gt_bboxes_3d'].tensor.numpy() + names = annos['gt_names'] + group_dict = dict() + if 'group_ids' in annos: + group_ids = annos['group_ids'] + else: + group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64) + difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32) + if 'difficulty' in annos: + difficulty = annos['difficulty'] + + num_obj = gt_boxes_3d.shape[0] + point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d) + + if with_mask: + # prepare masks + gt_boxes = annos['gt_bboxes'] + img_path = osp.split(example['img_info']['filename'])[-1] + if img_path not in file2id.keys(): + print(f'skip image {img_path} for empty mask') + continue + img_id = file2id[img_path] + kins_annIds = coco.getAnnIds(imgIds=img_id) + kins_raw_info = coco.loadAnns(kins_annIds) + kins_ann_info = _parse_coco_ann_info(kins_raw_info) + h, w = annos['img_shape'][:2] + gt_masks = [ + _poly2mask(mask, h, w) for mask in kins_ann_info['masks'] + ] + # get mask inds based on iou mapping + bbox_iou = bbox_overlaps(kins_ann_info['bboxes'], gt_boxes) + mask_inds = bbox_iou.argmax(axis=0) + valid_inds = (bbox_iou.max(axis=0) > 0.5) + + # mask the image + # use more precise crop when it is ready + # object_img_patches = np.ascontiguousarray( + # np.stack(object_img_patches, axis=0).transpose(0, 3, 1, 2)) + # crop image patches using roi_align + # object_img_patches = crop_image_patch_v2( + # torch.Tensor(gt_boxes), + # torch.Tensor(mask_inds).long(), 
object_img_patches) + object_img_patches, object_masks = crop_image_patch( + gt_boxes, gt_masks, mask_inds, annos['img']) + + for i in range(num_obj): + filename = f'{image_idx}_{names[i]}_{i}.bin' + abs_filepath = osp.join(database_save_path, filename) + rel_filepath = osp.join(f'{info_prefix}_gt_database', filename) + + # save point clouds and image patches for each object + gt_points = points[point_indices[:, i]] + gt_points[:, :3] -= gt_boxes_3d[i, :3] + + if with_mask: + if object_masks[i].sum() == 0 or not valid_inds[i]: + # Skip object for empty or invalid mask + continue + img_patch_path = abs_filepath + '.png' + mask_patch_path = abs_filepath + '.mask.png' + mmcv.imwrite(object_img_patches[i], img_patch_path) + mmcv.imwrite(object_masks[i], mask_patch_path) + + with open(abs_filepath, 'w') as f: + gt_points.tofile(f) + + if (used_classes is None) or names[i] in used_classes: + db_info = { + 'name': names[i], + 'path': rel_filepath, + 'image_idx': image_idx, + 'gt_idx': i, + 'box3d_lidar': gt_boxes_3d[i], + 'num_points_in_gt': gt_points.shape[0], + 'difficulty': difficulty[i], + } + local_group_id = group_ids[i] + # if local_group_id >= 0: + if local_group_id not in group_dict: + group_dict[local_group_id] = group_counter + group_counter += 1 + db_info['group_id'] = group_dict[local_group_id] + if 'score' in annos: + db_info['score'] = annos['score'][i] + if with_mask: + db_info.update({'box2d_camera': gt_boxes[i]}) + if names[i] in all_db_infos: + all_db_infos[names[i]].append(db_info) + else: + all_db_infos[names[i]] = [db_info] + + for k, v in all_db_infos.items(): + print(f'load {len(v)} {k} database infos') + + with open(db_info_save_path, 'wb') as f: + pickle.dump(all_db_infos, f) diff --git a/model_examples/MapTR/tools/data_converter/indoor_converter.py b/model_examples/MapTR/tools/data_converter/indoor_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..4072397605992869f63889c1b8d1dda5ad44817c --- /dev/null +++ 
def create_indoor_info_file(data_path,
                            pkl_prefix='sunrgbd',
                            save_path=None,
                            use_v1=False,
                            workers=4):
    """Create indoor information file.

    Collect the information of the raw data and dump it to pkl files. For
    ScanNet and S3DIS the segmentation helpers are additionally run through
    ``get_seg_infos()``.

    Args:
        data_path (str): Path of the data.
        pkl_prefix (str): Prefix of the pkl to be saved; one of 'sunrgbd',
            'scannet' or 's3dis'. Default: 'sunrgbd'.
        save_path (str): Path of the pkl to be saved. ``data_path`` is used
            when it is None. Default: None.
        use_v1 (bool): Whether to use v1. Default: False.
        workers (int): Number of threads to be used. Default: 4.
    """
    assert os.path.exists(data_path)
    assert pkl_prefix in ['sunrgbd', 'scannet', 's3dis'], \
        f'unsupported indoor dataset {pkl_prefix}'
    if save_path is None:
        save_path = data_path
    assert os.path.exists(save_path)

    def _info_file(tag):
        # Location of the info pkl for one split / area.
        return os.path.join(save_path, f'{pkl_prefix}_infos_{tag}.pkl')

    def _label_weight(x):
        # label weight computation function is adopted from
        # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
        return 1.0 / np.log(1.2 + x)

    is_scannet = pkl_prefix == 'scannet'

    # generate infos for both detection and segmentation task
    if pkl_prefix in ['sunrgbd', 'scannet']:
        train_filename = _info_file('train')
        val_filename = _info_file('val')
        if is_scannet:
            # ScanNet has a train-val-test split
            train_dataset = ScanNetData(root_path=data_path, split='train')
            val_dataset = ScanNetData(root_path=data_path, split='val')
            test_dataset = ScanNetData(root_path=data_path, split='test')
            test_filename = _info_file('test')
        else:
            # SUN RGB-D has a train-val split
            train_dataset = SUNRGBDData(
                root_path=data_path, split='train', use_v1=use_v1)
            val_dataset = SUNRGBDData(
                root_path=data_path, split='val', use_v1=use_v1)

        infos_train = train_dataset.get_infos(
            num_workers=workers, has_label=True)
        mmcv.dump(infos_train, train_filename, 'pkl')
        print(f'{pkl_prefix} info train file is saved to {train_filename}')

        infos_val = val_dataset.get_infos(num_workers=workers, has_label=True)
        mmcv.dump(infos_val, val_filename, 'pkl')
        print(f'{pkl_prefix} info val file is saved to {val_filename}')

        if is_scannet:
            # the test split carries no labels
            infos_test = test_dataset.get_infos(
                num_workers=workers, has_label=False)
            mmcv.dump(infos_test, test_filename, 'pkl')
            print(f'{pkl_prefix} info test file is saved to {test_filename}')

    # generate infos for the semantic segmentation task
    # e.g. re-sampled scene indexes and label weights
    # scene indexes are used to re-sample rooms with different number of points
    # label weights are used to balance classes with different number of points
    if is_scannet:
        train_dataset = ScanNetSegData(
            data_root=data_path,
            ann_file=train_filename,
            split='train',
            num_points=8192,
            label_weight_func=_label_weight)
        # TODO: do we need to generate on val set?
        val_dataset = ScanNetSegData(
            data_root=data_path,
            ann_file=val_filename,
            split='val',
            num_points=8192,
            label_weight_func=_label_weight)
        # no need to generate for test set
        train_dataset.get_seg_infos()
        val_dataset.get_seg_infos()
    elif pkl_prefix == 's3dis':
        # S3DIS doesn't have a fixed train-val split
        # it has 6 areas instead, so we generate info file for each of them
        # in training, we will use dataset to wrap different areas
        for area_id in range(1, 7):
            split = f'Area_{area_id}'
            dataset = S3DISData(root_path=data_path, split=split)
            info = dataset.get_infos(num_workers=workers, has_label=True)
            filename = _info_file(split)
            mmcv.dump(info, filename, 'pkl')
            print(f'{pkl_prefix} info {split} file is saved to {filename}')
            seg_dataset = S3DISSegData(
                data_root=data_path,
                ann_file=filename,
                split=split,
                num_points=4096,
                label_weight_func=_label_weight)
            seg_dataset.get_seg_infos()
def _read_imageset_file(path):
    """Read the sample indices listed in an ImageSets split file.

    Args:
        path (str): Path of the split file; every non-blank line holds one
            integer sample index (e.g. ``000123``).

    Returns:
        list[int]: The sample indices, in file order.
    """
    with open(path, 'r') as f:
        # Blank lines (typically a trailing newline at the end of the file)
        # carry no index and would make ``int`` raise ValueError, so they
        # are skipped instead of aborting the whole conversion.
        return [int(line) for line in f if line.strip()]
def create_kitti_info_file(data_path,
                           pkl_prefix='kitti',
                           save_path=None,
                           relative_path=True):
    """Create info file of KITTI dataset.

    Given the raw data, generate its related info files in pkl format: one
    each for train, val, trainval (train followed by val) and test.

    Args:
        data_path (str): Path of the data root.
        pkl_prefix (str): Prefix of the info file to be generated.
        save_path (str): Path to save the info file. ``data_path`` is used
            when it is None.
        relative_path (bool): Whether to use relative path.
    """
    split_dir = Path(data_path) / 'ImageSets'
    # All three split files are read up front so that a missing split is
    # reported before any expensive info generation starts.
    ids_of = {
        split: _read_imageset_file(str(split_dir / f'{split}.txt'))
        for split in ('train', 'val', 'test')
    }

    print('Generate info. this may take several minutes.')
    out_dir = Path(data_path) if save_path is None else Path(save_path)

    labelled_infos = []
    for split in ('train', 'val'):
        infos = get_kitti_image_info(
            data_path,
            training=True,
            velodyne=True,
            calib=True,
            image_ids=ids_of[split],
            relative_path=relative_path)
        _calculate_num_points_in_gt(data_path, infos, relative_path)
        filename = out_dir / f'{pkl_prefix}_infos_{split}.pkl'
        print(f'Kitti info {split} file is saved to {filename}')
        mmcv.dump(infos, filename)
        labelled_infos.append(infos)

    train_infos, val_infos = labelled_infos
    filename = out_dir / f'{pkl_prefix}_infos_trainval.pkl'
    print(f'Kitti info trainval file is saved to {filename}')
    mmcv.dump(train_infos + val_infos, filename)

    # The test split has no labels, hence no point counting either.
    test_infos = get_kitti_image_info(
        data_path,
        training=False,
        label_info=False,
        velodyne=True,
        calib=True,
        image_ids=ids_of['test'],
        relative_path=relative_path)
    filename = out_dir / f'{pkl_prefix}_infos_test.pkl'
    print(f'Kitti info test file is saved to {filename}')
    mmcv.dump(test_infos, filename)
def _create_reduced_point_cloud(data_path,
                                info_path,
                                save_path=None,
                                back=False,
                                num_features=4,
                                front_camera_id=2):
    """Create reduced point clouds for given info.

    Every point cloud referenced by the info file is passed through
    ``box_np_ops.remove_outside_points`` and written back as raw float32.

    Args:
        data_path (str): Path of original data.
        info_path (str): Path of data info.
        save_path (str | None): Path to save reduced point cloud data.
            When None, a sibling directory of the velodyne folder named
            ``<velodyne folder>_reduced`` is used (and created if needed).
            Default: None.
        back (bool): Whether to flip the points to back. The output file
            name then gets a ``_back`` suffix.
        num_features (int): Number of point features. Default: 4.
        front_camera_id (int): The referenced/front camera ID. Default: 2.
    """
    kitti_infos = mmcv.load(info_path)

    for info in mmcv.track_iter_progress(kitti_infos):
        pc_info = info['point_cloud']
        image_info = info['image']
        calib = info['calib']

        v_path = Path(data_path) / pc_info['velodyne_path']
        points_v = np.fromfile(
            str(v_path), dtype=np.float32,
            count=-1).reshape([-1, num_features])
        rect = calib['R0_rect']
        if front_camera_id == 2:
            P2 = calib['P2']
        else:
            P2 = calib[f'P{str(front_camera_id)}']
        Trv2c = calib['Tr_velo_to_cam']
        # first remove z < 0 points
        # keep = points_v[:, -1] > 0
        # points_v = points_v[keep]
        # then remove outside.
        if back:
            points_v[:, 0] = -points_v[:, 0]
        points_v = box_np_ops.remove_outside_points(points_v, rect, Trv2c, P2,
                                                    image_info['image_shape'])
        if save_path is None:
            save_dir = v_path.parent.parent / (v_path.parent.stem + '_reduced')
            # exist_ok avoids the check-then-create race of a separate
            # ``exists()`` test.
            save_dir.mkdir(exist_ok=True)
        else:
            save_dir = Path(save_path)
        # Build the name as ``str`` in both branches: appending the suffix
        # to a ``Path`` with ``+=`` raises TypeError.
        save_filename = str(save_dir / v_path.name)
        if back:
            save_filename += '_back'
        # The dump is raw float32 data, so the file must be opened in binary
        # mode (text mode may translate bytes on some platforms).
        with open(save_filename, 'wb') as f:
            points_v.tofile(f)
def export_2d_annotation(root_path, info_path, mono3d=True):
    """Export 2d annotation from the info file and raw data.

    The result is dumped next to the info file as
    ``<info file without extension>[_mono3d].coco.json``.

    Args:
        root_path (str): Root path of the raw data.
        info_path (str): Path of the info file.
        mono3d (bool): Whether to export mono3d annotation. Default: True.
    """
    from os import path as osp

    # get bbox annotations for camera
    kitti_infos = mmcv.load(info_path)
    cat2Ids = [
        dict(id=cat_id, name=cat_name)
        for cat_id, cat_name in enumerate(kitti_categories)
    ]
    coco_ann_id = 0
    coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
    for info in mmcv.track_iter_progress(kitti_infos):
        coco_infos = get_2d_boxes(info, occluded=[0, 1, 2, 3], mono3d=mono3d)
        image_meta = info['image']
        calib = info['calib']
        # The image is only read to obtain its size.
        height, width, _ = mmcv.imread(
            osp.join(root_path, image_meta['image_path'])).shape
        coco_2d_dict['images'].append(
            dict(
                file_name=image_meta['image_path'],
                id=image_meta['image_idx'],
                Tri2v=calib['Tr_imu_to_velo'],
                Trv2c=calib['Tr_velo_to_cam'],
                rect=calib['R0_rect'],
                cam_intrinsic=calib['P2'],
                width=width,
                height=height))
        for coco_info in coco_infos:
            # boxes of categories outside ``kitti_categories`` come back as
            # None and are dropped here
            if coco_info is None:
                continue
            # add an empty key for coco format
            coco_info['segmentation'] = []
            coco_info['id'] = coco_ann_id
            coco_2d_dict['annotations'].append(coco_info)
            coco_ann_id += 1

    # Strip the real extension instead of blindly cutting four characters;
    # identical for '*.pkl' and also works for path-like objects.
    json_prefix = osp.splitext(str(info_path))[0]
    if mono3d:
        json_prefix = f'{json_prefix}_mono3d'
    mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json')
+ ann_dicts = info['annos'] + mask = [(ocld in occluded) for ocld in ann_dicts['occluded']] + for k in ann_dicts.keys(): + ann_dicts[k] = ann_dicts[k][mask] + + # convert dict of list to list of dict + ann_recs = [] + for i in range(len(ann_dicts['occluded'])): + ann_rec = {} + for k in ann_dicts.keys(): + ann_rec[k] = ann_dicts[k][i] + ann_recs.append(ann_rec) + + for ann_idx, ann_rec in enumerate(ann_recs): + # Augment sample_annotation with token information. + ann_rec['sample_annotation_token'] = \ + f"{info['image']['image_idx']}.{ann_idx}" + ann_rec['sample_data_token'] = info['image']['image_idx'] + sample_data_token = info['image']['image_idx'] + + loc = ann_rec['location'][np.newaxis, :] + dim = ann_rec['dimensions'][np.newaxis, :] + rot = ann_rec['rotation_y'][np.newaxis, np.newaxis] + # transform the center from [0.5, 1.0, 0.5] to [0.5, 0.5, 0.5] + dst = np.array([0.5, 0.5, 0.5]) + src = np.array([0.5, 1.0, 0.5]) + loc = loc + dim * (dst - src) + offset = (info['calib']['P2'][0, 3] - info['calib']['P0'][0, 3]) \ + / info['calib']['P2'][0, 0] + loc_3d = np.copy(loc) + loc_3d[0, 0] += offset + gt_bbox_3d = np.concatenate([loc, dim, rot], axis=1).astype(np.float32) + + # Filter out the corners that are not in front of the calibrated + # sensor. + corners_3d = box_np_ops.center_to_corner_box3d( + gt_bbox_3d[:, :3], + gt_bbox_3d[:, 3:6], + gt_bbox_3d[:, 6], [0.5, 0.5, 0.5], + axis=1) + corners_3d = corners_3d[0].T # (1, 8, 3) -> (3, 8) + in_front = np.argwhere(corners_3d[2, :] > 0).flatten() + corners_3d = corners_3d[:, in_front] + + # Project 3d box to 2d. + camera_intrinsic = P2 + corner_coords = view_points(corners_3d, camera_intrinsic, + True).T[:, :2].tolist() + + # Keep only corners that fall within the image. + final_coords = post_process_coords(corner_coords) + + # Skip if the convex hull of the re-projected corners + # does not intersect the image canvas. 
def generate_record(ann_rec, x1, y1, x2, y2, sample_data_token, filename):
    """Generate one 2D annotation record given various information on top of
    the 2D bounding box coordinates.

    Args:
        ann_rec (dict): Original 3d annotation record. Only its ``'name'``
            entry is read.
        x1 (float): Minimum value of the x coordinate.
        y1 (float): Minimum value of the y coordinate.
        x2 (float): Maximum value of the x coordinate.
        y2 (float): Maximum value of the y coordinate.
        sample_data_token (str): Sample data token.
        filename (str): The corresponding image file where the annotation
            is present.

    Returns:
        dict | None: A sample 2D annotation record, or None when the
        category of ``ann_rec`` is not in ``kitti_categories``.

            - file_name (str): file name
            - image_id (str): sample data token
            - area (float): 2d box area
            - category_name (str): category name
            - category_id (int): category id
            - bbox (list[float]): left x, top y, dx, dy of 2d box
            - iscrowd (int): whether the area is crowd

    Raises:
        KeyError: If ``ann_rec`` has no ``'name'`` entry.
    """
    # Only the category name of the annotation ends up in the record, so it
    # is read directly rather than through an intermediate remapped copy.
    cat_name = ann_rec['name']
    if cat_name not in kitti_categories:
        return None

    coco_rec = dict()
    coco_rec['file_name'] = filename
    coco_rec['image_id'] = sample_data_token
    coco_rec['area'] = (y2 - y1) * (x2 - x1)
    coco_rec['category_name'] = cat_name
    coco_rec['category_id'] = kitti_categories.index(cat_name)
    coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
    coco_rec['iscrowd'] = 0

    return coco_rec
def get_image_index_str(img_idx, use_prefix_id=False):
    """Format a sample index as a zero-padded string.

    Args:
        img_idx (int): Sample index.
        use_prefix_id (bool): Pad to 7 digits instead of 6.

    Returns:
        str: The zero-padded index.
    """
    width = 7 if use_prefix_id else 6
    return f'{img_idx:0{width}d}'


def get_kitti_info_path(idx,
                        prefix,
                        info_type='image_2',
                        file_tail='.png',
                        training=True,
                        relative_path=True,
                        exist_check=True,
                        use_prefix_id=False):
    """Build the path ``<training|testing>/<info_type>/<index><file_tail>``.

    Args:
        idx (int): Sample index.
        prefix (str): Data root the path is located under.
        info_type (str): Sub-folder name. Default: 'image_2'.
        file_tail (str): File extension including the dot. Default: '.png'.
        training (bool): Use the 'training' folder, else 'testing'.
        relative_path (bool): Return the path relative to ``prefix`` instead
            of the path joined with ``prefix``.
        exist_check (bool): Raise when the file does not exist under
            ``prefix``.
        use_prefix_id (bool): Use the 7-digit index format.

    Returns:
        str: The requested path.

    Raises:
        ValueError: If ``exist_check`` is set and the file is missing.
    """
    split_dir = 'training' if training else 'testing'
    file_name = get_image_index_str(idx, use_prefix_id) + file_tail
    rel_file = Path(split_dir) / info_type / file_name
    abs_file = Path(prefix) / rel_file
    if exist_check and not abs_file.exists():
        raise ValueError('file not exist: {}'.format(rel_file))
    return str(rel_file) if relative_path else str(abs_file)


def get_image_path(idx,
                   prefix,
                   training=True,
                   relative_path=True,
                   exist_check=True,
                   info_type='image_2',
                   use_prefix_id=False):
    """Path of the ``.png`` image of sample ``idx`` (folder ``info_type``)."""
    return get_kitti_info_path(
        idx,
        prefix,
        info_type=info_type,
        file_tail='.png',
        training=training,
        relative_path=relative_path,
        exist_check=exist_check,
        use_prefix_id=use_prefix_id)


def get_label_path(idx,
                   prefix,
                   training=True,
                   relative_path=True,
                   exist_check=True,
                   info_type='label_2',
                   use_prefix_id=False):
    """Path of the ``.txt`` label of sample ``idx`` (folder ``info_type``)."""
    return get_kitti_info_path(
        idx,
        prefix,
        info_type=info_type,
        file_tail='.txt',
        training=training,
        relative_path=relative_path,
        exist_check=exist_check,
        use_prefix_id=use_prefix_id)


def get_velodyne_path(idx,
                      prefix,
                      training=True,
                      relative_path=True,
                      exist_check=True,
                      use_prefix_id=False):
    """Path of the ``.bin`` point cloud of sample ``idx`` ('velodyne')."""
    return get_kitti_info_path(
        idx,
        prefix,
        info_type='velodyne',
        file_tail='.bin',
        training=training,
        relative_path=relative_path,
        exist_check=exist_check,
        use_prefix_id=use_prefix_id)


def get_calib_path(idx,
                   prefix,
                   training=True,
                   relative_path=True,
                   exist_check=True,
                   use_prefix_id=False):
    """Path of the ``.txt`` calibration file of sample ``idx`` ('calib')."""
    return get_kitti_info_path(
        idx,
        prefix,
        info_type='calib',
        file_tail='.txt',
        training=training,
        relative_path=relative_path,
        exist_check=exist_check,
        use_prefix_id=use_prefix_id)
def get_label_anno(label_path):
    """Parse a KITTI-format label file into a dict of arrays.

    Every non-blank line describes one object with whitespace-separated
    fields: name, truncated, occluded, alpha, bbox (4), dimensions (3),
    location (3), rotation_y and an optional 16th score field.

    Args:
        label_path (str): Path of the label ``.txt`` file.

    Returns:
        dict: Arrays with one entry per object under the keys 'name',
        'truncated', 'occluded', 'alpha', 'bbox' (N, 4), 'dimensions'
        (N, 3), 'location' (N, 3), 'rotation_y', 'score' (zeros when the
        file has no score column), 'index' (running index for objects that
        are not 'DontCare', -1 for the remaining ones) and 'group_ids'.
    """
    with open(label_path, 'r') as f:
        # Blank lines carry no object, and ``split()`` without argument
        # tolerates repeated spaces / tabs; both used to end in IndexError
        # or ``float('')`` errors.
        content = [line.split() for line in f if line.strip()]

    annotations = {}
    num_objects = sum(1 for x in content if x[0] != 'DontCare')
    annotations['name'] = np.array([x[0] for x in content])
    num_gt = len(annotations['name'])
    annotations['truncated'] = np.array([float(x[1]) for x in content])
    annotations['occluded'] = np.array([int(x[2]) for x in content])
    annotations['alpha'] = np.array([float(x[3]) for x in content])
    annotations['bbox'] = np.array([[float(info) for info in x[4:8]]
                                    for x in content]).reshape(-1, 4)
    # dimensions will convert hwl format to standard lhw(camera) format.
    annotations['dimensions'] = np.array([[float(info) for info in x[8:11]]
                                          for x in content
                                          ]).reshape(-1, 3)[:, [2, 0, 1]]
    annotations['location'] = np.array([[float(info) for info in x[11:14]]
                                        for x in content]).reshape(-1, 3)
    annotations['rotation_y'] = np.array([float(x[14])
                                          for x in content]).reshape(-1)
    if len(content) != 0 and len(content[0]) == 16:  # have score
        annotations['score'] = np.array([float(x[15]) for x in content])
    else:
        annotations['score'] = np.zeros((annotations['bbox'].shape[0], ))
    # The first ``num_objects`` entries get a running index, the rest -1.
    index = list(range(num_objects)) + [-1] * (num_gt - num_objects)
    annotations['index'] = np.array(index, dtype=np.int32)
    annotations['group_ids'] = np.arange(num_gt, dtype=np.int32)
    return annotations
+ } + annos: { + location: [num_gt, 3] array + dimensions: [num_gt, 3] array + rotation_y: [num_gt] angle array + name: [num_gt] ground truth name array + [optional]difficulty: kitti difficulty + [optional]group_ids: used for multi-part object + } + } + """ + root_path = Path(path) + if not isinstance(image_ids, list): + image_ids = list(range(image_ids)) + + def map_func(idx): + info = {} + pc_info = {'num_features': 4} + calib_info = {} + + image_info = {'image_idx': idx} + annotations = None + if velodyne: + pc_info['velodyne_path'] = get_velodyne_path( + idx, path, training, relative_path) + image_info['image_path'] = get_image_path(idx, path, training, + relative_path) + if with_imageshape: + img_path = image_info['image_path'] + if relative_path: + img_path = str(root_path / img_path) + image_info['image_shape'] = np.array( + io.imread(img_path).shape[:2], dtype=np.int32) + if label_info: + label_path = get_label_path(idx, path, training, relative_path) + if relative_path: + label_path = str(root_path / label_path) + annotations = get_label_anno(label_path) + info['image'] = image_info + info['point_cloud'] = pc_info + if calib: + calib_path = get_calib_path( + idx, path, training, relative_path=False) + with open(calib_path, 'r') as f: + lines = f.readlines() + P0 = np.array([float(info) for info in lines[0].split(' ')[1:13] + ]).reshape([3, 4]) + P1 = np.array([float(info) for info in lines[1].split(' ')[1:13] + ]).reshape([3, 4]) + P2 = np.array([float(info) for info in lines[2].split(' ')[1:13] + ]).reshape([3, 4]) + P3 = np.array([float(info) for info in lines[3].split(' ')[1:13] + ]).reshape([3, 4]) + if extend_matrix: + P0 = _extend_matrix(P0) + P1 = _extend_matrix(P1) + P2 = _extend_matrix(P2) + P3 = _extend_matrix(P3) + R0_rect = np.array([ + float(info) for info in lines[4].split(' ')[1:10] + ]).reshape([3, 3]) + if extend_matrix: + rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype) + rect_4x4[3, 3] = 1. 
+ rect_4x4[:3, :3] = R0_rect + else: + rect_4x4 = R0_rect + + Tr_velo_to_cam = np.array([ + float(info) for info in lines[5].split(' ')[1:13] + ]).reshape([3, 4]) + Tr_imu_to_velo = np.array([ + float(info) for info in lines[6].split(' ')[1:13] + ]).reshape([3, 4]) + if extend_matrix: + Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam) + Tr_imu_to_velo = _extend_matrix(Tr_imu_to_velo) + calib_info['P0'] = P0 + calib_info['P1'] = P1 + calib_info['P2'] = P2 + calib_info['P3'] = P3 + calib_info['R0_rect'] = rect_4x4 + calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam + calib_info['Tr_imu_to_velo'] = Tr_imu_to_velo + info['calib'] = calib_info + + if annotations is not None: + info['annos'] = annotations + add_difficulty_to_annos(info) + return info + + with futures.ThreadPoolExecutor(num_worker) as executor: + image_infos = executor.map(map_func, image_ids) + + return list(image_infos) + + +def get_waymo_image_info(path, + training=True, + label_info=True, + velodyne=False, + calib=False, + pose=False, + image_ids=7481, + extend_matrix=True, + num_worker=8, + relative_path=True, + with_imageshape=True, + max_sweeps=5): + """ + Waymo annotation format version like KITTI: + { + [optional]points: [N, 3+] point cloud + [optional, for kitti]image: { + image_idx: ... + image_path: ... + image_shape: ... + } + point_cloud: { + num_features: 6 + velodyne_path: ... + } + [optional, for kitti]calib: { + R0_rect: ... + Tr_velo_to_cam0: ... + P0: ... 
+ } + annos: { + location: [num_gt, 3] array + dimensions: [num_gt, 3] array + rotation_y: [num_gt] angle array + name: [num_gt] ground truth name array + [optional]difficulty: kitti difficulty + [optional]group_ids: used for multi-part object + } + } + """ + root_path = Path(path) + if not isinstance(image_ids, list): + image_ids = list(range(image_ids)) + + def map_func(idx): + info = {} + pc_info = {'num_features': 6} + calib_info = {} + + image_info = {'image_idx': idx} + annotations = None + if velodyne: + pc_info['velodyne_path'] = get_velodyne_path( + idx, path, training, relative_path, use_prefix_id=True) + points = np.fromfile( + Path(path) / pc_info['velodyne_path'], dtype=np.float32) + points = np.copy(points).reshape(-1, pc_info['num_features']) + info['timestamp'] = np.int64(points[0, -1]) + # values of the last dim are all the timestamp + image_info['image_path'] = get_image_path( + idx, + path, + training, + relative_path, + info_type='image_0', + use_prefix_id=True) + if with_imageshape: + img_path = image_info['image_path'] + if relative_path: + img_path = str(root_path / img_path) + image_info['image_shape'] = np.array( + io.imread(img_path).shape[:2], dtype=np.int32) + if label_info: + label_path = get_label_path( + idx, + path, + training, + relative_path, + info_type='label_all', + use_prefix_id=True) + if relative_path: + label_path = str(root_path / label_path) + annotations = get_label_anno(label_path) + info['image'] = image_info + info['point_cloud'] = pc_info + if calib: + calib_path = get_calib_path( + idx, path, training, relative_path=False, use_prefix_id=True) + with open(calib_path, 'r') as f: + lines = f.readlines() + P0 = np.array([float(info) for info in lines[0].split(' ')[1:13] + ]).reshape([3, 4]) + P1 = np.array([float(info) for info in lines[1].split(' ')[1:13] + ]).reshape([3, 4]) + P2 = np.array([float(info) for info in lines[2].split(' ')[1:13] + ]).reshape([3, 4]) + P3 = np.array([float(info) for info in lines[3].split(' 
')[1:13] + ]).reshape([3, 4]) + P4 = np.array([float(info) for info in lines[4].split(' ')[1:13] + ]).reshape([3, 4]) + if extend_matrix: + P0 = _extend_matrix(P0) + P1 = _extend_matrix(P1) + P2 = _extend_matrix(P2) + P3 = _extend_matrix(P3) + P4 = _extend_matrix(P4) + R0_rect = np.array([ + float(info) for info in lines[5].split(' ')[1:10] + ]).reshape([3, 3]) + if extend_matrix: + rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype) + rect_4x4[3, 3] = 1. + rect_4x4[:3, :3] = R0_rect + else: + rect_4x4 = R0_rect + + Tr_velo_to_cam = np.array([ + float(info) for info in lines[6].split(' ')[1:13] + ]).reshape([3, 4]) + if extend_matrix: + Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam) + calib_info['P0'] = P0 + calib_info['P1'] = P1 + calib_info['P2'] = P2 + calib_info['P3'] = P3 + calib_info['P4'] = P4 + calib_info['R0_rect'] = rect_4x4 + calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam + info['calib'] = calib_info + if pose: + pose_path = get_pose_path( + idx, path, training, relative_path=False, use_prefix_id=True) + info['pose'] = np.loadtxt(pose_path) + + if annotations is not None: + info['annos'] = annotations + info['annos']['camera_id'] = info['annos'].pop('score') + add_difficulty_to_annos(info) + + sweeps = [] + prev_idx = idx + while len(sweeps) < max_sweeps: + prev_info = {} + prev_idx -= 1 + prev_info['velodyne_path'] = get_velodyne_path( + prev_idx, + path, + training, + relative_path, + exist_check=False, + use_prefix_id=True) + if_prev_exists = osp.exists( + Path(path) / prev_info['velodyne_path']) + if if_prev_exists: + prev_points = np.fromfile( + Path(path) / prev_info['velodyne_path'], dtype=np.float32) + prev_points = np.copy(prev_points).reshape( + -1, pc_info['num_features']) + prev_info['timestamp'] = np.int64(prev_points[0, -1]) + prev_pose_path = get_pose_path( + prev_idx, + path, + training, + relative_path=False, + use_prefix_id=True) + prev_info['pose'] = np.loadtxt(prev_pose_path) + sweeps.append(prev_info) + else: + break + info['sweeps'] = 
def kitti_anno_to_label_file(annos, folder):
    """Write annotations as KITTI label files, one ``<index>.txt`` each.

    Args:
        annos (list[dict]): Annotations; each provides
            ``anno['metadata']['image_idx']`` and per-object arrays 'name',
            'alpha', 'bbox', 'location', 'dimensions', 'rotation_y' and
            'score'.
        folder (str): Directory the label files are written to.
    """
    folder = Path(folder)
    for anno in annos:
        image_idx = anno['metadata']['image_idx']
        label_lines = []
        for j in range(anno['bbox'].shape[0]):
            label_dict = {
                'name': anno['name'][j],
                'alpha': anno['alpha'][j],
                'bbox': anno['bbox'][j],
                'location': anno['location'][j],
                'dimensions': anno['dimensions'][j],
                'rotation_y': anno['rotation_y'][j],
                'score': anno['score'][j],
            }
            label_lines.append(kitti_result_line(label_dict))
        label_file = folder / f'{get_image_index_str(image_idx)}.txt'
        with open(label_file, 'w') as f:
            f.write('\n'.join(label_lines))


def add_difficulty_to_annos(info):
    """Attach a difficulty level to every annotation of ``info``.

    An object passes a level when its 2D box height is above the minimum
    height and its occlusion / truncation do not exceed the maxima of that
    level. The lowest level passed is stored: 0 (easy), 1 (moderate),
    2 (hard), or -1 when no level is passed.

    Args:
        info (dict): Sample info whose ``info['annos']`` holds the arrays
            'dimensions', 'bbox' (N, 4), 'occluded' and 'truncated'.
            ``info['annos']['difficulty']`` (int32 array) is added in place.

    Returns:
        list[int]: The difficulty of every annotation.
    """
    min_height = [40, 25,
                  25]  # minimum height for evaluated groundtruth/detections
    max_occlusion = [
        0, 1, 2
    ]  # maximum occlusion level of the groundtruth used for evaluation
    max_trunc = [
        0.15, 0.3, 0.5
    ]  # maximum truncation level of the groundtruth used for evaluation
    annos = info['annos']
    dims = annos['dimensions']  # lhw format
    bbox = annos['bbox']
    height = bbox[:, 3] - bbox[:, 1]
    occlusion = np.asarray(annos['occluded'])
    truncation = np.asarray(annos['truncated'])

    difficulty = np.full((len(dims), ), -1, dtype=np.int32)
    # Go from hard to easy so that an easier level an object also passes
    # overwrites the harder one. Plain ``bool`` masks are used because the
    # ``np.bool`` alias no longer exists in current NumPy releases.
    for level in (2, 1, 0):
        rejected = ((occlusion > max_occlusion[level])
                    | (height <= min_height[level])
                    | (truncation > max_trunc[level]))
        difficulty[~rejected] = level

    annos['difficulty'] = difficulty
    return difficulty.tolist()


def kitti_result_line(result_dict, precision=4):
    """Format one object as a line of a KITTI label / result file.

    Fields are written in the order name, truncated, occluded, alpha,
    bbox, dimensions, location, rotation_y, score. Optional fields that
    are missing or None are written with their KITTI default value.

    Args:
        result_dict (dict): Field values; 'name' and 'bbox' are required.
        precision (int): Number of decimals for float fields. Default: 4.

    Returns:
        str: The space-separated line.

    Raises:
        ValueError: If ``result_dict`` holds an unknown key or lacks a
            value for a required field.
    """
    prec_float = '{' + ':.{}f'.format(precision) + '}'
    all_field_default = OrderedDict([
        ('name', None),
        ('truncated', -1),
        ('occluded', -1),
        ('alpha', -10),
        ('bbox', None),
        ('dimensions', [-1, -1, -1]),
        ('location', [-1000, -1000, -1000]),
        ('rotation_y', -10),
        ('score', 0.0),
    ])
    res_dict = OrderedDict((key, None) for key in all_field_default)
    for key, val in result_dict.items():
        # Checked before the default lookup; otherwise an unknown key
        # surfaces as a bare KeyError.
        if key not in all_field_default:
            raise ValueError('unknown key. supported key:{}'.format(
                list(all_field_default)))
        res_dict[key] = val

    # A field without default must be given, no matter whether it was
    # passed as None or left out altogether.
    for key, default in all_field_default.items():
        if default is None and res_dict[key] is None:
            raise ValueError('you must specify a value for {}'.format(key))

    res_line = []
    for key, val in res_dict.items():
        if key == 'name':
            res_line.append(str(val))
        elif key in ['truncated', 'alpha', 'rotation_y', 'score']:
            if val is None:
                res_line.append(str(all_field_default[key]))
            else:
                res_line.append(prec_float.format(val))
        elif key == 'occluded':
            if val is None:
                res_line.append(str(all_field_default[key]))
            else:
                res_line.append('{}'.format(val))
        else:  # 'bbox', 'dimensions', 'location'
            if val is None:
                res_line += [str(v) for v in all_field_default[key]]
            else:
                res_line += [prec_float.format(v) for v in val]
    return ' '.join(res_line)
def create_lyft_infos(root_path,
                      info_prefix,
                      version='v1.01-train',
                      max_sweeps=10):
    """Create info file of lyft dataset.

    Given the raw data, generate its related info file in pkl format.
    For the train version two files are written
    (``{info_prefix}_infos_train.pkl`` and ``{info_prefix}_infos_val.pkl``);
    for the test version a single ``{info_prefix}_infos_test.pkl``.

    Args:
        root_path (str): Path of the data root.
        info_prefix (str): Prefix of the info file to be generated.
        version (str): Version of the data.
            Default: 'v1.01-train'
        max_sweeps (int): Max number of sweeps.
            Default: 10
    """
    lyft = Lyft(
        data_path=osp.join(root_path, version),
        json_path=osp.join(root_path, version, version),
        verbose=True)
    available_vers = ['v1.01-train', 'v1.01-test']
    assert version in available_vers
    if version == 'v1.01-train':
        train_scenes = mmcv.list_from_file('data/lyft/train.txt')
        val_scenes = mmcv.list_from_file('data/lyft/val.txt')
    elif version == 'v1.01-test':
        train_scenes = mmcv.list_from_file('data/lyft/test.txt')
        val_scenes = []
    else:
        # Only reachable when assertions are disabled (python -O).
        raise ValueError('unknown')

    # Keep only the scenes that actually exist on disk and translate their
    # names into scene tokens. ``setdefault`` keeps the first scene for a
    # repeated name, which is what a ``list.index`` lookup would select.
    name_to_token = {}
    for scene in get_available_scenes(lyft):
        name_to_token.setdefault(scene['name'], scene['token'])
    train_scenes = {
        name_to_token[name] for name in train_scenes
        if name in name_to_token
    }
    val_scenes = {
        name_to_token[name] for name in val_scenes if name in name_to_token
    }

    test = 'test' in version
    if test:
        print(f'test scene: {len(train_scenes)}')
    else:
        # Adjacent literals instead of a backslash inside the string, so
        # the source indentation does not leak into the message.
        print(f'train scene: {len(train_scenes)}, '
              f'val scene: {len(val_scenes)}')
    train_lyft_infos, val_lyft_infos = _fill_trainval_infos(
        lyft, train_scenes, val_scenes, test, max_sweeps=max_sweeps)

    metadata = dict(version=version)
    if test:
        print(f'test sample: {len(train_lyft_infos)}')
        info_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl')
        mmcv.dump(dict(infos=train_lyft_infos, metadata=metadata), info_path)
    else:
        print(f'train sample: {len(train_lyft_infos)}, '
              f'val sample: {len(val_lyft_infos)}')
        info_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl')
        mmcv.dump(dict(infos=train_lyft_infos, metadata=metadata), info_path)
        info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl')
        mmcv.dump(
            dict(infos=val_lyft_infos, metadata=metadata), info_val_path)
+ max_sweeps (int): Max number of sweeps. Default: 10. + + Returns: + tuple[list[dict]]: Information of training set and + validation set that will be saved to the info file. + """ + train_lyft_infos = [] + val_lyft_infos = [] + + for sample in mmcv.track_iter_progress(lyft.sample): + lidar_token = sample['data']['LIDAR_TOP'] + sd_rec = lyft.get('sample_data', sample['data']['LIDAR_TOP']) + cs_record = lyft.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = lyft.get('ego_pose', sd_rec['ego_pose_token']) + abs_lidar_path, boxes, _ = lyft.get_sample_data(lidar_token) + # nuScenes devkit returns more convenient relative paths while + # lyft devkit returns absolute paths + abs_lidar_path = str(abs_lidar_path) # absolute path + lidar_path = abs_lidar_path.split(f'{os.getcwd()}/')[-1] + # relative path + + mmcv.check_file_exist(lidar_path) + + info = { + 'lidar_path': lidar_path, + 'token': sample['token'], + 'sweeps': [], + 'cams': dict(), + 'lidar2ego_translation': cs_record['translation'], + 'lidar2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sample['timestamp'], + } + + l2e_r = info['lidar2ego_rotation'] + l2e_t = info['lidar2ego_translation'] + e2g_r = info['ego2global_rotation'] + e2g_t = info['ego2global_translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + + # obtain 6 image's information per frame + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + for cam in camera_types: + cam_token = sample['data'][cam] + cam_path, _, cam_intrinsic = lyft.get_sample_data(cam_token) + cam_info = obtain_sensor2top(lyft, cam_token, l2e_t, l2e_r_mat, + e2g_t, e2g_r_mat, cam) + cam_info.update(cam_intrinsic=cam_intrinsic) + info['cams'].update({cam: cam_info}) + + # obtain sweeps for a single key-frame + 
def export_2d_annotation(root_path, info_path, version):
    """Export 2d annotation from the info file and raw data.

    The result is written next to ``info_path`` as
    ``<info_path without extension>.coco.json``. A warning is emitted on
    every call because 2D annotations are not used on the Lyft dataset.

    Args:
        root_path (str): Root path of the raw data.
        info_path (str): Path of the info file.
        version (str): Dataset version.
    """
    # Imported locally: the ``warning`` name imported from ``logging`` at
    # the top of the file is a logging function and has no ``warn``
    # attribute, so the standard ``warnings`` module is used instead.
    import warnings
    warnings.warn('DeprecationWarning: 2D annotations are not used on the '
                  'Lyft dataset. The function export_2d_annotation will be '
                  'deprecated.')
    # get bbox annotations for camera
    camera_types = [
        'CAM_FRONT',
        'CAM_FRONT_RIGHT',
        'CAM_FRONT_LEFT',
        'CAM_BACK',
        'CAM_BACK_LEFT',
        'CAM_BACK_RIGHT',
    ]
    lyft_infos = mmcv.load(info_path)['infos']
    lyft = Lyft(
        data_path=osp.join(root_path, version),
        json_path=osp.join(root_path, version, version),
        verbose=True)
    # Category ids are the positions inside ``lyft_categories``.
    cat2Ids = [
        dict(id=cat_id, name=cat_name)
        for cat_id, cat_name in enumerate(lyft_categories)
    ]
    coco_ann_id = 0
    coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
    for info in mmcv.track_iter_progress(lyft_infos):
        for cam in camera_types:
            cam_info = info['cams'][cam]
            coco_infos = get_2d_boxes(
                lyft,
                cam_info['sample_data_token'],
                visibilities=['', '1', '2', '3', '4'])
            # The image is read only to obtain its size.
            (height, width, _) = mmcv.imread(cam_info['data_path']).shape
            coco_2d_dict['images'].append(
                dict(
                    file_name=cam_info['data_path'],
                    id=cam_info['sample_data_token'],
                    width=width,
                    height=height))
            for coco_info in coco_infos:
                if coco_info is None:
                    continue
                # add an empty key for coco format
                coco_info['segmentation'] = []
                coco_info['id'] = coco_ann_id
                coco_2d_dict['annotations'].append(coco_info)
                coco_ann_id += 1
    # Strip the real extension rather than a fixed four characters, so
    # info files not ending in ``.pkl`` are handled as well.
    mmcv.dump(coco_2d_dict, f'{osp.splitext(info_path)[0]}.coco.json')
def fix_lyft(root_folder='./data/lyft', version='v1.01'):
    """Pad one known-truncated Lyft lidar file so it reshapes to (-1, 5).

    The file ``lidar/host-a011_lidar1_1233090652702363606.bin`` inside
    ``<root_folder>/<version>-train`` is read as float32. If its length
    cannot be reshaped to rows of five values, ``100.0`` and ``1.0`` are
    appended and the file is rewritten in place; otherwise it is left
    untouched. A message describing the outcome is printed either way.

    Args:
        root_folder (str): Root path of the Lyft dataset.
            Default: './data/lyft'
        version (str): Dataset version; ``-train`` is appended to form the
            sub-folder name. Default: 'v1.01'
    """
    # refer to https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000  # noqa
    lidar_path = 'lidar/host-a011_lidar1_1233090652702363606.bin'
    root_folder = os.path.join(root_folder, f'{version}-train')
    lidar_path = os.path.join(root_folder, lidar_path)
    assert os.path.isfile(lidar_path), f'Please download the complete Lyft ' \
        f'dataset and make sure {lidar_path} is present.'
    points = np.fromfile(lidar_path, dtype=np.float32, count=-1)
    try:
        # Only the reshape can raise; its result is not needed.
        points.reshape([-1, 5])
    except ValueError:
        # Append in float32 directly instead of converting the whole
        # point cloud to a Python list first.
        padding = np.array([100.0, 1.0], dtype=np.float32)
        np.concatenate([points, padding]).tofile(lidar_path)
        print(f'Appended 100.0 and 1.0 to the end of {lidar_path}.')
    else:
        print(f'This fix is not required for version {version}.')
def parse_args():
    """Parse the command line options of the nuImages converter.

    Returns:
        argparse.Namespace: Namespace with ``data_root``, ``version``
        (a list, one or more values), ``out_dir``, ``nproc`` and
        ``extra_tag``.
    """
    # (flag, add_argument keyword arguments), in registration order.
    option_specs = (
        ('--data-root',
         dict(
             type=str,
             default='./data/nuimages',
             help='specify the root path of dataset')),
        ('--version',
         dict(
             type=str,
             nargs='+',
             default=['v1.0-mini'],
             required=False,
             help='specify the dataset version')),
        ('--out-dir',
         dict(
             type=str,
             default='./data/nuimages/annotations/',
             required=False,
             help='path to save the exported json')),
        ('--nproc',
         dict(
             type=int,
             default=4,
             required=False,
             help='workers to process semantic masks')),
        ('--extra-tag', dict(type=str, default='nuimages')),
    )
    cli = argparse.ArgumentParser(description='Data converter arg parser')
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
+ + Args: + nuim (obj:`NuImages`): NuImages dataset object + img_info (dict): Meta information of img + + Returns: + np.ndarray: Semantic segmentation map of the image + """ + sd_token = img_info['token'] + image_id = img_info['id'] + name_to_index = name_to_index_mapping(nuim.category) + + # Get image data. + width, height = img_info['width'], img_info['height'] + semseg_mask = np.zeros((height, width)).astype('uint8') + + # Load stuff / surface regions. + surface_anns = [ + o for o in nuim.surface_ann if o['sample_data_token'] == sd_token + ] + + # Draw stuff / surface regions. + for ann in surface_anns: + # Get color and mask. + category_token = ann['category_token'] + category_name = nuim.get('category', category_token)['name'] + if ann['mask'] is None: + continue + mask = mask_decode(ann['mask']) + + # Draw mask for semantic segmentation. + semseg_mask[mask == 1] = name_to_index[category_name] + + # Load object instances. + object_anns = [ + o for o in nuim.object_ann if o['sample_data_token'] == sd_token + ] + + # Sort by token to ensure that objects always appear in the + # instance mask in the same order. + object_anns = sorted(object_anns, key=lambda k: k['token']) + + # Draw object instances. + # The 0 index is reserved for background; thus, the instances + # should start from index 1. + annotations = [] + for i, ann in enumerate(object_anns, start=1): + # Get color, box, mask and name. + category_token = ann['category_token'] + category_name = nuim.get('category', category_token)['name'] + if ann['mask'] is None: + continue + mask = mask_decode(ann['mask']) + + # Draw masks for semantic segmentation and instance segmentation. 
def export_nuim_to_coco(nuim, data_root, out_dir, extra_tag, version, nproc):
    """Export one nuImages version to a COCO-style annotation json.

    Key-frame images are collected from ``nuim.sample_data``, per-image
    annotations are produced by ``get_img_annos`` (which also writes the
    semantic masks below ``<out_dir>/semantic_masks``), and the combined
    result is dumped to ``<out_dir>/<extra_tag>_<version>.json``.

    Args:
        nuim (obj:`NuImages`): NuImages dataset object.
        data_root (str): Root path of the dataset.
        out_dir (str): Directory that receives the json and the masks.
        extra_tag (str): Prefix of the output json file name.
        version (str): Dataset version, used in the output file name.
        nproc (int): Number of workers; values above 1 use
            ``mmcv.track_parallel_progress``.
    """
    print('Process category information')
    # Category ids are the positions inside ``nus_categories``.
    categories = [
        dict(id=cat_id, name=cat_name)
        for cat_id, cat_name in enumerate(nus_categories)
    ]
    cat2id = {category['name']: category['id'] for category in categories}

    images = []
    print('Process image meta information...')
    for sample_info in mmcv.track_iter_progress(nuim.sample_data):
        if sample_info['is_key_frame']:
            # Image ids are consecutive over key frames only.
            images.append(
                dict(
                    id=len(images),
                    token=sample_info['token'],
                    file_name=sample_info['filename'],
                    width=sample_info['width'],
                    height=sample_info['height']))

    # Joined as a path so ``out_dir`` works with or without a trailing
    # separator.
    seg_root = osp.join(out_dir, 'semantic_masks')
    mmcv.mkdir_or_exist(seg_root)
    mmcv.mkdir_or_exist(osp.join(data_root, 'calibrated'))

    # NOTE(review): declared global presumably so the closure is reachable
    # by name from worker processes in the parallel branch - confirm.
    global process_img_anno

    def process_img_anno(img_info):
        return get_img_annos(nuim, img_info, cat2id, out_dir, data_root,
                             seg_root)

    print('Process img annotations...')
    if nproc > 1:
        outputs = mmcv.track_parallel_progress(
            process_img_anno, images, nproc=nproc)
    else:
        outputs = [
            process_img_anno(img_info)
            for img_info in mmcv.track_iter_progress(images)
        ]

    # Determine the index of object annotation
    print('Process annotation information...')
    annotations = []
    max_cls_ids = []
    for single_img_annos, max_cls_id in outputs:
        max_cls_ids.append(max_cls_id)
        for img_anno in single_img_annos:
            img_anno.update(id=len(annotations))
            annotations.append(img_anno)

    # ``default`` keeps a version without key frames from raising here.
    max_cls_id = max(max_cls_ids, default=0)
    print(f'Max ID of class in the semantic map: {max_cls_id}')

    coco_format_json = dict(
        images=images, annotations=annotations, categories=categories)

    mmcv.mkdir_or_exist(out_dir)
    out_file = osp.join(out_dir, f'{extra_tag}_{version}.json')
    mmcv.dump(coco_format_json, out_file)
    # Reported only after the dump has succeeded.
    print(f'Annotation dumped to {out_file}')
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import mmcv +import numpy as np +import os +from collections import OrderedDict +from nuscenes.nuscenes import NuScenes +from nuscenes.utils.geometry_utils import view_points +from os import path as osp +from pyquaternion import Quaternion +from shapely.geometry import MultiPoint, box +from typing import List, Tuple, Union + +from mmdet3d.core.bbox.box_np_ops import points_cam2img +from mmdet3d.datasets import NuScenesDataset + +nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + +nus_attributes = ('cycle.with_rider', 'cycle.without_rider', + 'pedestrian.moving', 'pedestrian.standing', + 'pedestrian.sitting_lying_down', 'vehicle.moving', + 'vehicle.parked', 'vehicle.stopped', 'None') + + +def create_nuscenes_infos(root_path, + out_path, + can_bus_root_path, + info_prefix, + version='v1.0-trainval', + max_sweeps=10): + """Create info file of nuscene dataset. + + Given the raw data, generate its related info file in pkl format. + + Args: + root_path (str): Path of the data root. + info_prefix (str): Prefix of the info file to be generated. + version (str): Version of the data. + Default: 'v1.0-trainval' + max_sweeps (int): Max number of sweeps. 
+ Default: 10 + """ + from nuscenes.nuscenes import NuScenes + from nuscenes.can_bus.can_bus_api import NuScenesCanBus + print(version, root_path) + nusc = NuScenes(version=version, dataroot=root_path, verbose=True) + nusc_can_bus = NuScenesCanBus(dataroot=can_bus_root_path) + from nuscenes.utils import splits + available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini'] + assert version in available_vers + if version == 'v1.0-trainval': + train_scenes = splits.train + val_scenes = splits.val + elif version == 'v1.0-test': + train_scenes = splits.test + val_scenes = [] + elif version == 'v1.0-mini': + train_scenes = splits.mini_train + val_scenes = splits.mini_val + else: + raise ValueError('unknown') + + # filter existing scenes. + available_scenes = get_available_scenes(nusc) + available_scene_names = [s['name'] for s in available_scenes] + train_scenes = list( + filter(lambda x: x in available_scene_names, train_scenes)) + val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes)) + train_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in train_scenes + ]) + val_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in val_scenes + ]) + + test = 'test' in version + if test: + print('test scene: {}'.format(len(train_scenes))) + else: + print('train scene: {}, val scene: {}'.format( + len(train_scenes), len(val_scenes))) + + train_nusc_infos, val_nusc_infos = _fill_trainval_infos( + nusc, nusc_can_bus, train_scenes, val_scenes, test, max_sweeps=max_sweeps) + + metadata = dict(version=version) + if test: + print('test sample: {}'.format(len(train_nusc_infos))) + data = dict(infos=train_nusc_infos, metadata=metadata) + info_path = osp.join(out_path, + '{}_infos_temporal_test.pkl'.format(info_prefix)) + mmcv.dump(data, info_path) + else: + print('train sample: {}, val sample: {}'.format( + len(train_nusc_infos), len(val_nusc_infos))) + data = dict(infos=train_nusc_infos, 
metadata=metadata) + info_path = osp.join(out_path, + '{}_infos_temporal_train.pkl'.format(info_prefix)) + mmcv.dump(data, info_path) + data['infos'] = val_nusc_infos + info_val_path = osp.join(out_path, + '{}_infos_temporal_val.pkl'.format(info_prefix)) + mmcv.dump(data, info_val_path) + + +def get_available_scenes(nusc): + """Get available scenes from the input nuscenes class. + + Given the raw data, get the information of available scenes for + further info generation. + + Args: + nusc (class): Dataset class in the nuScenes dataset. + + Returns: + available_scenes (list[dict]): List of basic information for the + available scenes. + """ + available_scenes = [] + print('total scene num: {}'.format(len(nusc.scene))) + for scene in nusc.scene: + scene_token = scene['token'] + scene_rec = nusc.get('scene', scene_token) + sample_rec = nusc.get('sample', scene_rec['first_sample_token']) + sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + has_more_frames = True + scene_not_exist = False + while has_more_frames: + lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token']) + lidar_path = str(lidar_path) + if os.getcwd() in lidar_path: + # path from lyftdataset is absolute path + lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1] + # relative path + if not mmcv.is_filepath(lidar_path): + scene_not_exist = True + break + else: + break + if scene_not_exist: + continue + available_scenes.append(scene) + print('exist scene num: {}'.format(len(available_scenes))) + return available_scenes + + +def _get_can_bus_info(nusc, nusc_can_bus, sample): + scene_name = nusc.get('scene', sample['scene_token'])['name'] + sample_timestamp = sample['timestamp'] + try: + pose_list = nusc_can_bus.get_messages(scene_name, 'pose') + except: + return np.zeros(18) # server scenes do not have can bus information. 
+ can_bus = [] + # during each scene, the first timestamp of can_bus may be large than the first sample's timestamp + last_pose = pose_list[0] + for i, pose in enumerate(pose_list): + if pose['utime'] > sample_timestamp: + break + last_pose = pose + _ = last_pose.pop('utime') # useless + pos = last_pose.pop('pos') + rotation = last_pose.pop('orientation') + can_bus.extend(pos) + can_bus.extend(rotation) + for key in last_pose.keys(): + can_bus.extend(pose[key]) # 16 elements + can_bus.extend([0., 0.]) + return np.array(can_bus) + + +def _fill_trainval_infos(nusc, + nusc_can_bus, + train_scenes, + val_scenes, + test=False, + max_sweeps=10): + """Generate the train/val infos from the raw data. + + Args: + nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset. + train_scenes (list[str]): Basic information of training scenes. + val_scenes (list[str]): Basic information of validation scenes. + test (bool): Whether use the test mode. In the test mode, no + annotations can be accessed. Default: False. + max_sweeps (int): Max number of sweeps. Default: 10. + + Returns: + tuple[list[dict]]: Information of training set and validation set + that will be saved to the info file. 
+ """ + train_nusc_infos = [] + val_nusc_infos = [] + frame_idx = 0 + for sample in mmcv.track_iter_progress(nusc.sample): + map_location = nusc.get('log', nusc.get('scene', sample['scene_token'])['log_token'])['location'] + + lidar_token = sample['data']['LIDAR_TOP'] + sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + lidar_path, boxes, _ = nusc.get_sample_data(lidar_token) + + mmcv.check_file_exist(lidar_path) + can_bus = _get_can_bus_info(nusc, nusc_can_bus, sample) + ## + info = { + 'lidar_path': lidar_path, + 'token': sample['token'], + 'prev': sample['prev'], + 'next': sample['next'], + 'can_bus': can_bus, + 'frame_idx': frame_idx, # temporal related info + 'sweeps': [], + 'cams': dict(), + 'map_location': map_location, + 'scene_token': sample['scene_token'], # temporal related info + 'lidar2ego_translation': cs_record['translation'], + 'lidar2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sample['timestamp'], + } + + if sample['next'] == '': + frame_idx = 0 + else: + frame_idx += 1 + + l2e_r = info['lidar2ego_rotation'] + l2e_t = info['lidar2ego_translation'] + e2g_r = info['ego2global_rotation'] + e2g_t = info['ego2global_translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + + # obtain 6 image's information per frame + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + for cam in camera_types: + cam_token = sample['data'][cam] + cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token) + cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat, + e2g_t, e2g_r_mat, cam) + cam_info.update(cam_intrinsic=cam_intrinsic) + info['cams'].update({cam: 
cam_info}) + + # obtain sweeps for a single key-frame + sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + sweeps = [] + while len(sweeps) < max_sweeps: + if not sd_rec['prev'] == '': + sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t, + l2e_r_mat, e2g_t, e2g_r_mat, 'lidar') + sweeps.append(sweep) + sd_rec = nusc.get('sample_data', sd_rec['prev']) + else: + break + info['sweeps'] = sweeps + # obtain annotation + if not test: + annotations = [ + nusc.get('sample_annotation', token) + for token in sample['anns'] + ] + locs = np.array([b.center for b in boxes]).reshape(-1, 3) + dims = np.array([b.wlh for b in boxes]).reshape(-1, 3) + rots = np.array([b.orientation.yaw_pitch_roll[0] + for b in boxes]).reshape(-1, 1) + velocity = np.array( + [nusc.box_velocity(token)[:2] for token in sample['anns']]) + valid_flag = np.array( + [(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0 + for anno in annotations], + dtype=bool).reshape(-1) + # convert velo from global to lidar + for i in range(len(boxes)): + velo = np.array([*velocity[i], 0.0]) + velo = velo @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv( + l2e_r_mat).T + velocity[i] = velo[:2] + + names = [b.name for b in boxes] + for i in range(len(names)): + if names[i] in NuScenesDataset.NameMapping: + names[i] = NuScenesDataset.NameMapping[names[i]] + names = np.array(names) + # we need to convert rot to SECOND format. 
def obtain_sensor2top(nusc,
                      sensor_token,
                      l2e_t,
                      l2e_r_mat,
                      e2g_t,
                      e2g_r_mat,
                      sensor_type='lidar'):
    """Collect a sensor record together with its transform to the Top LiDAR.

    Args:
        nusc (class): Dataset class in the nuScenes dataset.
        sensor_token (str): Sample data token corresponding to the
            specific sensor type.
        l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
        l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
            in shape (3, 3).
        e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
        e2g_r_mat (np.ndarray): Rotation matrix from ego to global
            in shape (3, 3).
        sensor_type (str): Sensor to calibrate. Default: 'lidar'.

    Returns:
        sweep (dict): Sweep information after transformation, including
        ``'sensor2lidar_rotation'`` and ``'sensor2lidar_translation'``.
    """
    record = nusc.get('sample_data', sensor_token)
    calib = nusc.get('calibrated_sensor', record['calibrated_sensor_token'])
    ego_pose = nusc.get('ego_pose', record['ego_pose_token'])

    data_path = str(nusc.get_sample_data_path(record['token']))
    if os.getcwd() in data_path:  # path from lyftdataset is absolute path
        data_path = data_path.split(f'{os.getcwd()}/')[-1]  # relative path

    sweep = {
        'data_path': data_path,
        'type': sensor_type,
        'sample_data_token': record['token'],
        'sensor2ego_translation': calib['translation'],
        'sensor2ego_rotation': calib['rotation'],
        'ego2global_translation': ego_pose['translation'],
        'ego2global_rotation': ego_pose['rotation'],
        'timestamp': record['timestamp']
    }

    sensor2ego_t = sweep['sensor2ego_translation']
    ego2global_t = sweep['ego2global_translation']
    sensor2ego_rot = Quaternion(sweep['sensor2ego_rotation']).rotation_matrix
    ego2global_rot = Quaternion(sweep['ego2global_rotation']).rotation_matrix

    # obtain the RT from sensor to Top LiDAR
    # sweep->ego->global->ego'->lidar
    # The two inverse products below are shared by R and T.
    ego2lidar = np.linalg.inv(l2e_r_mat).T
    global2lidar = np.linalg.inv(e2g_r_mat).T @ ego2lidar

    R = (sensor2ego_rot.T @ ego2global_rot.T) @ global2lidar
    T = (sensor2ego_t @ ego2global_rot.T + ego2global_t) @ global2lidar
    T -= e2g_t @ global2lidar + l2e_t @ ego2lidar

    sweep['sensor2lidar_rotation'] = R.T  # points @ R.T + T
    sweep['sensor2lidar_translation'] = T
    return sweep
+ """ + # get bbox annotations for camera + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + nusc_infos = mmcv.load(info_path)['infos'] + nusc = NuScenes(version=version, dataroot=root_path, verbose=True) + # info_2d_list = [] + cat2Ids = [ + dict(id=nus_categories.index(cat_name), name=cat_name) + for cat_name in nus_categories + ] + coco_ann_id = 0 + coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids) + for info in mmcv.track_iter_progress(nusc_infos): + for cam in camera_types: + cam_info = info['cams'][cam] + coco_infos = get_2d_boxes( + nusc, + cam_info['sample_data_token'], + visibilities=['', '1', '2', '3', '4'], + mono3d=mono3d) + (height, width, _) = mmcv.imread(cam_info['data_path']).shape + coco_2d_dict['images'].append( + dict( + file_name=cam_info['data_path'].split('data/nuscenes/') + [-1], + id=cam_info['sample_data_token'], + token=info['token'], + cam2ego_rotation=cam_info['sensor2ego_rotation'], + cam2ego_translation=cam_info['sensor2ego_translation'], + ego2global_rotation=info['ego2global_rotation'], + ego2global_translation=info['ego2global_translation'], + cam_intrinsic=cam_info['cam_intrinsic'], + width=width, + height=height)) + for coco_info in coco_infos: + if coco_info is None: + continue + # add an empty key for coco format + coco_info['segmentation'] = [] + coco_info['id'] = coco_ann_id + coco_2d_dict['annotations'].append(coco_info) + coco_ann_id += 1 + if mono3d: + json_prefix = f'{info_path[:-4]}_mono3d' + else: + json_prefix = f'{info_path[:-4]}' + mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json') + + +def get_2d_boxes(nusc, + sample_data_token: str, + visibilities: List[str], + mono3d=True): + """Get the 2D annotation records for a given `sample_data_token`. + + Args: + sample_data_token (str): Sample data token belonging to a camera \ + keyframe. + visibilities (list[str]): Visibility filter. 
+ mono3d (bool): Whether to get boxes with mono3d annotation. + + Return: + list[dict]: List of 2D annotation record that belongs to the input + `sample_data_token`. + """ + + # Get the sample data and the sample corresponding to that sample data. + sd_rec = nusc.get('sample_data', sample_data_token) + + assert sd_rec[ + 'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \ + ' for camera sample_data!' + if not sd_rec['is_key_frame']: + raise ValueError( + 'The 2D re-projections are available only for keyframes.') + + s_rec = nusc.get('sample', sd_rec['sample_token']) + + # Get the calibrated sensor and ego pose + # record to get the transformation matrices. + cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) + pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token']) + camera_intrinsic = np.array(cs_rec['camera_intrinsic']) + + # Get all the annotation with the specified visibilties. + ann_recs = [ + nusc.get('sample_annotation', token) for token in s_rec['anns'] + ] + ann_recs = [ + ann_rec for ann_rec in ann_recs + if (ann_rec['visibility_token'] in visibilities) + ] + + repro_recs = [] + + for ann_rec in ann_recs: + # Augment sample_annotation with token information. + ann_rec['sample_annotation_token'] = ann_rec['token'] + ann_rec['sample_data_token'] = sample_data_token + + # Get the box in global coordinates. + box = nusc.get_box(ann_rec['token']) + + # Move them to the ego-pose frame. + box.translate(-np.array(pose_rec['translation'])) + box.rotate(Quaternion(pose_rec['rotation']).inverse) + + # Move them to the calibrated sensor frame. + box.translate(-np.array(cs_rec['translation'])) + box.rotate(Quaternion(cs_rec['rotation']).inverse) + + # Filter out the corners that are not in front of the calibrated + # sensor. + corners_3d = box.corners() + in_front = np.argwhere(corners_3d[2, :] > 0).flatten() + corners_3d = corners_3d[:, in_front] + + # Project 3d box to 2d. 
def post_process_coords(
    corner_coords: List, imsize: Tuple[int, int] = (1600, 900)
) -> Union[Tuple[float, float, float, float], None]:
    """Get the intersection of the convex hull of the reprojected bbox corners
    and the image canvas, return None if no intersection.

    Args:
        corner_coords (list[int]): Corner coordinates of reprojected
            bounding box.
        imsize (tuple[int]): Size of the image canvas.

    Return:
        tuple [float]: ``(min_x, min_y, max_x, max_y)`` of the intersection
            of the convex hull of the 2D box corners and the image canvas,
            or None when they do not intersect or the intersection is
            degenerate (a single point or a line segment).
    """
    polygon_from_2d_box = MultiPoint(corner_coords).convex_hull
    img_canvas = box(0, 0, imsize[0], imsize[1])

    if not polygon_from_2d_box.intersects(img_canvas):
        return None

    img_intersection = polygon_from_2d_box.intersection(img_canvas)
    # With only one or two projected corners, or when the hull merely touches
    # the canvas border, the intersection is a Point / LineString. Those have
    # no ``exterior`` and describe a zero-area box, so treat them as a miss
    # instead of raising AttributeError.
    if img_intersection.geom_type != 'Polygon':
        return None

    intersection_coords = np.array(
        [coord for coord in img_intersection.exterior.coords])

    min_x = min(intersection_coords[:, 0])
    min_y = min(intersection_coords[:, 1])
    max_x = max(intersection_coords[:, 0])
    max_y = max(intersection_coords[:, 1])

    return min_x, min_y, max_x, max_y
class S3DISData(object):
    """S3DIS data.

    Generate s3dis infos for s3dis_converter.

    Args:
        root_path (str): Root path of the raw data.
        split (str): Set split type of the data. Default: 'Area_1'.
    """

    def __init__(self, root_path, split='Area_1'):
        self.root_dir = root_path
        self.split = split
        self.data_dir = osp.join(root_path,
                                 'Stanford3dDataset_v1.2_Aligned_Version')

        # Following GSDN, use 5 furniture
        # classes for detection: table, chair, sofa, bookcase, board.
        self.cat_ids = np.array([7, 8, 9, 10, 11])
        self.cat_ids2class = {
            cat_id: i
            for i, cat_id in enumerate(list(self.cat_ids))
        }

        assert split in [
            'Area_1', 'Area_2', 'Area_3', 'Area_4', 'Area_5', 'Area_6'
        ]
        # Every non-file entry of the area directory is a sample
        # (e.g. conferenceRoom_1). Build a filtered list instead of removing
        # entries from the list being iterated, which skipped the entry
        # following each removed file.
        split_dir = osp.join(self.data_dir, split)
        self.sample_id_list = [
            name for name in os.listdir(split_dir)
            if not osp.isfile(osp.join(split_dir, name))
        ]

    def __len__(self):
        """Return the number of samples found for the split."""
        return len(self.sample_id_list)

    def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
        """Get data infos.

        This method gets information from the raw data. For every sample it
        re-saves points and masks as ``.bin`` files below ``root_dir`` and
        records their relative paths together with the box annotations.

        Args:
            num_workers (int): Number of threads to be used. Default: 4.
            has_label (bool): Whether the data has label. Default: True.
                Currently not read by this method.
            sample_id_list (list[int]): Index list of the sample.
                Default: None.

        Returns:
            infos (list[dict]): Information of the raw data.
        """

        def process_single_scene(sample_idx):
            print(f'{self.split} sample_idx: {sample_idx}')
            info = dict()
            pc_info = {
                'num_features': 6,
                'lidar_idx': f'{self.split}_{sample_idx}'
            }
            info['point_cloud'] = pc_info
            pts_filename = osp.join(self.root_dir, 's3dis_data',
                                    f'{self.split}_{sample_idx}_point.npy')
            pts_instance_mask_path = osp.join(
                self.root_dir, 's3dis_data',
                f'{self.split}_{sample_idx}_ins_label.npy')
            pts_semantic_mask_path = osp.join(
                self.root_dir, 's3dis_data',
                f'{self.split}_{sample_idx}_sem_label.npy')

            points = np.load(pts_filename).astype(np.float32)
            # ``np.int`` was removed in NumPy 1.24; use a fixed-width type so
            # the dumped .bin files have a platform-independent layout.
            pts_instance_mask = np.load(pts_instance_mask_path).astype(
                np.int64)
            pts_semantic_mask = np.load(pts_semantic_mask_path).astype(
                np.int64)

            mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points'))
            mmcv.mkdir_or_exist(osp.join(self.root_dir, 'instance_mask'))
            mmcv.mkdir_or_exist(osp.join(self.root_dir, 'semantic_mask'))

            points.tofile(
                osp.join(self.root_dir, 'points',
                         f'{self.split}_{sample_idx}.bin'))
            pts_instance_mask.tofile(
                osp.join(self.root_dir, 'instance_mask',
                         f'{self.split}_{sample_idx}.bin'))
            pts_semantic_mask.tofile(
                osp.join(self.root_dir, 'semantic_mask',
                         f'{self.split}_{sample_idx}.bin'))

            info['pts_path'] = osp.join('points',
                                        f'{self.split}_{sample_idx}.bin')
            info['pts_instance_mask_path'] = osp.join(
                'instance_mask', f'{self.split}_{sample_idx}.bin')
            info['pts_semantic_mask_path'] = osp.join(
                'semantic_mask', f'{self.split}_{sample_idx}.bin')
            info['annos'] = self.get_bboxes(points, pts_instance_mask,
                                            pts_semantic_mask)

            return info

        sample_id_list = sample_id_list if sample_id_list is not None \
            else self.sample_id_list
        with futures.ThreadPoolExecutor(num_workers) as executor:
            infos = executor.map(process_single_scene, sample_id_list)
        return list(infos)

    def get_bboxes(self, points, pts_instance_mask, pts_semantic_mask):
        """Convert instance masks to axis-aligned bounding boxes.

        Instance ids are visited from 1 up to and including the largest id in
        the mask; id 0 is never visited. Ids that own no point are skipped.

        Args:
            points (np.array): Scene points of shape (n, 6).
            pts_instance_mask (np.ndarray): Instance labels of shape (n,).
            pts_semantic_mask (np.ndarray): Semantic labels of shape (n,).

        Returns:
            dict: A dict containing detection infos with following keys:

                - gt_boxes_upright_depth (np.ndarray): Bounding boxes
                    of shape (n, 6)
                - class (np.ndarray): Box labels of shape (n,)
                - gt_num (int): Number of boxes.
        """
        bboxes, labels = [], []
        # ``+ 1`` so that the instance with the highest id is not dropped.
        for i in range(1, pts_instance_mask.max() + 1):
            ids = pts_instance_mask == i
            if not ids.any():
                # unused instance id: nothing to box
                continue
            instance_sem = pts_semantic_mask[ids]
            # check if all instance points have same semantic label
            assert instance_sem.min() == instance_sem.max()
            label = instance_sem[0]
            # keep only furniture objects
            if label in self.cat_ids2class:
                labels.append(self.cat_ids2class[label])
                pts = points[:, :3][ids]
                min_pts = pts.min(axis=0)
                max_pts = pts.max(axis=0)
                locations = (min_pts + max_pts) / 2
                dimensions = max_pts - min_pts
                bboxes.append(np.concatenate((locations, dimensions)))
        annotation = dict()
        # follow ScanNet and SUN RGB-D keys
        annotation['gt_boxes_upright_depth'] = np.array(bboxes)
        annotation['class'] = np.array(labels)
        annotation['gt_num'] = len(labels)
        return annotation
+ """ + + def __init__(self, + data_root, + ann_file, + split='Area_1', + num_points=4096, + label_weight_func=None): + self.data_root = data_root + self.data_infos = mmcv.load(ann_file) + self.split = split + self.num_points = num_points + + self.all_ids = np.arange(13) # all possible ids + self.cat_ids = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12]) # used for seg task + self.ignore_index = len(self.cat_ids) + + self.cat_id2class = np.ones((self.all_ids.shape[0],), dtype=np.int) * \ + self.ignore_index + for i, cat_id in enumerate(self.cat_ids): + self.cat_id2class[cat_id] = i + + # label weighting function is taken from + # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24 + self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \ + label_weight_func is None else label_weight_func + + def get_seg_infos(self): + scene_idxs, label_weight = self.get_scene_idxs_and_label_weight() + save_folder = osp.join(self.data_root, 'seg_info') + mmcv.mkdir_or_exist(save_folder) + np.save( + osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'), + scene_idxs) + np.save( + osp.join(save_folder, f'{self.split}_label_weight.npy'), + label_weight) + print(f'{self.split} resampled scene index and label weight saved') + + def _convert_to_label(self, mask): + """Convert class_id in loaded segmentation mask to label.""" + if isinstance(mask, str): + if mask.endswith('npy'): + mask = np.load(mask) + else: + mask = np.fromfile(mask, dtype=np.long) + label = self.cat_id2class[mask] + return label + + def get_scene_idxs_and_label_weight(self): + """Compute scene_idxs for data sampling and label weight for loss \ + calculation. + + We sample more times for scenes with more points. Label_weight is + inversely proportional to number of class points. 
+ """ + num_classes = len(self.cat_ids) + num_point_all = [] + label_weight = np.zeros((num_classes + 1, )) # ignore_index + for data_info in self.data_infos: + label = self._convert_to_label( + osp.join(self.data_root, data_info['pts_semantic_mask_path'])) + num_point_all.append(label.shape[0]) + class_count, _ = np.histogram(label, range(num_classes + 2)) + label_weight += class_count + + # repeat scene_idx for num_scene_point // num_sample_point times + sample_prob = np.array(num_point_all) / float(np.sum(num_point_all)) + num_iter = int(np.sum(num_point_all) / float(self.num_points)) + scene_idxs = [] + for idx in range(len(self.data_infos)): + scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter))) + scene_idxs = np.array(scene_idxs).astype(np.int32) + + # calculate label weight, adopted from PointNet++ + label_weight = label_weight[:-1].astype(np.float32) + label_weight = label_weight / label_weight.sum() + label_weight = self.label_weight_func(label_weight).astype(np.float32) + + return scene_idxs, label_weight diff --git a/model_examples/MapTR/tools/data_converter/scannet_data_utils.py b/model_examples/MapTR/tools/data_converter/scannet_data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a437fe01ce6857d016001589203c3da2e802d6aa --- /dev/null +++ b/model_examples/MapTR/tools/data_converter/scannet_data_utils.py @@ -0,0 +1,293 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +import os +from concurrent import futures as futures +from os import path as osp + + +class ScanNetData(object): + """ScanNet data. + + Generate scannet infos for scannet_converter. + + Args: + root_path (str): Root path of the raw data. + split (str): Set split type of the data. Default: 'train'. 
+ """ + + def __init__(self, root_path, split='train'): + self.root_dir = root_path + self.split = split + self.split_dir = osp.join(root_path) + self.classes = [ + 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', + 'bookshelf', 'picture', 'counter', 'desk', 'curtain', + 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', + 'garbagebin' + ] + self.cat2label = {cat: self.classes.index(cat) for cat in self.classes} + self.label2cat = {self.cat2label[t]: t for t in self.cat2label} + self.cat_ids = np.array( + [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39]) + self.cat_ids2class = { + nyu40id: i + for i, nyu40id in enumerate(list(self.cat_ids)) + } + assert split in ['train', 'val', 'test'] + split_file = osp.join(self.root_dir, 'meta_data', + f'scannetv2_{split}.txt') + mmcv.check_file_exist(split_file) + self.sample_id_list = mmcv.list_from_file(split_file) + self.test_mode = (split == 'test') + + def __len__(self): + return len(self.sample_id_list) + + def get_aligned_box_label(self, idx): + box_file = osp.join(self.root_dir, 'scannet_instance_data', + f'{idx}_aligned_bbox.npy') + mmcv.check_file_exist(box_file) + return np.load(box_file) + + def get_unaligned_box_label(self, idx): + box_file = osp.join(self.root_dir, 'scannet_instance_data', + f'{idx}_unaligned_bbox.npy') + mmcv.check_file_exist(box_file) + return np.load(box_file) + + def get_axis_align_matrix(self, idx): + matrix_file = osp.join(self.root_dir, 'scannet_instance_data', + f'{idx}_axis_align_matrix.npy') + mmcv.check_file_exist(matrix_file) + return np.load(matrix_file) + + def get_images(self, idx): + paths = [] + path = osp.join(self.root_dir, 'posed_images', idx) + for file in sorted(os.listdir(path)): + if file.endswith('.jpg'): + paths.append(osp.join('posed_images', idx, file)) + return paths + + def get_extrinsics(self, idx): + extrinsics = [] + path = osp.join(self.root_dir, 'posed_images', idx) + for file in sorted(os.listdir(path)): + if 
file.endswith('.txt') and not file == 'intrinsic.txt': + extrinsics.append(np.loadtxt(osp.join(path, file))) + return extrinsics + + def get_intrinsics(self, idx): + matrix_file = osp.join(self.root_dir, 'posed_images', idx, + 'intrinsic.txt') + mmcv.check_file_exist(matrix_file) + return np.loadtxt(matrix_file) + + def get_infos(self, num_workers=4, has_label=True, sample_id_list=None): + """Get data infos. + + This method gets information from the raw data. + + Args: + num_workers (int): Number of threads to be used. Default: 4. + has_label (bool): Whether the data has label. Default: True. + sample_id_list (list[int]): Index list of the sample. + Default: None. + + Returns: + infos (list[dict]): Information of the raw data. + """ + + def process_single_scene(sample_idx): + print(f'{self.split} sample_idx: {sample_idx}') + info = dict() + pc_info = {'num_features': 6, 'lidar_idx': sample_idx} + info['point_cloud'] = pc_info + pts_filename = osp.join(self.root_dir, 'scannet_instance_data', + f'{sample_idx}_vert.npy') + points = np.load(pts_filename) + mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points')) + points.tofile( + osp.join(self.root_dir, 'points', f'{sample_idx}.bin')) + info['pts_path'] = osp.join('points', f'{sample_idx}.bin') + + # update with RGB image paths if exist + if os.path.exists(osp.join(self.root_dir, 'posed_images')): + info['intrinsics'] = self.get_intrinsics(sample_idx) + all_extrinsics = self.get_extrinsics(sample_idx) + all_img_paths = self.get_images(sample_idx) + # some poses in ScanNet are invalid + extrinsics, img_paths = [], [] + for extrinsic, img_path in zip(all_extrinsics, all_img_paths): + if np.all(np.isfinite(extrinsic)): + img_paths.append(img_path) + extrinsics.append(extrinsic) + info['extrinsics'] = extrinsics + info['img_paths'] = img_paths + + if not self.test_mode: + pts_instance_mask_path = osp.join( + self.root_dir, 'scannet_instance_data', + f'{sample_idx}_ins_label.npy') + pts_semantic_mask_path = osp.join( + 
self.root_dir, 'scannet_instance_data', + f'{sample_idx}_sem_label.npy') + + pts_instance_mask = np.load(pts_instance_mask_path).astype( + np.long) + pts_semantic_mask = np.load(pts_semantic_mask_path).astype( + np.long) + + mmcv.mkdir_or_exist(osp.join(self.root_dir, 'instance_mask')) + mmcv.mkdir_or_exist(osp.join(self.root_dir, 'semantic_mask')) + + pts_instance_mask.tofile( + osp.join(self.root_dir, 'instance_mask', + f'{sample_idx}.bin')) + pts_semantic_mask.tofile( + osp.join(self.root_dir, 'semantic_mask', + f'{sample_idx}.bin')) + + info['pts_instance_mask_path'] = osp.join( + 'instance_mask', f'{sample_idx}.bin') + info['pts_semantic_mask_path'] = osp.join( + 'semantic_mask', f'{sample_idx}.bin') + + if has_label: + annotations = {} + # box is of shape [k, 6 + class] + aligned_box_label = self.get_aligned_box_label(sample_idx) + unaligned_box_label = self.get_unaligned_box_label(sample_idx) + annotations['gt_num'] = aligned_box_label.shape[0] + if annotations['gt_num'] != 0: + aligned_box = aligned_box_label[:, :-1] # k, 6 + unaligned_box = unaligned_box_label[:, :-1] + classes = aligned_box_label[:, -1] # k + annotations['name'] = np.array([ + self.label2cat[self.cat_ids2class[classes[i]]] + for i in range(annotations['gt_num']) + ]) + # default names are given to aligned bbox for compatibility + # we also save unaligned bbox info with marked names + annotations['location'] = aligned_box[:, :3] + annotations['dimensions'] = aligned_box[:, 3:6] + annotations['gt_boxes_upright_depth'] = aligned_box + annotations['unaligned_location'] = unaligned_box[:, :3] + annotations['unaligned_dimensions'] = unaligned_box[:, 3:6] + annotations[ + 'unaligned_gt_boxes_upright_depth'] = unaligned_box + annotations['index'] = np.arange( + annotations['gt_num'], dtype=np.int32) + annotations['class'] = np.array([ + self.cat_ids2class[classes[i]] + for i in range(annotations['gt_num']) + ]) + axis_align_matrix = self.get_axis_align_matrix(sample_idx) + 
class ScanNetSegData(object):
    """ScanNet dataset used to generate infos for semantic segmentation task.

    Args:
        data_root (str): Root path of the raw data.
        ann_file (str): The generated scannet infos.
        split (str): Set split type of the data. Default: 'train'.
        num_points (int): Number of points in each data input. Default: 8192.
        label_weight_func (function): Function to compute the label weight.
            Default: None.
    """

    def __init__(self,
                 data_root,
                 ann_file,
                 split='train',
                 num_points=8192,
                 label_weight_func=None):
        self.data_root = data_root
        self.data_infos = mmcv.load(ann_file)
        self.split = split
        assert split in ['train', 'val', 'test']
        self.num_points = num_points

        self.all_ids = np.arange(41)  # all possible ids
        self.cat_ids = np.array([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36,
            39
        ])  # used for seg task
        self.ignore_index = len(self.cat_ids)

        # Lookup table raw id -> train label; ids outside ``cat_ids`` map to
        # ``ignore_index``. ``np.int`` no longer exists in NumPy >= 1.24.
        self.cat_id2class = np.ones(
            (self.all_ids.shape[0], ), dtype=np.int64) * self.ignore_index
        for i, cat_id in enumerate(self.cat_ids):
            self.cat_id2class[cat_id] = i

        # label weighting function is taken from
        # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
        self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \
            label_weight_func is None else label_weight_func

    def get_seg_infos(self):
        """Compute and save resampled scene indices and label weights.

        Nothing is done for the 'test' split. Otherwise two ``.npy`` files
        are written to ``<data_root>/seg_info``.
        """
        if self.split == 'test':
            return
        scene_idxs, label_weight = self.get_scene_idxs_and_label_weight()
        save_folder = osp.join(self.data_root, 'seg_info')
        mmcv.mkdir_or_exist(save_folder)
        np.save(
            osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'),
            scene_idxs)
        np.save(
            osp.join(save_folder, f'{self.split}_label_weight.npy'),
            label_weight)
        print(f'{self.split} resampled scene index and label weight saved')

    def _convert_to_label(self, mask):
        """Convert class_id in loaded segmentation mask to label.

        Args:
            mask (str | np.ndarray): Integer id array, or a path to one.
                A path ending in 'npy' is read with ``np.load``; any other
                path is read as a raw int64 binary file.

        Returns:
            np.ndarray: ``cat_id2class`` looked up at every id of the mask.
        """
        if isinstance(mask, str):
            if mask.endswith('npy'):
                mask = np.load(mask)
            else:
                # Fixed-width dtype: ``np.long`` is missing from NumPy
                # 1.24-1.26 and its width is platform dependent elsewhere.
                mask = np.fromfile(mask, dtype=np.int64)
        label = self.cat_id2class[mask]
        return label

    def get_scene_idxs_and_label_weight(self):
        """Compute scene_idxs for data sampling and label weight for loss \
        calculation.

        We sample more times for scenes with more points. Label_weight is
        inversely proportional to number of class points.

        Returns:
            tuple[np.ndarray, np.ndarray]: int32 scene indices (each scene
                repeated according to its share of points) and float32
                per-class label weights (ignore class excluded).
        """
        num_classes = len(self.cat_ids)
        num_point_all = []
        label_weight = np.zeros((num_classes + 1, ))  # ignore_index
        for data_info in self.data_infos:
            label = self._convert_to_label(
                osp.join(self.data_root, data_info['pts_semantic_mask_path']))
            num_point_all.append(label.shape[0])
            # one histogram bin per class plus a last bin for ignore_index
            class_count, _ = np.histogram(label, range(num_classes + 2))
            label_weight += class_count

        # repeat scene_idx for num_scene_point // num_sample_point times
        sample_prob = np.array(num_point_all) / float(np.sum(num_point_all))
        num_iter = int(np.sum(num_point_all) / float(self.num_points))
        scene_idxs = []
        for idx in range(len(self.data_infos)):
            scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter)))
        scene_idxs = np.array(scene_idxs).astype(np.int32)

        # calculate label weight, adopted from PointNet++
        label_weight = label_weight[:-1].astype(np.float32)
        label_weight = label_weight / label_weight.sum()
        label_weight = self.label_weight_func(label_weight).astype(np.float32)

        return scene_idxs, label_weight
--- /dev/null +++ b/model_examples/MapTR/tools/data_converter/sunrgbd_data_utils.py @@ -0,0 +1,221 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +from concurrent import futures as futures +from os import path as osp +from scipy import io as sio + + +def random_sampling(points, num_points, replace=None, return_choices=False): + """Random sampling. + + Sampling point cloud to a certain number of points. + + Args: + points (ndarray): Point cloud. + num_points (int): The number of samples. + replace (bool): Whether the sample is with or without replacement. + return_choices (bool): Whether to return choices. + + Returns: + points (ndarray): Point cloud after sampling. + """ + + if replace is None: + replace = (points.shape[0] < num_points) + choices = np.random.choice(points.shape[0], num_points, replace=replace) + if return_choices: + return points[choices], choices + else: + return points[choices] + + +class SUNRGBDInstance(object): + + def __init__(self, line): + data = line.split(' ') + data[1:] = [float(x) for x in data[1:]] + self.classname = data[0] + self.xmin = data[1] + self.ymin = data[2] + self.xmax = data[1] + data[3] + self.ymax = data[2] + data[4] + self.box2d = np.array([self.xmin, self.ymin, self.xmax, self.ymax]) + self.centroid = np.array([data[5], data[6], data[7]]) + self.w = data[8] + self.l = data[9] # noqa: E741 + self.h = data[10] + self.orientation = np.zeros((3, )) + self.orientation[0] = data[11] + self.orientation[1] = data[12] + self.heading_angle = -1 * np.arctan2(self.orientation[1], + self.orientation[0]) + self.box3d = np.concatenate([ + self.centroid, + np.array([self.l * 2, self.w * 2, self.h * 2, self.heading_angle]) + ]) + + +class SUNRGBDData(object): + """SUNRGBD data. + + Generate scannet infos for sunrgbd_converter. + + Args: + root_path (str): Root path of the raw data. + split (str): Set split type of the data. Default: 'train'. + use_v1 (bool): Whether to use v1. Default: False. 
+ """ + + def __init__(self, root_path, split='train', use_v1=False): + self.root_dir = root_path + self.split = split + self.split_dir = osp.join(root_path, 'sunrgbd_trainval') + self.classes = [ + 'bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', + 'night_stand', 'bookshelf', 'bathtub' + ] + self.cat2label = {cat: self.classes.index(cat) for cat in self.classes} + self.label2cat = { + label: self.classes[label] + for label in range(len(self.classes)) + } + assert split in ['train', 'val', 'test'] + split_file = osp.join(self.split_dir, f'{split}_data_idx.txt') + mmcv.check_file_exist(split_file) + self.sample_id_list = map(int, mmcv.list_from_file(split_file)) + self.image_dir = osp.join(self.split_dir, 'image') + self.calib_dir = osp.join(self.split_dir, 'calib') + self.depth_dir = osp.join(self.split_dir, 'depth') + if use_v1: + self.label_dir = osp.join(self.split_dir, 'label_v1') + else: + self.label_dir = osp.join(self.split_dir, 'label') + + def __len__(self): + return len(self.sample_id_list) + + def get_image(self, idx): + img_filename = osp.join(self.image_dir, f'{idx:06d}.jpg') + return mmcv.imread(img_filename) + + def get_image_shape(self, idx): + image = self.get_image(idx) + return np.array(image.shape[:2], dtype=np.int32) + + def get_depth(self, idx): + depth_filename = osp.join(self.depth_dir, f'{idx:06d}.mat') + depth = sio.loadmat(depth_filename)['instance'] + return depth + + def get_calibration(self, idx): + calib_filepath = osp.join(self.calib_dir, f'{idx:06d}.txt') + lines = [line.rstrip() for line in open(calib_filepath)] + Rt = np.array([float(x) for x in lines[0].split(' ')]) + Rt = np.reshape(Rt, (3, 3), order='F').astype(np.float32) + K = np.array([float(x) for x in lines[1].split(' ')]) + K = np.reshape(K, (3, 3), order='F').astype(np.float32) + return K, Rt + + def get_label_objects(self, idx): + label_filename = osp.join(self.label_dir, f'{idx:06d}.txt') + lines = [line.rstrip() for line in open(label_filename)] + objects 
def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
    """Get data infos.

    This method gets information from the raw data. As a side effect, the
    subsampled depth points of every processed sample are written to
    ``<root_dir>/points/<sample_idx>.bin``.

    Args:
        num_workers (int): Number of threads to be used. Default: 4.
        has_label (bool): Whether the data has label. Default: True.
        sample_id_list (list[int]): Index list of the sample.
            Default: None, in which case ``self.sample_id_list`` is used.

    Returns:
        infos (list[dict]): Information of the raw data.
    """
    num_points = 50000

    def collect_annotations(sample_idx):
        # Only objects whose class name is a key of ``self.cat2label``
        # contribute to the annotation arrays.
        parsed = self.get_label_objects(sample_idx)
        kept = [obj for obj in parsed if obj.classname in self.cat2label]
        annos = {'gt_num': len(kept)}
        if not kept:
            return annos
        annos['name'] = np.array([obj.classname for obj in kept])
        annos['bbox'] = np.concatenate(
            [obj.box2d.reshape(1, 4) for obj in kept], axis=0)
        annos['location'] = np.concatenate(
            [obj.centroid.reshape(1, 3) for obj in kept], axis=0)
        # lwh (depth) format
        annos['dimensions'] = 2 * np.array(
            [[obj.l, obj.w, obj.h] for obj in kept])
        annos['rotation_y'] = np.array(
            [obj.heading_angle for obj in kept])
        # Sized by every parsed object, not only the kept ones.
        annos['index'] = np.arange(len(parsed), dtype=np.int32)
        annos['class'] = np.array(
            [self.cat2label[obj.classname] for obj in kept])
        annos['gt_boxes_upright_depth'] = np.stack(
            [obj.box3d for obj in kept], axis=0)  # (K,8)
        return annos

    def process_single_scene(sample_idx):
        print(f'{self.split} sample_idx: {sample_idx}')
        # convert depth to points
        # TODO: Check whether can move the point
        #  sampling process during training.
        depth_points = self.get_depth(sample_idx)
        sampled_points = random_sampling(depth_points, num_points)

        info = dict()
        info['point_cloud'] = {'num_features': 6, 'lidar_idx': sample_idx}

        bin_name = f'{sample_idx:06d}.bin'
        points_dir = osp.join(self.root_dir, 'points')
        mmcv.mkdir_or_exist(points_dir)
        sampled_points.tofile(osp.join(points_dir, bin_name))

        info['pts_path'] = osp.join('points', bin_name)
        info['image'] = {
            'image_idx': sample_idx,
            'image_shape': self.get_image_shape(sample_idx),
            'image_path': osp.join('image', f'{sample_idx:06d}.jpg')
        }

        intrinsic, extrinsic = self.get_calibration(sample_idx)
        info['calib'] = {'K': intrinsic, 'Rt': extrinsic}

        if has_label:
            info['annos'] = collect_annotations(sample_idx)
        return info

    ids = self.sample_id_list if sample_id_list is None else sample_id_list
    with futures.ThreadPoolExecutor(num_workers) as executor:
        scene_infos = executor.map(process_single_scene, ids)
    return list(scene_infos)
+""" + +try: + from waymo_open_dataset import dataset_pb2 +except ImportError: + raise ImportError( + 'Please run "pip install waymo-open-dataset-tf-2-2-0==1.2.0" ' + 'to install the official devkit first.') + +import mmcv +import numpy as np +import tensorflow as tf +from glob import glob +from os.path import join +from waymo_open_dataset.utils import range_image_utils, transform_utils +from waymo_open_dataset.utils.frame_utils import \ + parse_range_image_and_camera_projection + + +class Waymo2KITTI(object): + """Waymo to KITTI converter. + + This class serves as the converter to change the waymo raw data to KITTI + format. + + Args: + load_dir (str): Directory to load waymo raw data. + save_dir (str): Directory to save data in KITTI format. + prefix (str): Prefix of filename. In general, 0 for training, 1 for + validation and 2 for testing. + workers (str): Number of workers for the parallel process. + test_mode (bool): Whether in the test_mode. Default: False. + """ + + def __init__(self, + load_dir, + save_dir, + prefix, + workers=64, + test_mode=False): + self.filter_empty_3dboxes = True + self.filter_no_label_zone_points = True + + self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST'] + + # Only data collected in specific locations will be converted + # If set None, this filter is disabled + # Available options: location_sf (main dataset) + self.selected_waymo_locations = None + self.save_track_id = False + + # turn on eager execution for older tensorflow versions + if int(tf.__version__.split('.')[0]) < 2: + tf.enable_eager_execution() + + self.lidar_list = [ + '_FRONT', '_FRONT_RIGHT', '_FRONT_LEFT', '_SIDE_RIGHT', + '_SIDE_LEFT' + ] + self.type_list = [ + 'UNKNOWN', 'VEHICLE', 'PEDESTRIAN', 'SIGN', 'CYCLIST' + ] + self.waymo_to_kitti_class_map = { + 'UNKNOWN': 'DontCare', + 'PEDESTRIAN': 'Pedestrian', + 'VEHICLE': 'Car', + 'CYCLIST': 'Cyclist', + 'SIGN': 'Sign' # not in kitti + } + + self.load_dir = load_dir + self.save_dir = save_dir + 
def convert_one(self, file_idx):
    """Convert action for single file.

    Only every fifth frame of the record is converted. Frames recorded
    outside ``self.selected_waymo_locations`` are skipped when that filter
    is set.

    Args:
        file_idx (int): Index of the file to be converted.
    """
    record_path = self.tfrecord_pathnames[file_idx]
    records = tf.data.TFRecordDataset(record_path, compression_type='')

    for frame_idx, record in enumerate(records):
        if frame_idx % 5:
            continue

        frame = dataset_pb2.Frame()
        frame.ParseFromString(bytearray(record.numpy()))

        locations = self.selected_waymo_locations
        if locations is not None:
            if frame.context.stats.location not in locations:
                continue

        # Calibration is saved before labels on purpose: save_label reads
        # the front-camera transform that save_calib stores on ``self``.
        writers = (self.save_image, self.save_calib, self.save_lidar,
                   self.save_pose)
        for write in writers:
            write(frame, file_idx, frame_idx)

        if not self.test_mode:
            self.save_label(frame, file_idx, frame_idx)
def save_calib(self, frame, file_idx, frame_idx):
    """Parse and save the calibration data.

    Writes one text file with the rows ``P0``-``P4``, ``R0_rect`` and
    ``Tr_velo_to_cam_0``-``Tr_velo_to_cam_4``. The 4x4 transform of the
    camera named 1 (FRONT) is also stored in ``self.T_velo_to_front_cam``.

    Args:
        frame (:obj:`Frame`): Open dataset frame proto.
        file_idx (int): Current file index.
        frame_idx (int): Current frame index.
    """
    # waymo front camera to kitti reference camera
    front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0],
                                 [1.0, 0.0, 0.0]])
    projections = []
    velo_to_cams = []

    for camera in frame.context.camera_calibrations:
        # extrinsic parameters
        cam_to_vehicle = np.array(camera.extrinsic.transform).reshape(4, 4)
        vehicle_to_cam = np.linalg.inv(cam_to_vehicle)
        velo_to_cam = self.cart_to_homo(front_cam_to_ref) @ vehicle_to_cam
        if camera.name == 1:  # FRONT = 1, see dataset.proto for details
            self.T_velo_to_front_cam = velo_to_cam.copy()
        velo_to_cams.append(
            [f'{v:e}' for v in velo_to_cam[:3, :].reshape((12, ))])

        # intrinsic parameters
        projection = np.zeros((3, 4))
        projection[0, 0] = camera.intrinsic[0]
        projection[1, 1] = camera.intrinsic[1]
        projection[0, 2] = camera.intrinsic[2]
        projection[1, 2] = camera.intrinsic[3]
        projection[2, 2] = 1
        projections.append([f'{v:e}' for v in projection.reshape(12)])

    # all camera ids are saved as id-1 in the result because
    # camera 0 is unknown in the proto
    rows = [f'P{i}: ' + ' '.join(projections[i]) for i in range(5)]
    rows.append('R0_rect: ' +
                ' '.join(f'{v:e}' for v in np.eye(3).flatten()))
    rows.extend(f'Tr_velo_to_cam_{i}: ' + ' '.join(velo_to_cams[i])
                for i in range(5))

    calib_path = (f'{self.calib_save_dir}/{self.prefix}'
                  f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt')
    with open(calib_path, 'w+') as fp_calib:
        fp_calib.write(''.join(row + '\n' for row in rows))
def save_label(self, frame, file_idx, frame_idx):
    """Parse and save the label data in txt format.

    The relation between waymo and kitti coordinates is noteworthy:
    1. x, y, z correspond to l, w, h (waymo) -> l, h, w (kitti)
    2. x-y-z: front-left-up (waymo) -> right-down-front(kitti)
    3. bbox origin at volumetric center (waymo) -> bottom center (kitti)
    4. rotation: +x around y-axis (kitti) -> +x around z-axis (waymo)

    Every kept object is appended to the per-camera label file
    ``<label_save_dir><camera>/<name>.txt`` and written to the combined
    file under ``label_all_save_dir``. The method reads
    ``self.T_velo_to_front_cam``, which ``save_calib`` assigns, so the
    calibration of the frame has to be saved first.

    Args:
        frame (:obj:`Frame`): Open dataset frame proto.
        file_idx (int): Current file index.
        frame_idx (int): Current frame index.
    """
    file_name = (f'{self.prefix}{str(file_idx).zfill(3)}'
                 f'{str(frame_idx).zfill(3)}.txt')

    # Map each projected label id to its 2D box and 0-based camera index.
    id_to_bbox = dict()
    id_to_name = dict()
    for labels in frame.projected_lidar_labels:
        name = labels.name
        for label in labels.labels:
            # TODO: need a workaround as bbox may not belong to front cam
            bbox = [
                label.box.center_x - label.box.length / 2,
                label.box.center_y - label.box.width / 2,
                label.box.center_x + label.box.length / 2,
                label.box.center_y + label.box.width / 2
            ]
            id_to_bbox[label.id] = bbox
            id_to_name[label.id] = name - 1

    # Context managers guarantee the files are closed even when an object
    # fails to convert part-way through the frame.
    with open(f'{self.label_all_save_dir}/{file_name}',
              'w+') as fp_label_all:
        for obj in frame.laser_labels:
            obj_id = obj.id
            # The first camera (in ``self.lidar_list`` order) that has a
            # projected box for this object decides the 2D box and the
            # camera folder; otherwise fall back to camera 0 with an
            # all-zero box.
            for suffix in self.lidar_list:
                key = obj_id + suffix
                if key in id_to_bbox:
                    bounding_box = id_to_bbox[key]
                    name = str(id_to_name.get(key))
                    break
            else:
                name = '0'
                bounding_box = (0, 0, 0, 0)

            my_type = self.type_list[obj.type]

            if my_type not in self.selected_waymo_classes:
                continue

            if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1:
                continue

            my_type = self.waymo_to_kitti_class_map[my_type]

            height = obj.box.height
            width = obj.box.width
            length = obj.box.length

            # Move the origin from the box centre to the bottom centre.
            x = obj.box.center_x
            y = obj.box.center_y
            z = obj.box.center_z - height / 2

            # project bounding box to the virtual reference frame
            pt_ref = self.T_velo_to_front_cam @ \
                np.array([x, y, z, 1]).reshape((4, 1))
            x, y, z, _ = pt_ref.flatten().tolist()

            rotation_y = -obj.box.heading - np.pi / 2
            track_id = obj.id

            # not available
            truncated = 0
            occluded = 0
            alpha = -10

            line = my_type + \
                ' {} {} {} {} {} {} {} {} {} {} {} {} {} {}\n'.format(
                    round(truncated, 2), occluded, round(alpha, 2),
                    round(bounding_box[0], 2), round(bounding_box[1], 2),
                    round(bounding_box[2], 2), round(bounding_box[3], 2),
                    round(height, 2), round(width, 2), round(length, 2),
                    round(x, 2), round(y, 2), round(z, 2),
                    round(rotation_y, 2))

            if self.save_track_id:
                # str() keeps the concatenation valid for non-str ids.
                line_all = line[:-1] + ' ' + name + ' ' + \
                    str(track_id) + '\n'
            else:
                line_all = line[:-1] + ' ' + name + '\n'

            with open(f'{self.label_save_dir}{name}/{file_name}',
                      'a') as fp_label:
                fp_label.write(line)

            fp_label_all.write(line_all)
def create_folder(self):
    """Create folder for data preprocessing.

    Calibration, point cloud and pose folders are always created, plus
    five numbered image folders (suffix 0-4). Outside test mode the
    combined label folder and five numbered label folders are created
    as well.
    """
    single_dirs = [
        self.calib_save_dir, self.point_cloud_save_dir, self.pose_save_dir
    ]
    numbered_prefixes = [self.image_save_dir]
    if not self.test_mode:
        single_dirs.insert(0, self.label_all_save_dir)
        numbered_prefixes.insert(0, self.label_save_dir)

    for folder in single_dirs:
        mmcv.mkdir_or_exist(folder)
    for prefix in numbered_prefixes:
        for cam_idx in range(5):
            mmcv.mkdir_or_exist(f'{prefix}{str(cam_idx)}')
+ """ + calibrations = sorted( + frame.context.laser_calibrations, key=lambda c: c.name) + points = [] + cp_points = [] + intensity = [] + elongation = [] + + frame_pose = tf.convert_to_tensor( + value=np.reshape(np.array(frame.pose.transform), [4, 4])) + # [H, W, 6] + range_image_top_pose_tensor = tf.reshape( + tf.convert_to_tensor(value=range_image_top_pose.data), + range_image_top_pose.shape.dims) + # [H, W, 3, 3] + range_image_top_pose_tensor_rotation = \ + transform_utils.get_rotation_matrix( + range_image_top_pose_tensor[..., 0], + range_image_top_pose_tensor[..., 1], + range_image_top_pose_tensor[..., 2]) + range_image_top_pose_tensor_translation = \ + range_image_top_pose_tensor[..., 3:] + range_image_top_pose_tensor = transform_utils.get_transform( + range_image_top_pose_tensor_rotation, + range_image_top_pose_tensor_translation) + for c in calibrations: + range_image = range_images[c.name][ri_index] + if len(c.beam_inclinations) == 0: + beam_inclinations = range_image_utils.compute_inclination( + tf.constant( + [c.beam_inclination_min, c.beam_inclination_max]), + height=range_image.shape.dims[0]) + else: + beam_inclinations = tf.constant(c.beam_inclinations) + + beam_inclinations = tf.reverse(beam_inclinations, axis=[-1]) + extrinsic = np.reshape(np.array(c.extrinsic.transform), [4, 4]) + + range_image_tensor = tf.reshape( + tf.convert_to_tensor(value=range_image.data), + range_image.shape.dims) + pixel_pose_local = None + frame_pose_local = None + if c.name == dataset_pb2.LaserName.TOP: + pixel_pose_local = range_image_top_pose_tensor + pixel_pose_local = tf.expand_dims(pixel_pose_local, axis=0) + frame_pose_local = tf.expand_dims(frame_pose, axis=0) + range_image_mask = range_image_tensor[..., 0] > 0 + + if self.filter_no_label_zone_points: + nlz_mask = range_image_tensor[..., 3] != 1.0 # 1.0: in NLZ + range_image_mask = range_image_mask & nlz_mask + + range_image_cartesian = \ + range_image_utils.extract_point_cloud_from_range_image( + 
def cart_to_homo(self, mat):
    """Convert transformation matrix in Cartesian coordinates to
    homogeneous format.

    Args:
        mat (np.ndarray): Transformation matrix in Cartesian.
            The input matrix shape is 3x3 or 3x4.

    Returns:
        np.ndarray: Transformation matrix in homogeneous format.
            The matrix shape is 4x4.

    Raises:
        ValueError: If ``mat`` is neither 3x3 nor 3x4; the offending shape
            is the exception argument.
    """
    if mat.shape not in ((3, 3), (3, 4)):
        raise ValueError(mat.shape)
    homogeneous = np.eye(4)
    # A 3x3 input fills the rotation part only; a 3x4 input fills the top
    # three rows completely.
    homogeneous[:3, :mat.shape[1]] = mat
    return homogeneous
def parse_args():
    """Parse the command-line arguments of the training script.

    ``LOCAL_RANK`` is exported to the environment from ``--local_rank``
    when it is not already set, and the deprecated ``--options`` is folded
    into ``cfg_options``.

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both ``--options`` and ``--cfg-options`` are given.
    """
    parser = argparse.ArgumentParser(description='Train a detector')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument(
        '--resume-from', help='the checkpoint file to resume from')
    parser.add_argument(
        '--no-validate',
        action='store_true',
        help='whether not to evaluate the checkpoint during training')

    # --gpus and --gpu-ids cannot be combined.
    gpu_group = parser.add_mutually_exclusive_group()
    gpu_group.add_argument(
        '--gpus',
        type=int,
        help='number of gpus to use '
        '(only applicable to non-distributed training)')
    gpu_group.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='ids of gpus to use '
        '(only applicable to non-distributed training)')

    parser.add_argument('--seed', type=int, default=0, help='random seed')
    parser.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file (deprecate), '
        'change to --cfg-options instead.')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    parser.add_argument(
        '--autoscale-lr',
        action='store_true',
        help='automatically scale lr with the number of gpus')

    parsed = parser.parse_args()
    os.environ.setdefault('LOCAL_RANK', str(parsed.local_rank))

    if parsed.options:
        if parsed.cfg_options:
            raise ValueError(
                '--options and --cfg-options cannot be both specified, '
                '--options is deprecated in favor of --cfg-options')
        warnings.warn('--options is deprecated in favor of --cfg-options')
        parsed.cfg_options = parsed.options

    return parsed
+ m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + from projects.mmdet3d_plugin.bevformer.apis import custom_train_model + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + #if args.resume_from is not None: + + if args.resume_from is not None and osp.isfile(args.resume_from): + cfg.resume_from = args.resume_from + + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids + else: + cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) + if digit_version(TORCH_VERSION) != digit_version('1.8.1'): + cfg.optimizer['type'] = 'AdamW' + if args.autoscale_lr: + # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) + cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + assert False, 'DOT NOT SUPPORT!!!' 
+ distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + # re-set gpu_ids with distributed training mode + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + # dump config + cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + # specify logger name, if we still use 'mmdet', the output info will be + # filtered and won't be saved in the log_file + # TODO: ugly workaround to judge whether we are training det or seg model + if cfg.model.type in ['EncoderDecoder3D']: + logger_name = 'mmseg' + else: + logger_name = 'mmdet' + logger = get_root_logger( + log_file=log_file, log_level=cfg.log_level, name=logger_name) + + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + meta['config'] = cfg.pretty_text + + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + if args.seed is not None: + logger.info(f'Set random seed to {args.seed}, ' + f'deterministic: {args.deterministic}') + set_random_seed(args.seed, deterministic=args.deterministic) + cfg.seed = args.seed + meta['seed'] = args.seed + meta['exp_name'] = osp.basename(args.config) + + model = build_model( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + model.init_weights() + + eval_model_config = copy.deepcopy(cfg.model) + eval_model = build_model( + 
eval_model_config, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(eval_model) + + #eval_model.init_weights() + eval_model.load_state_dict(model.state_dict()) + + logger.info(f'Model:\n{model}') + from projects.mmdet3d_plugin.datasets import custom_build_dataset + datasets = [custom_build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + # in case we use a dataset wrapper + if 'dataset' in cfg.data.train: + val_dataset.pipeline = cfg.data.train.dataset.pipeline + else: + val_dataset.pipeline = cfg.data.train.pipeline + # set test_mode=False here in deep copied config + # which do not affect AP/AR calculation later + # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa + val_dataset.test_mode = False + datasets.append(custom_build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmdet version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmdet_version=mmdet_version, + mmseg_version=mmseg_version, + mmdet3d_version=mmdet3d_version, + config=cfg.pretty_text, + CLASSES=datasets[0].CLASSES, + PALETTE=datasets[0].PALETTE # for segmentors + if hasattr(datasets[0], 'PALETTE') else None) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + custom_train_model( + model, + datasets, + cfg, + eval_model=eval_model, + distributed=distributed, + validate=(not args.no_validate), + timestamp=timestamp, + meta=meta) + + +if __name__ == '__main__': + main() diff --git a/model_examples/MapTR/tools/maptr/benchmark.py b/model_examples/MapTR/tools/maptr/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..8c9f01d2e032702a4572eae6f659c98b094c5878 --- /dev/null +++ b/model_examples/MapTR/tools/maptr/benchmark.py @@ -0,0 +1,125 @@ 
def parse_args():
    """Parse the command-line arguments of the benchmark script.

    Returns:
        argparse.Namespace: ``config``, ``checkpoint``, ``samples`` (int),
            ``log_interval`` (int) and ``fuse_conv_bn`` (bool).
    """
    parser = argparse.ArgumentParser(description='MMDet benchmark a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('--checkpoint', default=None, help='checkpoint file')
    # type=int is required: without it a value given on the command line is
    # a str, so the caller's ``(i + 1) % args.log_interval`` raises and
    # ``(i + 1) == args.samples`` never matches.
    parser.add_argument(
        '--samples', type=int, default=2000, help='samples to benchmark')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
    args = parser.parse_args()
    return args
+ m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + cfg.model.pretrained = None + cfg.data.test.test_mode = True + + # build the dataloader + # TODO: support multiple images per gpu (only minor changes are needed) + print(cfg.data.test) + dataset = custom_build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + samples_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=False, + shuffle=False) + + # build the model and load checkpoint + cfg.model.train_cfg = None + model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('***********number of params:', n_parameters) + + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + if args.checkpoint is not None: + load_checkpoint(model, args.checkpoint, map_location='cpu') + #if args.fuse_conv_bn: + # model = fuse_module(model) + + model = MMDataParallel(model, device_ids=[0]) + + model.eval() + + # the first several iterations may be very slow so skip them + num_warmup = 5 + pure_inf_time = 0 + + # benchmark with several samples and take the average + for i, data in enumerate(data_loader): + torch.cuda.synchronize() + start_time = time.perf_counter() + with torch.no_grad(): + model(return_loss=False, rescale=True, **data) + + torch.cuda.synchronize() + elapsed = time.perf_counter() - start_time + + if i >= num_warmup: + pure_inf_time += elapsed + if (i + 1) % args.log_interval == 0: + fps = (i + 1 - num_warmup) / pure_inf_time + print(f'Done image [{i + 1:<3}/ {args.samples}], ' + f'fps: {fps:.1f} img / s') + + if (i + 1) == args.samples: + pure_inf_time += elapsed + fps = (i + 1 - num_warmup) / pure_inf_time + print(f'Overall fps: {fps:.1f} img / s') + break + + +if __name__ == '__main__': + main() diff --git 
CAMS = ['FRONT_LEFT', 'FRONT', 'FRONT_RIGHT',
        'BACK_LEFT', 'BACK', 'BACK_RIGHT', ]
VIEWS_NAME = 'surroud_view.jpg'
GT_MAP_NAME = 'GT_fixednum_pts_MAP.png'
PRED_MAP_NAME = 'PRED_MAP_plot.png'


def parse_args():
    """Parse the command line: the visualisation directory and the name of
    the composite image written into every sample sub-directory."""
    parser = argparse.ArgumentParser(description='vis hdmaptr map gt label')
    parser.add_argument('visdir', help='visualize directory')
    parser.add_argument('--sample-name', default='SAMPLE_VIS.png', type=str)
    args = parser.parse_args()
    return args


def _read_image(path):
    """Read an image with OpenCV, failing loudly when it cannot be loaded.

    ``cv2.imread`` returns ``None`` for a missing or unreadable file, which
    would otherwise only surface later as an opaque OpenCV error.
    """
    img = cv2.imread(path)
    if img is None:
        raise FileNotFoundError(f'cannot read image: {path}')
    return img


def _load_labeled_cam(sample_dir, cam):
    """Load ``CAM_<cam>.jpg`` and stamp the camera name in its top-left corner.

    The ``BACK`` camera image is flipped horizontally before labelling.
    """
    cam_img = _read_image(osp.join(sample_dir, 'CAM_' + cam + '.jpg'))
    if cam == 'BACK':
        cam_img = cv2.flip(cam_img, 1)
    lw = 8
    tf = max(lw - 1, 1)
    # text width, height
    w, h = cv2.getTextSize(cam, 0, fontScale=lw / 3, thickness=tf)[0]
    # filled black box behind the white label
    cv2.rectangle(cam_img, (0, 0), (w, h + 3), (0, 0, 0), -1, cv2.LINE_AA)
    cv2.putText(cam_img,
                cam, (0, h + 2),
                0,
                lw / 3,
                (255, 255, 255),
                thickness=tf,
                lineType=cv2.LINE_AA)
    return cam_img


def main():
    """Write one composite image per sample sub-directory of ``visdir``.

    Each composite places the 2x3 grid of labelled camera images next to the
    predicted map and the ground-truth map, both resized to the grid height.
    """
    args = parse_args()

    file_list = os.listdir(args.visdir)
    prog_bar = mmcv.ProgressBar(len(file_list))
    for file in file_list:
        file_path = osp.join(args.visdir, file)
        if os.path.isdir(file_path):
            sample_path = osp.join(file_path, args.sample_name)
            row_1_img = cv2.hconcat(
                [_load_labeled_cam(file_path, cam) for cam in CAMS[:3]])
            row_2_img = cv2.hconcat(
                [_load_labeled_cam(file_path, cam) for cam in CAMS[3:]])
            cams_img = cv2.vconcat([row_1_img, row_2_img])

            map_img = _read_image(osp.join(file_path, PRED_MAP_NAME))
            gt_map_img = _read_image(osp.join(file_path, GT_MAP_NAME))
            map_img = cv2.copyMakeBorder(
                map_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None, value=0)
            gt_map_img = cv2.copyMakeBorder(
                gt_map_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None,
                value=0)

            # Scale both maps to the height of the camera grid; the GT map
            # reuses the width computed from the predicted map.
            cams_h = cams_img.shape[0]
            map_h, map_w = map_img.shape[:2]
            resized_w = map_w * (cams_h / map_h)
            target_size = (int(resized_w), int(cams_h))
            resized_map_img = cv2.resize(map_img, target_size)
            resized_gt_map_img = cv2.resize(gt_map_img, target_size)

            sample_img = cv2.hconcat(
                [cams_img, resized_map_img, resized_gt_map_img])
            cv2.imwrite(sample_path, sample_img)
        prog_bar.update()
    print('DONE!')


if __name__ == '__main__':
    main()
CAMS = ['FRONT_LEFT', 'FRONT', 'FRONT_RIGHT',
        'BACK_LEFT', 'BACK', 'BACK_RIGHT', ]
GT_MAP_NAME = 'GT_fixednum_pts_MAP.png'
PRED_MAP_NAME = 'PRED_MAP_plot.png'


def parse_args():
    """Parse the command line: visualisation directory, video frame rate,
    video base name and per-sample composite image name."""
    parser = argparse.ArgumentParser(description='vis hdmaptr map gt label')
    parser.add_argument('visdir', help='visualize directory')
    parser.add_argument(
        '--fps', default=10, type=int, help='fps to generate video')
    parser.add_argument('--video-name', default='demo', type=str)
    parser.add_argument('--sample-name', default='SAMPLE_VIS.jpg', type=str)
    args = parser.parse_args()
    return args


def _read_image(path):
    """Read an image with OpenCV; raise instead of returning ``None`` when
    the file is missing or unreadable."""
    img = cv2.imread(path)
    if img is None:
        raise FileNotFoundError(f'cannot read image: {path}')
    return img


def _load_labeled_cam(sample_dir, cam):
    """Load ``CAM_<cam>.jpg`` and stamp the camera name in its top-left corner.

    The ``BACK`` camera image is flipped horizontally before labelling.
    """
    cam_img = _read_image(osp.join(sample_dir, 'CAM_' + cam + '.jpg'))
    if cam == 'BACK':
        cam_img = cv2.flip(cam_img, 1)
    lw = 8
    tf = max(lw - 1, 1)
    # text width, height
    w, h = cv2.getTextSize(cam, 0, fontScale=lw / 3, thickness=tf)[0]
    # filled black box behind the white label
    cv2.rectangle(cam_img, (0, 0), (w, h + 3), (0, 0, 0), -1, cv2.LINE_AA)
    cv2.putText(cam_img,
                cam, (0, h + 2),
                0,
                lw / 3,
                (255, 255, 255),
                thickness=tf,
                lineType=cv2.LINE_AA)
    return cam_img


def _compose_sample(sample_dir):
    """Build the frame for one sample: camera grid | predicted map | GT map."""
    row_1_img = cv2.hconcat(
        [_load_labeled_cam(sample_dir, cam) for cam in CAMS[:3]])
    row_2_img = cv2.hconcat(
        [_load_labeled_cam(sample_dir, cam) for cam in CAMS[3:]])
    cams_img = cv2.vconcat([row_1_img, row_2_img])

    map_img = _read_image(osp.join(sample_dir, PRED_MAP_NAME))
    gt_map_img = _read_image(osp.join(sample_dir, GT_MAP_NAME))
    map_img = cv2.copyMakeBorder(
        map_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None, value=0)
    gt_map_img = cv2.copyMakeBorder(
        gt_map_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None, value=0)

    # Scale both maps to the height of the camera grid; the GT map reuses
    # the width computed from the predicted map.
    cams_h = cams_img.shape[0]
    map_h, map_w = map_img.shape[:2]
    resized_w = map_w * (cams_h / map_h)
    target_size = (int(resized_w), int(cams_h))
    resized_map_img = cv2.resize(map_img, target_size)
    resized_gt_map_img = cv2.resize(gt_map_img, target_size)

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 2
    thickness = 5
    org = (20, 50)
    # (0, 0, 255) is red in OpenCV's BGR channel order
    color = (0, 0, 255)
    resized_map_img = cv2.putText(resized_map_img, 'PRED', org, font,
                                  font_scale, color, thickness, cv2.LINE_AA)
    resized_gt_map_img = cv2.putText(resized_gt_map_img, 'GT', org, font,
                                     font_scale, color, thickness,
                                     cv2.LINE_AA)

    return cv2.hconcat([cams_img, resized_map_img, resized_gt_map_img])


def main():
    """Render every sample sub-directory of ``visdir`` into an mp4.

    For each sub-directory the composite frame is saved as a JPEG inside it
    and appended, resized to 1680x450, to ``<visdir>/../<video-name>.mp4``.
    """
    args = parse_args()
    parent_dir = osp.join(args.visdir, '..')
    size = (1680, 450)
    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
    video_path = osp.join(parent_dir, '%s.mp4' % args.video_name)
    video = cv2.VideoWriter(video_path, fourcc, args.fps, size, True)
    # os.listdir returns entries in arbitrary order; sort so that the frame
    # order of the video is deterministic.
    file_list = sorted(os.listdir(args.visdir))
    prog_bar = mmcv.ProgressBar(len(file_list))
    try:
        for file in file_list:
            file_path = osp.join(args.visdir, file)
            if os.path.isdir(file_path):
                sample_path = osp.join(file_path, args.sample_name)
                sample_img = _compose_sample(file_path)
                cv2.imwrite(sample_path, sample_img,
                            [cv2.IMWRITE_JPEG_QUALITY, 70])
                video.write(cv2.resize(sample_img, size))
            prog_bar.update()
    finally:
        # Always finalise the container, even when a sample fails.
        video.release()

    cv2.destroyAllWindows()


if __name__ == '__main__':
    main()
def parse_args():
    """Parse the command line of the result-evaluation script.

    Also exports ``LOCAL_RANK`` to the environment when it is not set, and
    maps the deprecated ``--options`` onto ``--eval-options``.

    Returns:
        argparse.Namespace: the parsed arguments.

    Raises:
        ValueError: if both ``--options`` and ``--eval-options`` are given.
    """
    parser = argparse.ArgumentParser(
        description='MMDet test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('respath', help='respath file')
    parser.add_argument('--out', help='output result file in pickle format')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
    parser.add_argument(
        '--format-only',
        action='store_true',
        help='Format the output results without perform evaluation. It is '
        'useful when you want to format the result to a specific format and '
        'submit it to the test server')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
    parser.add_argument('--show', action='store_true', help='show results')
    parser.add_argument(
        '--show-dir', help='directory where results will be saved')
    parser.add_argument(
        '--gpu-collect',
        action='store_true',
        help='whether to use gpu to collect results.')
    parser.add_argument(
        '--tmpdir',
        help='tmp directory used for collecting results from multiple '
        'workers, available when gpu-collect is not specified')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    parser.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function (deprecate), '
        'change to --eval-options instead.')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both specified, '
            '--options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args


def main():
    """Evaluate a previously saved result file against the test dataset.

    Loads the config, imports the plugin package, builds the test dataset
    and dataloader and, on rank 0 when ``--eval`` is given, prints
    ``dataset._evaluate_single(respath, metric=...)``.

    Raises:
        ValueError: on an invalid combination of command-line options.
    """
    args = parse_args()

    # Raised explicitly (instead of ``assert``) so the check survives
    # ``python -O`` and matches the other argument checks below.
    if not (args.out or args.eval or args.format_only or args.show
            or args.show_dir):
        raise ValueError(
            'Please specify at least one operation (save/eval/format/show the '
            'results / save the results) with the argument "--out", "--eval"'
            ', "--format-only", "--show" or "--show-dir"')

    if args.eval and args.format_only:
        raise ValueError('--eval and --format_only cannot be both specified')

    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # import modules from string list.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])

    # import modules from plugin/xx, registry will be updated
    if getattr(cfg, 'plugin', False):
        import importlib
        # The plugin package is the directory of ``plugin_dir`` when the
        # config defines it, otherwise the directory of the config file.
        plugin_root = cfg.plugin_dir if hasattr(cfg, 'plugin_dir') \
            else args.config
        _module_path = os.path.dirname(plugin_root).replace('/', '.')
        print(_module_path)
        importlib.import_module(_module_path)

    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    cfg.model.pretrained = None
    # in case the test dataset is concatenated
    samples_per_gpu = 1
    if isinstance(cfg.data.test, dict):
        cfg.data.test.test_mode = True
        samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.test.pipeline = replace_ImageToTensor(
                cfg.data.test.pipeline)
    elif isinstance(cfg.data.test, list):
        for ds_cfg in cfg.data.test:
            ds_cfg.test_mode = True
        samples_per_gpu = max(
            ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test)
        if samples_per_gpu > 1:
            for ds_cfg in cfg.data.test:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # set random seeds
    if args.seed is not None:
        set_random_seed(args.seed, deterministic=args.deterministic)

    # build the dataloader
    dataset = build_dataset(cfg.data.test)
    # NOTE(review): the loader is built but not consumed by this script;
    # kept so that an invalid dataloader config is still reported here.
    build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False,
        nonshuffler_sampler=cfg.data.nonshuffler_sampler,
    )

    rank, _ = get_dist_info()
    if rank == 0 and args.eval:
        print(dataset._evaluate_single(args.respath, metric=args.eval))


if __name__ == '__main__':
    main()
projects.mmdet3d_plugin.datasets.builder import build_dataloader +from mmdet3d.models import build_model +from mmdet.apis import set_random_seed +from projects.mmdet3d_plugin.bevformer.apis.test import custom_multi_gpu_test +from mmdet.datasets import replace_ImageToTensor +import time +import os.path as osp +import numpy as np +from PIL import Image +import matplotlib.pyplot as plt +from matplotlib import transforms +from matplotlib.patches import Rectangle +import cv2 + +CAMS = ['CAM_FRONT_LEFT','CAM_FRONT','CAM_FRONT_RIGHT', + 'CAM_BACK_LEFT','CAM_BACK','CAM_BACK_RIGHT',] +# we choose these samples not because it is easy but because it is hard +CANDIDATE=['n008-2018-08-01-15-16-36-0400_1533151184047036', + 'n008-2018-08-01-15-16-36-0400_1533151200646853', + 'n008-2018-08-01-15-16-36-0400_1533151274047332', + 'n008-2018-08-01-15-16-36-0400_1533151369947807', + 'n008-2018-08-01-15-16-36-0400_1533151581047647', + 'n008-2018-08-01-15-16-36-0400_1533151585447531', + 'n008-2018-08-01-15-16-36-0400_1533151741547700', + 'n008-2018-08-01-15-16-36-0400_1533151854947676', + 'n008-2018-08-22-15-53-49-0400_1534968048946931', + 'n008-2018-08-22-15-53-49-0400_1534968255947662', + 'n008-2018-08-01-15-16-36-0400_1533151616447606', + 'n015-2018-07-18-11-41-49+0800_1531885617949602', + 'n008-2018-08-28-16-43-51-0400_1535489136547616', + 'n008-2018-08-28-16-43-51-0400_1535489145446939', + 'n008-2018-08-28-16-43-51-0400_1535489152948944', + 'n008-2018-08-28-16-43-51-0400_1535489299547057', + 'n008-2018-08-28-16-43-51-0400_1535489317946828', + 'n008-2018-09-18-15-12-01-0400_1537298038950431', + 'n008-2018-09-18-15-12-01-0400_1537298047650680', + 'n008-2018-09-18-15-12-01-0400_1537298056450495', + 'n008-2018-09-18-15-12-01-0400_1537298074700410', + 'n008-2018-09-18-15-12-01-0400_1537298088148941', + 'n008-2018-09-18-15-12-01-0400_1537298101700395', + 'n015-2018-11-21-19-21-35+0800_1542799330198603', + 'n015-2018-11-21-19-21-35+0800_1542799345696426', + 
def perspective(cam_coords, proj_mat):
    """Project homogeneous points with ``proj_mat`` and dehomogenise them.

    Points whose third projected coordinate is not positive are dropped.

    Args:
        cam_coords: array whose columns are the points to project.
        proj_mat: matrix applied as ``proj_mat @ cam_coords``.

    Returns:
        Array with one ``(u, v)`` row per kept point.
    """
    pix_coords = proj_mat @ cam_coords
    valid_idx = pix_coords[2, :] > 0
    pix_coords = pix_coords[:, valid_idx]
    # small epsilon keeps the division finite for depths close to zero
    pix_coords = pix_coords[:2, :] / (pix_coords[2, :] + 1e-7)
    pix_coords = pix_coords.transpose(1, 0)
    return pix_coords


def parse_args():
    """Parse the command line of the prediction-visualisation script."""
    parser = argparse.ArgumentParser(description='vis hdmaptr map gt label')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--score-thresh',
        default=0.4,
        type=float,
        help='only predictions scoring above this threshold are drawn')
    parser.add_argument(
        '--show-dir', help='directory where visualizations will be saved')
    parser.add_argument(
        '--show-cam', action='store_true', help='show camera pic')
    parser.add_argument(
        '--gt-format',
        type=str,
        nargs='+',
        default=['fixed_num_pts', ],
        help='vis format, default is "fixed_num_pts", '
        'support ["se_pts","bbox","fixed_num_pts","polyline_pts"]')
    args = parser.parse_args()
    return args


def _new_map_figure(pc_range):
    """Open a fresh 2x4 inch figure limited to the x/y extent of ``pc_range``."""
    plt.figure(figsize=(2, 4))
    plt.xlim(pc_range[0], pc_range[3])
    plt.ylim(pc_range[1], pc_range[4])
    plt.axis('off')


def _save_map_figure(car_img, path):
    """Draw the car icon at the origin, save the current figure and close it."""
    plt.imshow(car_img, extent=[-1.2, 1.2, -1.5, 1.5])
    plt.savefig(path, bbox_inches='tight', format='png', dpi=1200)
    plt.close()


def main():
    """Render ground-truth and predicted maps for every test sample.

    For each sample with at least one valid GT label the script runs the
    model, copies the camera images into ``<show-dir>/<sample>/``, writes a
    2x3 surround-view mosaic, the GT map in the requested format(s) and the
    predicted map (predictions scoring above ``--score-thresh``).

    Raises:
        ValueError: if ``--gt-format`` contains an unsupported value.
    """
    args = parse_args()
    cfg = Config.fromfile(args.config)

    # import modules from plugin/xx, registry will be updated
    if getattr(cfg, 'plugin', False):
        import importlib
        # The plugin package is the directory of ``plugin_dir`` when the
        # config defines it, otherwise the directory of the config file.
        plugin_root = cfg.plugin_dir if hasattr(cfg, 'plugin_dir') \
            else args.config
        _module_path = os.path.dirname(plugin_root).replace('/', '.')
        print(_module_path)
        importlib.import_module(_module_path)

    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    cfg.model.pretrained = None
    # in case the test dataset is concatenated
    samples_per_gpu = 1
    if isinstance(cfg.data.test, dict):
        cfg.data.test.test_mode = True
        samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.test.pipeline = replace_ImageToTensor(
                cfg.data.test.pipeline)
    elif isinstance(cfg.data.test, list):
        for ds_cfg in cfg.data.test:
            ds_cfg.test_mode = True
        samples_per_gpu = max(
            ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test)
        if samples_per_gpu > 1:
            for ds_cfg in cfg.data.test:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    if args.show_dir is None:
        args.show_dir = osp.join('./work_dirs',
                                 osp.splitext(osp.basename(args.config))[0],
                                 'vis_pred')
    # create vis_pred dir
    mmcv.mkdir_or_exist(osp.abspath(args.show_dir))
    cfg.dump(osp.join(args.show_dir, osp.basename(args.config)))
    logger = get_root_logger()
    logger.info(f'DONE create vis_pred dir: {args.show_dir}')

    dataset = build_dataset(cfg.data.test)
    dataset.is_vis_on_test = True  # TODO, this is a hack
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=0,
        dist=False,
        shuffle=False,
        nonshuffler_sampler=cfg.data.nonshuffler_sampler,
    )
    logger.info('Done build test data set')

    # build the model and load checkpoint
    cfg.model.train_cfg = None
    model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    logger.info('loading check point')
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
    if 'CLASSES' in checkpoint.get('meta', {}):
        model.CLASSES = checkpoint['meta']['CLASSES']
    else:
        model.CLASSES = dataset.CLASSES
    # palette for visualization in segmentation tasks
    if 'PALETTE' in checkpoint.get('meta', {}):
        model.PALETTE = checkpoint['meta']['PALETTE']
    elif hasattr(dataset, 'PALETTE'):
        # segmentation dataset has `PALETTE` attribute
        model.PALETTE = dataset.PALETTE
    logger.info('DONE load check point')
    model = MMDataParallel(model, device_ids=[0])
    model.eval()

    # get pc_range
    pc_range = cfg.point_cloud_range

    # The car icon is drawn into every map, so the image stays open for the
    # whole loop.
    with Image.open('./figs/lidar_car.png') as car_img:
        # one colour per label index 0, 1, 2
        colors_plt = ['orange', 'b', 'g']

        logger.info('BEGIN vis test dataset samples gt label & pred')

        dataset = data_loader.dataset
        prog_bar = mmcv.ProgressBar(len(dataset))
        for i, data in enumerate(data_loader):
            # skip samples whose GT labels are all -1
            if not (data['gt_labels_3d'].data[0][0] != -1).any():
                logger.error(f'\n empty gt for index {i}, continue')
                continue

            img_metas = data['img_metas'][0].data[0]
            gt_bboxes_3d = data['gt_bboxes_3d'].data[0]
            gt_labels_3d = data['gt_labels_3d'].data[0]

            pts_filename = img_metas[0]['pts_filename']
            pts_filename = osp.basename(pts_filename)
            pts_filename = pts_filename.replace(
                '__LIDAR_TOP__', '_').split('.')[0]

            with torch.no_grad():
                result = model(return_loss=False, rescale=True, **data)
            sample_dir = osp.join(args.show_dir, pts_filename)
            mmcv.mkdir_or_exist(osp.abspath(sample_dir))

            # save cam img for sample, named after the second
            # '__'-separated field of the source file name
            for filepath in img_metas[0]['filename']:
                filename_splits = osp.basename(filepath).split('__')
                img_path = osp.join(sample_dir, filename_splits[1] + '.jpg')
                shutil.copyfile(filepath, img_path)

            # surrounding view
            row_1_img = cv2.hconcat([
                cv2.imread(osp.join(sample_dir, cam + '.jpg'))
                for cam in CAMS[:3]
            ])
            row_2_img = cv2.hconcat([
                cv2.imread(osp.join(sample_dir, cam + '.jpg'))
                for cam in CAMS[3:]
            ])
            cams_img = cv2.vconcat([row_1_img, row_2_img])
            cams_img_path = osp.join(sample_dir, 'surroud_view.jpg')
            cv2.imwrite(cams_img_path, cams_img,
                        [cv2.IMWRITE_JPEG_QUALITY, 70])

            for vis_format in args.gt_format:
                if vis_format == 'se_pts':
                    # NOTE(review): this branch draws on the current pyplot
                    # figure and never saves it — confirm intended output.
                    gt_line_points = gt_bboxes_3d[0].start_end_points
                    for gt_bbox_3d, gt_label_3d in zip(gt_line_points,
                                                       gt_labels_3d[0]):
                        pts = gt_bbox_3d.reshape(-1, 2).numpy()
                        x = pts[:, 0]
                        y = pts[:, 1]
                        plt.quiver(x[:-1], y[:-1], x[1:] - x[:-1],
                                   y[1:] - y[:-1], scale_units='xy',
                                   angles='xy', scale=1,
                                   color=colors_plt[gt_label_3d])
                elif vis_format == 'bbox':
                    # NOTE(review): same as 'se_pts' — drawn but not saved.
                    gt_lines_bbox = gt_bboxes_3d[0].bbox
                    for gt_bbox_3d, gt_label_3d in zip(gt_lines_bbox,
                                                       gt_labels_3d[0]):
                        gt_bbox_3d = gt_bbox_3d.numpy()
                        xy = (gt_bbox_3d[0], gt_bbox_3d[1])
                        width = gt_bbox_3d[2] - gt_bbox_3d[0]
                        height = gt_bbox_3d[3] - gt_bbox_3d[1]
                        plt.gca().add_patch(
                            Rectangle(xy, width, height, linewidth=0.4,
                                      edgecolor=colors_plt[gt_label_3d],
                                      facecolor='none'))
                elif vis_format == 'fixed_num_pts':
                    _new_map_figure(pc_range)
                    gt_lines_fixed_num_pts = \
                        gt_bboxes_3d[0].fixed_num_sampled_points
                    for gt_bbox_3d, gt_label_3d in zip(
                            gt_lines_fixed_num_pts, gt_labels_3d[0]):
                        pts = gt_bbox_3d.numpy()
                        x = pts[:, 0]
                        y = pts[:, 1]
                        plt.plot(x, y, color=colors_plt[gt_label_3d],
                                 linewidth=1, alpha=0.8, zorder=-1)
                        plt.scatter(x, y, color=colors_plt[gt_label_3d],
                                    s=2, alpha=0.8, zorder=-1)
                    _save_map_figure(
                        car_img,
                        osp.join(sample_dir, 'GT_fixednum_pts_MAP.png'))
                elif vis_format == 'polyline_pts':
                    _new_map_figure(pc_range)
                    gt_lines_instance = gt_bboxes_3d[0].instance_list
                    for gt_line_instance, gt_label_3d in zip(
                            gt_lines_instance, gt_labels_3d[0]):
                        pts = np.array(list(gt_line_instance.coords))
                        x = np.array([pt[0] for pt in pts])
                        y = np.array([pt[1] for pt in pts])
                        plt.plot(x, y, color=colors_plt[gt_label_3d],
                                 linewidth=1, alpha=0.8, zorder=-1)
                        plt.scatter(x, y, color=colors_plt[gt_label_3d],
                                    s=1, alpha=0.8, zorder=-1)
                    _save_map_figure(
                        car_img,
                        osp.join(sample_dir, 'GT_polyline_pts_MAP.png'))
                else:
                    logger.error(f'WRONG visformat for GT: {vis_format}')
                    raise ValueError(
                        f'WRONG visformat for GT: {vis_format}')

            # visualize pred
            result_dic = result[0]['pts_bbox']
            scores_3d = result_dic['scores_3d']
            labels_3d = result_dic['labels_3d']
            pts_3d = result_dic['pts_3d']
            keep = scores_3d > args.score_thresh

            # Exactly one figure per prediction plot: a figure that is
            # opened without being closed stays alive in pyplot, so opening
            # a spare one per sample would accumulate over the dataset.
            _new_map_figure(pc_range)
            for pred_label_3d, pred_pts_3d in zip(labels_3d[keep],
                                                  pts_3d[keep]):
                pred_pts_3d = pred_pts_3d.numpy()
                pts_x = pred_pts_3d[:, 0]
                pts_y = pred_pts_3d[:, 1]
                plt.plot(pts_x, pts_y, color=colors_plt[pred_label_3d],
                         linewidth=1, alpha=0.8, zorder=-1)
                plt.scatter(pts_x, pts_y, color=colors_plt[pred_label_3d],
                            s=1, alpha=0.8, zorder=-1)
            _save_map_figure(car_img,
                             osp.join(sample_dir, 'PRED_MAP_plot.png'))

            prog_bar.update()

        logger.info('\n DONE vis test dataset samples gt label & pred')


if __name__ == '__main__':
    main()
def to_depth_mode(points, bboxes):
    """Convert LiDAR-frame points and boxes to the depth frame.

    Either argument may be ``None``; a ``None`` input is passed through
    untouched. Inputs are copied (``points.copy()`` / ``bboxes.clone()``)
    before conversion, so the caller's objects are not modified here.

    Args:
        points: Point data exposing ``.copy()``, or ``None``.
        bboxes: Box data exposing ``.clone()``, or ``None``.

    Returns:
        tuple: ``(points, bboxes)`` after conversion.
    """
    depth_points = None
    if points is not None:
        depth_points = Coord3DMode.convert_point(
            points.copy(), Coord3DMode.LIDAR, Coord3DMode.DEPTH)

    depth_bboxes = None if bboxes is None else Box3DMode.convert(
        bboxes.clone(), Box3DMode.LIDAR, Box3DMode.DEPTH)

    return depth_points, depth_bboxes
def show_proj_bbox_img(idx,
                       dataset,
                       out_dir,
                       filename,
                       show=False,
                       is_nus_mono=False):
    """Draw the ground-truth 3D boxes of one sample onto its 2D image.

    The sample is fetched with ``dataset.prepare_train_data`` and, if that
    raises ``AttributeError``, with ``dataset.prepare_train_img`` instead.
    The box class decides the projection: depth boxes are drawn without a
    projection matrix, LiDAR boxes use ``img_metas['lidar2img']`` and camera
    boxes use ``img_metas['cam2img']``. Any other value (including the
    ``None`` substituted for an empty box set) only shows the image, after
    emitting a warning.

    Args:
        idx: Index of the sample in ``dataset``.
        dataset: Dataset object providing the sample and its annotations.
        out_dir: Output directory forwarded to the visualizer.
        filename: Output file stem forwarded to the visualizer.
        show (bool): Forwarded to the visualizer as ``show``.
        is_nus_mono (bool): Accepted for interface compatibility; not read
            inside this function.
    """
    try:
        sample = dataset.prepare_train_data(idx)
    except AttributeError:  # for Mono-3D datasets
        sample = dataset.prepare_train_img(idx)

    boxes = dataset.get_ann_info(idx)['gt_bboxes_3d']
    metas = sample['img_metas']._data
    # move the first axis to the end before handing the array over
    # (presumably CHW -> HWC; confirm against the pipeline output)
    image = sample['img']._data.numpy().transpose(1, 2, 0)

    # an empty box set is treated as "no boxes at all"
    if boxes.tensor.shape[0] == 0:
        boxes = None

    if isinstance(boxes, DepthInstance3DBoxes):
        proj_mat, mode = None, 'depth'
    elif isinstance(boxes, LiDARInstance3DBoxes):
        proj_mat, mode = metas['lidar2img'], 'lidar'
    elif isinstance(boxes, CameraInstance3DBoxes):
        proj_mat, mode = metas['cam2img'], 'camera'
    else:
        # can't project, just show img
        warnings.warn(
            f'unrecognized gt box type {type(boxes)}, only show image')
        show_multi_modality_result(
            image, None, None, None, out_dir, filename, show=show)
        return

    show_multi_modality_result(
        image,
        boxes,
        None,
        proj_mat,
        out_dir,
        filename,
        box_mode=mode,
        img_metas=metas,
        show=show)
args.cfg_options) + try: + dataset = build_dataset( + cfg.data.train, default_args=dict(filter_empty_gt=False)) + except TypeError: # seg dataset doesn't have `filter_empty_gt` key + dataset = build_dataset(cfg.data.train) + data_infos = dataset.data_infos + dataset_type = cfg.dataset_type + + # configure visualization mode + vis_task = args.task # 'det', 'seg', 'multi_modality-det', 'mono-det' + + for idx, data_info in enumerate(track_iter_progress(data_infos)): + if dataset_type in ['KittiDataset', 'WaymoDataset']: + data_path = data_info['point_cloud']['velodyne_path'] + elif dataset_type in [ + 'ScanNetDataset', 'SUNRGBDDataset', 'ScanNetSegDataset', + 'S3DISSegDataset', 'S3DISDataset' + ]: + data_path = data_info['pts_path'] + elif dataset_type in ['NuScenesDataset', 'LyftDataset']: + data_path = data_info['lidar_path'] + elif dataset_type in ['NuScenesMonoDataset']: + data_path = data_info['file_name'] + else: + raise NotImplementedError( + f'unsupported dataset type {dataset_type}') + + file_name = osp.splitext(osp.basename(data_path))[0] + + if vis_task in ['det', 'multi_modality-det']: + # show 3D bboxes on 3D point clouds + show_det_data( + idx, dataset, args.output_dir, file_name, show=args.online) + if vis_task in ['multi_modality-det', 'mono-det']: + # project 3D bboxes to 2D image + show_proj_bbox_img( + idx, + dataset, + args.output_dir, + file_name, + show=args.online, + is_nus_mono=(dataset_type == 'NuScenesMonoDataset')) + elif vis_task in ['seg']: + # show 3D segmentation mask on 3D point clouds + show_seg_data( + idx, dataset, args.output_dir, file_name, show=args.online) + + +if __name__ == '__main__': + main() diff --git a/model_examples/MapTR/tools/misc/fuse_conv_bn.py b/model_examples/MapTR/tools/misc/fuse_conv_bn.py new file mode 100644 index 0000000000000000000000000000000000000000..d4e22018d66d3bd47119522e9da2ea6676ba5760 --- /dev/null +++ b/model_examples/MapTR/tools/misc/fuse_conv_bn.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. 
def fuse_conv_bn(conv, bn):
    """Fold a batch-norm layer into the convolution that precedes it.

    At inference time batch norm only applies its stored per-channel running
    mean and variance (plus the optional affine scale/shift), which is a
    fixed per-channel linear map. That map can be merged into the weights and
    bias of the preceding convolution to save computation and simplify the
    network structure.

    Args:
        conv (nn.Conv2d): Convolution to fuse into. It is modified in place:
            its ``weight`` and ``bias`` are replaced with new parameters.
        bn (nn.BatchNorm2d | nn.SyncBatchNorm): Batch-norm layer applied
            right after ``conv``. Its running statistics must be available.
            Layers created with ``affine=False`` (no ``weight``/``bias``)
            are supported and treated as scale 1 / shift 0.

    Returns:
        nn.Conv2d: The same ``conv`` object, now producing ``bn(conv(x))``
        (with ``bn`` in eval mode) on its own.
    """
    conv_w = conv.weight
    conv_b = conv.bias if conv.bias is not None else torch.zeros_like(
        bn.running_mean)

    # A BN layer built with affine=False stores None for weight and bias;
    # fall back to the identity scale/shift instead of failing on None.
    bn_w = bn.weight if bn.weight is not None else torch.ones_like(
        bn.running_mean)
    bn_b = bn.bias if bn.bias is not None else torch.zeros_like(
        bn.running_mean)

    # per-output-channel multiplier applied by BN in eval mode
    factor = bn_w / torch.sqrt(bn.running_var + bn.eps)
    conv.weight = nn.Parameter(conv_w *
                               factor.reshape([conv.out_channels, 1, 1, 1]))
    conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn_b)
    return conv
def parse_args():
    """Parse the command line of the conv/BN fusing tool.

    Returns:
        argparse.Namespace: Namespace holding the three required positional
        values ``config``, ``checkpoint`` and ``out``.
    """
    positionals = (
        ('config', 'config file path'),
        ('checkpoint', 'checkpoint file path'),
        ('out', 'output path of the converted model'),
    )
    cli = argparse.ArgumentParser(
        description='fuse Conv and BN layers in a model')
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)
    return cli.parse_args()
def parse_args():
    """Parse the command line of the result visualisation tool.

    Returns:
        argparse.Namespace: Namespace with the positional ``config`` and the
        optional ``result`` and ``show_dir`` values (``None`` when omitted).
    """
    cli = argparse.ArgumentParser(description='MMDet3D visualize the results')
    cli.add_argument('config', help='test config file path')
    cli.add_argument('--result', help='results file in pickle format')
    cli.add_argument(
        '--show-dir',
        help='directory where visualize results will be saved')
    return cli.parse_args()
def parse_config(config_strings):
    """Parse config from strings.

    The text is written to a ``.py`` file inside a private temporary
    directory, loaded with ``Config.fromfile`` and then upgraded in place
    from the old VoteNet layout to the current one. The temporary directory
    (and the file in it) is removed before returning, so nothing is left
    behind in the system temp folder.

    Args:
        config_strings (string): strings of model config.

    Returns:
        Config: model config
    """
    # Local import: this module does not import ``os`` at file level.
    import os

    # NOTE(review): assumes Config.fromfile reads the file eagerly, so the
    # file may be deleted once it returns -- confirm for the mmcv in use.
    with tempfile.TemporaryDirectory() as tmp_dir:
        config_path = os.path.join(tmp_dir, 'config.py')
        with open(config_path, 'w') as f:
            f.write(config_strings)
        config = Config.fromfile(config_path)

    # Update backbone config
    if 'pool_mod' in config.model.backbone:
        config.model.backbone.pop('pool_mod')

    if 'sa_cfg' not in config.model.backbone:
        config.model.backbone['sa_cfg'] = dict(
            type='PointSAModule',
            pool_mod='max',
            use_xyz=True,
            normalize_xyz=True)

    if 'type' not in config.model.bbox_head.vote_aggregation_cfg:
        config.model.bbox_head.vote_aggregation_cfg['type'] = 'PointSAModule'

    # Update bbox_head config
    if 'pred_layer_cfg' not in config.model.bbox_head:
        config.model.bbox_head['pred_layer_cfg'] = dict(
            in_channels=128, shared_conv_channels=(128, 128), bias=True)

    if 'feat_channels' in config.model.bbox_head:
        config.model.bbox_head.pop('feat_channels')

    # old configs carry the misspelled key 'vote_moudule_cfg'; rename it
    if 'vote_moudule_cfg' in config.model.bbox_head:
        config.model.bbox_head['vote_module_cfg'] = config.model.bbox_head.pop(
            'vote_moudule_cfg')

    # with use_xyz set, the first MLP width is reduced by 3
    # (presumably the xyz channels are appended by the module; verify)
    if config.model.bbox_head.vote_aggregation_cfg.use_xyz:
        config.model.bbox_head.vote_aggregation_cfg.mlp_channels[0] -= 3

    return config
+ """ + args = parse_args() + checkpoint = torch.load(args.checkpoint) + cfg = parse_config(checkpoint['meta']['config']) + # Build the model and load checkpoint + model = build_detector( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + orig_ckpt = checkpoint['state_dict'] + converted_ckpt = orig_ckpt.copy() + + if cfg['dataset_type'] == 'ScanNetDataset': + NUM_CLASSES = 18 + elif cfg['dataset_type'] == 'SUNRGBDDataset': + NUM_CLASSES = 10 + else: + raise NotImplementedError + + RENAME_PREFIX = { + 'bbox_head.conv_pred.0': 'bbox_head.conv_pred.shared_convs.layer0', + 'bbox_head.conv_pred.1': 'bbox_head.conv_pred.shared_convs.layer1' + } + + DEL_KEYS = [ + 'bbox_head.conv_pred.0.bn.num_batches_tracked', + 'bbox_head.conv_pred.1.bn.num_batches_tracked' + ] + + EXTRACT_KEYS = { + 'bbox_head.conv_pred.conv_cls.weight': + ('bbox_head.conv_pred.conv_out.weight', [(0, 2), (-NUM_CLASSES, -1)]), + 'bbox_head.conv_pred.conv_cls.bias': + ('bbox_head.conv_pred.conv_out.bias', [(0, 2), (-NUM_CLASSES, -1)]), + 'bbox_head.conv_pred.conv_reg.weight': + ('bbox_head.conv_pred.conv_out.weight', [(2, -NUM_CLASSES)]), + 'bbox_head.conv_pred.conv_reg.bias': + ('bbox_head.conv_pred.conv_out.bias', [(2, -NUM_CLASSES)]) + } + + # Delete some useless keys + for key in DEL_KEYS: + converted_ckpt.pop(key) + + # Rename keys with specific prefix + RENAME_KEYS = dict() + for old_key in converted_ckpt.keys(): + for rename_prefix in RENAME_PREFIX.keys(): + if rename_prefix in old_key: + new_key = old_key.replace(rename_prefix, + RENAME_PREFIX[rename_prefix]) + RENAME_KEYS[new_key] = old_key + for new_key, old_key in RENAME_KEYS.items(): + converted_ckpt[new_key] = converted_ckpt.pop(old_key) + + # Extract weights and rename the keys + for new_key, (old_key, indices) in EXTRACT_KEYS.items(): + cur_layers = orig_ckpt[old_key] + converted_layers = [] + for (start, end) in indices: + if end != -1: + converted_layers.append(cur_layers[start:end]) + else: + 
def process_checkpoint(in_file, out_file):
    """Strip a checkpoint for publishing and tag its name with its hash.

    The checkpoint is loaded on CPU, its ``optimizer`` entry (if any) is
    dropped to reduce file size, and the result is saved to ``out_file``.
    The saved file is then renamed to ``<stem>-<sha8>.pth`` where ``<stem>``
    is ``out_file`` without a trailing ``.pth`` suffix and ``<sha8>`` is the
    first 8 hex digits of the file's SHA-256 digest.

    Args:
        in_file (str): Path of the checkpoint to process.
        out_file (str): Path the stripped checkpoint is first written to;
            it no longer exists under this name when the function returns.
    """
    # Local imports: the module only imports argparse/subprocess/torch.
    import hashlib
    import os

    checkpoint = torch.load(in_file, map_location='cpu')
    # remove optimizer for smaller file size
    if 'optimizer' in checkpoint:
        del checkpoint['optimizer']
    # if it is necessary to remove some sensitive data in checkpoint['meta'],
    # add the code here.
    torch.save(checkpoint, out_file)

    # Hash in-process instead of shelling out to `sha256sum`; the hex digest
    # is the same value that tool prints first.
    digest = hashlib.sha256()
    with open(out_file, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    sha = digest.hexdigest()

    # str.rstrip('.pth') removes any trailing '.', 'p', 't', 'h' characters
    # (e.g. 'depth.pth' -> 'de'); cut the literal suffix instead.
    if out_file.endswith('.pth'):
        stem = out_file[:-len('.pth')]
    else:
        stem = out_file
    final_file = stem + '-{}.pth'.format(sha[:8])

    # Rename synchronously so the file is in place when we return.
    os.replace(out_file, final_file)
def convert(src, dst):
    """Convert keys in pycls pretrained RegNet models to mmdet style.

    Reads the ``model_state`` mapping from the checkpoint at ``src``, renames
    each entry through the stem/head/res-layer converters, prints every key
    that none of them handled, and saves ``{'state_dict': ...}`` to ``dst``.

    Args:
        src (str): Path of the pycls checkpoint to read.
        dst (str): Path the converted checkpoint is written to.
    """
    # Load the pycls checkpoint onto CPU so the conversion also works on
    # hosts without the device the checkpoint was saved from.
    regnet_model = torch.load(src, map_location='cpu')
    blobs = regnet_model['model_state']
    # convert to mmdet-style names
    state_dict = OrderedDict()
    converted_names = set()
    for key, weight in blobs.items():
        if 'stem' in key:
            convert_stem(key, weight, state_dict, converted_names)
        elif 'head' in key:
            convert_head(key, weight, state_dict, converted_names)
        elif key.startswith('s'):
            convert_reslayer(key, weight, state_dict, converted_names)

    # check if all layers are converted
    for key in blobs:
        if key not in converted_names:
            print(f'not converted: {key}')
    # save checkpoint
    checkpoint = dict()
    checkpoint['state_dict'] = state_dict
    torch.save(checkpoint, dst)
def parse_args():
    """Parse the command line of the test/eval script.

    Besides building the namespace this also:

    * exports ``LOCAL_RANK`` into ``os.environ`` from the parsed local rank
      when the variable is not already set;
    * maps the deprecated ``--options`` onto ``eval_options`` (with a
      warning).

    The local rank is accepted as either ``--local_rank`` or
    ``--local-rank`` so that launchers using either spelling work.

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both ``--options`` and ``--eval-options`` are given.
    """
    parser = argparse.ArgumentParser(
        description='MMDet test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument('--out', help='output result file in pickle format')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
    parser.add_argument(
        '--format-only',
        action='store_true',
        help='Format the output results without perform evaluation. It is '
        'useful when you want to format the result to a specific format and '
        'submit it to the test server')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
    parser.add_argument('--show', action='store_true', help='show results')
    parser.add_argument(
        '--show-dir', help='directory where results will be saved')
    parser.add_argument(
        '--gpu-collect',
        action='store_true',
        help='whether to use gpu to collect results.')
    parser.add_argument(
        '--tmpdir',
        help='tmp directory used for collecting results from multiple '
        'workers, available when gpu-collect is not specified')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    parser.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function (deprecate), '
        'change to --eval-options instead.')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    # Both spellings store into ``args.local_rank`` (dest comes from the
    # first option string).
    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both specified, '
            '--options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args
+ if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + + # import modules from plguin/xx, registry will be updated + if hasattr(cfg, 'plugin'): + if cfg.plugin: + import importlib + if hasattr(cfg, 'plugin_dir'): + plugin_dir = cfg.plugin_dir + _module_dir = os.path.dirname(plugin_dir) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + else: + # import dir is the dirpath for the config file + _module_dir = os.path.dirname(args.config) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + cfg.model.pretrained = None + # in case the test dataset is concatenated + samples_per_gpu = 1 + if isinstance(cfg.data.test, dict): + cfg.data.test.test_mode = True + samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) + if samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.test.pipeline = replace_ImageToTensor( + cfg.data.test.pipeline) + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + ds_cfg.test_mode = True + samples_per_gpu = max( + [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) + if samples_per_gpu > 1: + for ds_cfg in cfg.data.test: + ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) + + # init distributed env first, since logger depends on the dist info. 
+ if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + + # set random seeds + if args.seed is not None: + set_random_seed(args.seed, deterministic=args.deterministic) + + # build the dataloader + dataset = build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + samples_per_gpu=samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + nonshuffler_sampler=cfg.data.nonshuffler_sampler, + ) + + # build the model and load checkpoint + cfg.model.train_cfg = None + model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') + if args.fuse_conv_bn: + model = fuse_conv_bn(model) + # old versions did not save class info in checkpoints, this walkaround is + # for backward compatibility + if 'CLASSES' in checkpoint.get('meta', {}): + model.CLASSES = checkpoint['meta']['CLASSES'] + else: + model.CLASSES = dataset.CLASSES + # palette for visualization in segmentation tasks + if 'PALETTE' in checkpoint.get('meta', {}): + model.PALETTE = checkpoint['meta']['PALETTE'] + elif hasattr(dataset, 'PALETTE'): + # segmentation dataset has `PALETTE` attribute + model.PALETTE = dataset.PALETTE + + if not distributed: + assert False + # model = MMDataParallel(model, device_ids=[0]) + # outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) + else: + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False) + outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir, + args.gpu_collect) + + rank, _ = get_dist_info() + if rank == 0: + if args.out: + print(f'\nwriting results to {args.out}') + assert False + #mmcv.dump(outputs['bbox_results'], args.out) + kwargs = {} if args.eval_options is None else 
def parse_args():
    """Parse the command line of the training script.

    Side effects: ``LOCAL_RANK`` is exported into ``os.environ`` from the
    parsed local rank unless it is already present, and the deprecated
    ``--options`` value is copied onto ``cfg_options`` (with a warning).

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both ``--options`` and ``--cfg-options`` are given.
    """
    ap = argparse.ArgumentParser(description='Train a detector')
    ap.add_argument('config', help='train config file path')
    ap.add_argument('--work-dir', help='the dir to save logs and models')
    ap.add_argument(
        '--resume-from', help='the checkpoint file to resume from')
    ap.add_argument(
        '--no-validate',
        action='store_true',
        help='whether not to evaluate the checkpoint during training')

    # --gpus and --gpu-ids may not be combined
    gpu_group = ap.add_mutually_exclusive_group()
    gpu_group.add_argument(
        '--gpus',
        type=int,
        help='number of gpus to use '
        '(only applicable to non-distributed training)')
    gpu_group.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='ids of gpus to use '
        '(only applicable to non-distributed training)')

    ap.add_argument('--seed', type=int, default=0, help='random seed')
    ap.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')
    ap.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file (deprecate), '
        'change to --cfg-options instead.')
    ap.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    ap.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    # two spellings of the same value, both stored as ``local_rank``
    ap.add_argument('--local_rank', type=int, default=0)
    ap.add_argument('--local-rank', type=int, default=0)
    ap.add_argument(
        '--autoscale-lr',
        action='store_true',
        help='automatically scale lr with the number of gpus')

    parsed = ap.parse_args()

    # keep an existing LOCAL_RANK, otherwise publish the parsed one
    os.environ.setdefault('LOCAL_RANK', str(parsed.local_rank))

    if parsed.options:
        if parsed.cfg_options:
            raise ValueError(
                '--options and --cfg-options cannot be both specified, '
                '--options is deprecated in favor of --cfg-options')
        warnings.warn('--options is deprecated in favor of --cfg-options')
        parsed.cfg_options = parsed.options

    return parsed
+ m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + from projects.mmdet3d_plugin.bevformer.apis.train import custom_train_model + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + # if args.resume_from is not None: + if args.resume_from is not None and osp.isfile(args.resume_from): + cfg.resume_from = args.resume_from + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids + else: + cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) + if digit_version(TORCH_VERSION) == digit_version('1.8.1') and cfg.optimizer['type'] == 'AdamW': + cfg.optimizer['type'] = 'AdamW2' # fix bug in Adamw + if args.autoscale_lr: + # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) + cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 + + # init distributed env first, since logger depends on the dist info. 
+ if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + # re-set gpu_ids with distributed training mode + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + # dump config + cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + # specify logger name, if we still use 'mmdet', the output info will be + # filtered and won't be saved in the log_file + # TODO: ugly workaround to judge whether we are training det or seg model + if cfg.model.type in ['EncoderDecoder3D']: + logger_name = 'mmseg' + else: + logger_name = 'mmdet' + logger = get_root_logger( + log_file=log_file, log_level=cfg.log_level, name=logger_name) + + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + meta['config'] = cfg.pretty_text + + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + if args.seed is not None: + logger.info(f'Set random seed to {args.seed}, ' + f'deterministic: {args.deterministic}') + set_random_seed(args.seed, deterministic=args.deterministic) + cfg.seed = args.seed + meta['seed'] = args.seed + meta['exp_name'] = osp.basename(args.config) + + model = build_model( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + model.init_weights() + + logger.info(f'Model:\n{model}') + datasets = 
[build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + # in case we use a dataset wrapper + if 'dataset' in cfg.data.train: + val_dataset.pipeline = cfg.data.train.dataset.pipeline + else: + val_dataset.pipeline = cfg.data.train.pipeline + # set test_mode=False here in deep copied config + # which do not affect AP/AR calculation later + # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa + val_dataset.test_mode = False + datasets.append(build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmdet version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmdet_version=mmdet_version, + mmseg_version=mmseg_version, + mmdet3d_version=mmdet3d_version, + config=cfg.pretty_text, + CLASSES=datasets[0].CLASSES, + PALETTE=datasets[0].PALETTE # for segmentors + if hasattr(datasets[0], 'PALETTE') else None) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + custom_train_model( + model, + datasets, + cfg, + distributed=distributed, + validate=(not args.no_validate), + timestamp=timestamp, + meta=meta) + + +if __name__ == '__main__': + main()