> opt_input_shape =
+ // {
+ // {"data_batch_0", {this->rec_batch_num, this->num_seg, 3, 224, 224}}
+ // };
+
+ // config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
+ // opt_input_shape);
+ }
+ }
+ else
+ {
+ config.DisableGpu();
+ if (this->use_mkldnn_)
+ {
+ config.EnableMKLDNN();
+ // cache 10 different shapes for mkldnn to avoid memory leak
+ config.SetMkldnnCacheCapacity(10);
+ }
+ config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_);
+ }
+
+ config.SwitchUseFeedFetchOps(false);
+ // true for multiple input
+ config.SwitchSpecifyInputNames(true);
+
+ config.SwitchIrOptim(true);
+
+ config.EnableMemoryOptim();
+ config.DisableGlogInfo();
+
+ this->predictor_ = CreatePredictor(config);
+ }
+
+} // namespace PaddleVideo
diff --git a/deploy/cpp_infer/tools/build.sh b/deploy/cpp_infer/tools/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c04ede0912abe934613489db43ca490c6b17d51b
--- /dev/null
+++ b/deploy/cpp_infer/tools/build.sh
@@ -0,0 +1,22 @@
+OPENCV_DIR=your_opencv_dir
+LIB_DIR=your_paddle_inference_dir
+CUDA_LIB_DIR=your_cuda_lib_dir
+CUDNN_LIB_DIR=your_cudnn_lib_dir
+TENSORRT_DIR=your_tensorRT_dir
+
+BUILD_DIR=build
+rm -rf ${BUILD_DIR}
+mkdir ${BUILD_DIR}
+cd ${BUILD_DIR}
+cmake .. \
+ -DPADDLE_LIB=${LIB_DIR} \
+ -DWITH_MKL=ON \
+ -DWITH_GPU=OFF \
+ -DWITH_STATIC_LIB=OFF \
+ -DWITH_TENSORRT=OFF \
+ -DOPENCV_DIR=${OPENCV_DIR} \
+ -DCUDNN_LIB=${CUDNN_LIB_DIR} \
+ -DCUDA_LIB=${CUDA_LIB_DIR} \
+    -DTENSORRT_DIR=${TENSORRT_DIR}
+
+make -j
diff --git a/deploy/paddle2onnx/predict_onnx.py b/deploy/paddle2onnx/predict_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..47a223cd45afe17fd30eecf9eb342077a793b421
--- /dev/null
+++ b/deploy/paddle2onnx/predict_onnx.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import sys
+from os import path as osp
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../../tools')))
+
+from utils import build_inference_helper, get_config
+
+
+def parse_args():
+ def str2bool(v):
+ return v.lower() in ("true", "t", "1")
+
+ # general params
+ parser = argparse.ArgumentParser("PaddleVideo Inference model script")
+ parser.add_argument('-c',
+ '--config',
+ type=str,
+ default='configs/example.yaml',
+ help='config file path')
+ parser.add_argument("-i", "--input_file", type=str, help="input file path")
+ parser.add_argument("--onnx_file", type=str, help="onnx model file path")
+
+ # params for onnx predict
+ parser.add_argument("-b", "--batch_size", type=int, default=1)
+ parser.add_argument("--use_gpu",
+ type=str2bool,
+ default=False,
+ help="set to False when using onnx")
+ parser.add_argument("--precision", type=str, default="fp32")
+ parser.add_argument("--ir_optim", type=str2bool, default=True)
+ parser.add_argument("--enable_benchmark",
+ type=str2bool,
+ default=False,
+ help="set to False when using onnx")
+ parser.add_argument("--cpu_threads", type=int, default=4)
+
+ return parser.parse_args()
+
+
+def create_onnx_predictor(args, cfg=None):
+ import onnxruntime as ort
+ onnx_file = args.onnx_file
+ config = ort.SessionOptions()
+ if args.use_gpu:
+ raise ValueError(
+ "onnx inference now only supports cpu! please set `use_gpu` to False."
+ )
+ else:
+ config.intra_op_num_threads = args.cpu_threads
+ if args.ir_optim:
+ config.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+ predictor = ort.InferenceSession(onnx_file, sess_options=config)
+ return config, predictor
+
+
+def parse_file_paths(input_path: str) -> list:
+ if osp.isfile(input_path):
+ files = [
+ input_path,
+ ]
+ else:
+ files = os.listdir(input_path)
+ files = [
+ file for file in files
+ if (file.endswith(".avi") or file.endswith(".mp4"))
+ ]
+ files = [osp.join(input_path, file) for file in files]
+ return files
+
+
+def main():
+ """predict using onnx model
+ """
+ args = parse_args()
+ cfg = get_config(args.config, show=False)
+
+ model_name = cfg.model_name
+
+ print(f"Inference model({model_name})...")
+ InferenceHelper = build_inference_helper(cfg.INFERENCE)
+
+ inference_config, predictor = create_onnx_predictor(args)
+
+ # get input_tensor and output_tensor
+ input_names = predictor.get_inputs()[0].name
+ output_names = predictor.get_outputs()[0].name
+
+ # get the absolute file path(s) to be processed
+ files = parse_file_paths(args.input_file)
+ if args.enable_benchmark:
+ test_video_num = 12
+ num_warmup = 3
+ # instantiate auto log
+ try:
+ import auto_log
+ except ImportError as e:
+ print(f"{e}, [git+https://github.com/LDOUBLEV/AutoLog] "
+ f"package and it's dependencies is required for "
+ f"python-inference when enable_benchmark=True.")
+ pid = os.getpid()
+ autolog = auto_log.AutoLogger(
+ model_name=cfg.model_name,
+ model_precision=args.precision,
+ batch_size=args.batch_size,
+ data_shape="dynamic",
+ save_path="./output/auto_log.lpg",
+ inference_config=inference_config,
+ pids=pid,
+ process_name=None,
+ gpu_ids=None,
+ time_keys=['preprocess_time', 'inference_time', 'postprocess_time'],
+ warmup=num_warmup)
+ files = [args.input_file for _ in range(test_video_num + num_warmup)]
+
+ # Inferencing process
+ batch_num = args.batch_size
+ for st_idx in range(0, len(files), batch_num):
+ ed_idx = min(st_idx + batch_num, len(files))
+
+ # auto log start
+ if args.enable_benchmark:
+ autolog.times.start()
+
+ # Pre process batched input
+ batched_inputs = InferenceHelper.preprocess_batch(files[st_idx:ed_idx])
+
+ # get pre process time cost
+ if args.enable_benchmark:
+ autolog.times.stamp()
+
+ # run inference
+ batched_outputs = predictor.run(
+ output_names=[output_names],
+ input_feed={input_names: batched_inputs[0]})
+
+ # get inference process time cost
+ if args.enable_benchmark:
+ autolog.times.stamp()
+
+ InferenceHelper.postprocess(batched_outputs, not args.enable_benchmark)
+
+ # get post process time cost
+ if args.enable_benchmark:
+ autolog.times.end(stamp=True)
+
+ # time.sleep(0.01) # sleep for T4 GPU
+
+ # report benchmark log if enabled
+ if args.enable_benchmark:
+ autolog.report()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/deploy/paddle2onnx/readme.md b/deploy/paddle2onnx/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..04bb9e77d887f8b0ce4615b8351ab883d42c3b2f
--- /dev/null
+++ b/deploy/paddle2onnx/readme.md
@@ -0,0 +1,70 @@
+# paddle2onnx 模型转化与预测
+
+本章节介绍 PP-TSN 模型如何转化为 ONNX 模型,并基于 ONNX 引擎预测。
+
+## 1. 环境准备
+
+需要准备 Paddle2ONNX 模型转化环境,和 ONNX 模型预测环境。
+
+Paddle2ONNX 支持将 PaddlePaddle 模型格式转化到 ONNX 模型格式,算子目前稳定支持导出 ONNX Opset 9~11,部分Paddle算子支持更低的ONNX Opset转换。
+更多细节可参考 [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_zh.md)
+
+- 安装 Paddle2ONNX
+```bash
+python3.7 -m pip install paddle2onnx
+```
+
+- 安装 ONNXRuntime
+```bash
+# 建议安装 1.9.0 版本,可根据环境更换版本号
+python3.7 -m pip install onnxruntime==1.9.0
+```
+
+## 2. 模型转换
+
+- PP-TSN inference模型下载
+
+ ```bash
+ # 下载inference模型到PaddleVideo/inference/ppTSN/ 目录下
+ mkdir -p ./inference
+ wget -P ./inference/ https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip
+
+ # 解压inference模型
+ pushd ./inference
+ unzip ppTSN.zip
+ popd
+ ```
+
+- 模型转换
+
+ 使用 Paddle2ONNX 将 Paddle inference模型转换为 ONNX 格式模型:
+
+ ```bash
+ paddle2onnx \
+ --model_dir=./inference/ppTSN \
+ --model_filename=ppTSN.pdmodel \
+ --params_filename=ppTSN.pdiparams \
+ --save_file=./inference/ppTSN/ppTSN.onnx \
+ --opset_version=10 \
+ --enable_onnx_checker=True
+ ```
+执行完毕后,可以发现 `./inference/ppTSN` 目录下生成了一个 ONNX 格式的模型文件 `ppTSN.onnx`
+
+## 3. onnx 预测
+
+接下来就可以用 ONNX 格式模型进行预测,其用法与paddle 预测模型类似
+执行如下命令:
+```bash
+python3.7 deploy/paddle2onnx/predict_onnx.py \
+--input_file data/example.avi \
+--config configs/recognition/pptsn/pptsn_k400_videos.yaml \
+--onnx_file=./inference/ppTSN/ppTSN.onnx
+```
+
+结果如下:
+```bash
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9998553991317749
+```
+可以验证该结果与Paddle inference的预测结果完全一致
diff --git a/deploy/paddle2onnx/readme_en.md b/deploy/paddle2onnx/readme_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..6fe67726fbc0ed996480e59eb28d99350254feae
--- /dev/null
+++ b/deploy/paddle2onnx/readme_en.md
@@ -0,0 +1,70 @@
+# paddle2onnx model conversion and prediction
+
+This chapter describes how to convert the PP-TSN model into an ONNX model and run prediction with the ONNX engine.
+
+## 1. Environment preparation
+
+You need to prepare both the Paddle2ONNX model conversion environment and the ONNX model prediction environment.
+
+Paddle2ONNX supports converting PaddlePaddle models to the ONNX format. Operator export is currently stable for ONNX Opset 9~11, and some Paddle operators also support conversion to lower ONNX Opsets.
+For more details, please refer to [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_zh.md)
+
+- Install Paddle2ONNX
+```bash
+python3.7 -m pip install paddle2onnx
+```
+
+- Install ONNXRuntime
+```bash
+# It is recommended to install version 1.9.0, and the version number can be changed according to the environment
+python3.7 -m pip install onnxruntime==1.9.0
+```
+
+## 2. Model conversion
+
+- PP-TSN inference model download
+
+ ```bash
+ # Download the inference model to the PaddleVideo/inference/ppTSN/ directory
+ mkdir -p ./inference
+ wget -P ./inference/ https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip
+
+ # Decompress the inference model
+ pushd ./inference
+ unzip ppTSN.zip
+ popd
+ ```
+
+- Model conversion
+
+ Convert Paddle inference models to ONNX format models using Paddle2ONNX:
+
+ ```bash
+ paddle2onnx \
+ --model_dir=./inference/ppTSN \
+ --model_filename=ppTSN.pdmodel \
+ --params_filename=ppTSN.pdiparams \
+ --save_file=./inference/ppTSN/ppTSN.onnx \
+ --opset_version=10 \
+ --enable_onnx_checker=True
+ ```
+After execution, an ONNX model file `ppTSN.onnx` is generated in the `./inference/ppTSN` directory.
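+
+Optionally, you can sanity-check the exported file with the `onnx` Python package (installed as a dependency of paddle2onnx); the following is a minimal sketch, not part of the official workflow:
+
+```python
+import onnx
+
+# Load the exported model and run the structural checker again
+model = onnx.load("./inference/ppTSN/ppTSN.onnx")
+onnx.checker.check_model(model)
+
+# Print graph inputs/outputs to confirm tensor names before inference
+for t in model.graph.input:
+    print("input :", t.name)
+for t in model.graph.output:
+    print("output:", t.name)
+print("opset:", model.opset_import[0].version)
+```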
+
+## 3. onnx prediction
+
+Next, you can use the ONNX model for prediction; its usage is similar to Paddle inference.
+Execute the following command:
+```bash
+python3.7 deploy/paddle2onnx/predict_onnx.py \
+--input_file data/example.avi \
+--config configs/recognition/pptsn/pptsn_k400_videos.yaml \
+--onnx_file=./inference/ppTSN/ppTSN.onnx
+```
+
+The result is as follows:
+```bash
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9998553991317749
+```
+You can verify that this result is identical to the Paddle inference prediction.
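+
+For a quick standalone check without the helper script, the model can also be driven directly with onnxruntime. The sketch below is illustrative only: the input shape is read from the model, any dynamic dimensions are filled with 1, and the random data stands in for properly preprocessed video frames.
+
+```python
+import numpy as np
+import onnxruntime as ort
+
+sess = ort.InferenceSession("./inference/ppTSN/ppTSN.onnx")
+
+# Read the real input name/shape from the session instead of hard-coding them
+inp = sess.get_inputs()[0]
+print(inp.name, inp.shape)
+
+# Dummy batch just to verify the graph runs end to end; dynamic dims -> 1
+shape = [d if isinstance(d, int) else 1 for d in inp.shape]
+dummy = np.random.rand(*shape).astype(np.float32)
+outputs = sess.run(None, {inp.name: dummy})
+print(outputs[0].shape)
+```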
diff --git a/deploy/slim/quant_post_static.py b/deploy/slim/quant_post_static.py
new file mode 100644
index 0000000000000000000000000000000000000000..84b2803e1c60bb4f674c2cf504b9dde1f50c20d5
--- /dev/null
+++ b/deploy/slim/quant_post_static.py
@@ -0,0 +1,120 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import os.path as osp
+import sys
+
+import numpy as np
+import paddle
+from paddleslim.quant import quant_post_static
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))
+
+from paddlevideo.loader.builder import build_dataloader, build_dataset
+from paddlevideo.utils import get_config, get_logger
+
+
+def parse_args():
+ def str2bool(v):
+ return v.lower() in ("true", "t", "1")
+
+ parser = argparse.ArgumentParser("PaddleVideo Inference model script")
+ parser.add_argument(
+ '-c',
+ '--config',
+ type=str,
+ default=
+ '../../configs/recognition/pptsm/pptsm_k400_frames_uniform_quantization.yaml',
+ help='quantization config file path')
+ parser.add_argument('-o',
+ '--override',
+ action='append',
+ default=[],
+ help='config options to be overridden')
+ parser.add_argument("--use_gpu",
+ type=str2bool,
+ default=True,
+ help="whether use gpui during quantization")
+
+ return parser.parse_args()
+
+
+def post_training_quantization(cfg, use_gpu: bool = True):
+ """Quantization entry
+
+ Args:
+        cfg (dict): quantization configuration.
+ use_gpu (bool, optional): whether to use gpu during quantization. Defaults to True.
+ """
+ logger = get_logger("paddlevideo")
+
+ place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace()
+
+ # get defined params
+ batch_size = cfg.DATASET.get('batch_size', 1)
+ num_workers = cfg.DATASET.get('num_workers', 0)
+ inference_file_name = cfg.get('model_name', 'inference')
+ inference_model_dir = cfg.get('inference_model_dir',
+ f'./inference/{inference_file_name}')
+ quant_output_dir = cfg.get('quant_output_dir',
+ osp.join(inference_model_dir, 'quant_model'))
+ batch_nums = cfg.get('batch_nums', 10)
+
+ # build dataloader for quantization, lite data is enough
+ slim_dataset = build_dataset((cfg.DATASET.quant, cfg.PIPELINE.quant))
+ slim_dataloader_setting = dict(batch_size=batch_size,
+ num_workers=num_workers,
+ places=place,
+ drop_last=False,
+ shuffle=False)
+ slim_loader = build_dataloader(slim_dataset, **slim_dataloader_setting)
+
+ logger.info("Build slim_loader finished")
+
+ def sample_generator(loader):
+ def __reader__():
+ for indx, data in enumerate(loader):
+ # must return np.ndarray, not paddle.Tensor
+ videos = np.array(data[0])
+ yield videos
+
+ return __reader__
+
+ # execute quantization in static graph mode
+ paddle.enable_static()
+
+ exe = paddle.static.Executor(place)
+
+ logger.info("Staring Post-Training Quantization...")
+
+ quant_post_static(executor=exe,
+ model_dir=inference_model_dir,
+ quantize_model_path=quant_output_dir,
+ sample_generator=sample_generator(slim_loader),
+ model_filename=f'{inference_file_name}.pdmodel',
+ params_filename=f'{inference_file_name}.pdiparams',
+ batch_size=batch_size,
+ batch_nums=batch_nums,
+ algo='KL')
+
+ logger.info("Post-Training Quantization finished...")
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ cfg = get_config(args.config, overrides=args.override)
+ post_training_quantization(cfg, args.use_gpu)
diff --git a/deploy/slim/readme.md b/deploy/slim/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..cc9f764f8678b5ab8c69352a18401b324b35a5eb
--- /dev/null
+++ b/deploy/slim/readme.md
@@ -0,0 +1,133 @@
+
+## Slim功能介绍
+复杂的模型有利于提高模型的性能,但也导致模型中存在一定冗余。此部分提供精简模型的功能,包括两部分:模型量化(量化训练、离线量化)、模型剪枝。
+
+其中模型量化将全精度缩减到定点数减少这种冗余,达到减少模型计算复杂度,提高模型推理性能的目的。
+模型量化可以在基本不损失模型的精度的情况下,将FP32精度的模型参数转换为Int8精度,减小模型参数大小并加速计算,使用量化后的模型在移动端等部署时更具备速度优势。
+
+模型剪枝将CNN中不重要的卷积核裁剪掉,减少模型参数量,从而降低模型计算复杂度。
+
+本教程将介绍如何使用飞桨模型压缩库PaddleSlim做PaddleVideo模型的压缩。
+[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) 集成了模型剪枝、量化(包括量化训练和离线量化)、蒸馏和神经网络搜索等多种业界常用且领先的模型压缩功能,如果您感兴趣,可以关注并了解。
+
+在开始本教程之前,建议先了解[PaddleVideo模型的训练方法](../../docs/zh-CN/usage.md)以及[PaddleSlim](https://paddleslim.readthedocs.io/zh_CN/latest/index.html)
+
+
+## 快速开始
+当训练出一个模型后,如果希望进一步的压缩模型大小并加速预测,可使用量化或者剪枝的方法压缩模型。
+
+模型压缩主要包括五个步骤:
+1. 安装 PaddleSlim
+2. 准备训练好的模型
+3. 模型压缩
+4. 导出量化推理模型
+5. 量化模型预测部署
+
+### 1. 安装PaddleSlim
+
+* 可以通过pip install的方式进行安装。
+
+```bash
+python3.7 -m pip install paddleslim -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+
+* 如果获取PaddleSlim的最新特性,可以从源码安装。
+
+```bash
+git clone https://github.com/PaddlePaddle/PaddleSlim.git
+cd PaddleSlim
+python3.7 setup.py install
+```
+
+### 2. 准备训练好的模型
+
+PaddleVideo提供了一系列训练好的[模型](../../docs/zh-CN/model_zoo/README.md),如果待量化的模型不在列表中,需要按照[常规训练](../../docs/zh-CN/usage.md)方法得到训练好的模型。
+
+### 3. 模型压缩
+
+进入PaddleVideo根目录
+
+```bash
+cd PaddleVideo
+```
+
+离线量化代码位于`deploy/slim/quant_post_static.py`。
+
+#### 3.1 模型量化
+
+量化训练包括离线量化训练和在线量化训练(TODO),在线量化训练效果更好,需加载预训练模型,在定义好量化策略后即可对模型进行量化。
+
+##### 3.1.1 在线量化训练
+TODO
+
+##### 3.1.2 离线量化
+
+**注意**:目前离线量化,必须使用已经训练好的模型导出的`inference model`进行量化。一般模型导出`inference model`可参考[教程](../../docs/zh-CN/usage.md#5-模型推理).
+
+一般来说,离线量化损失模型精度较多。
+
+以PP-TSM模型为例,生成`inference model`后,离线量化运行方式如下
+
+```bash
+# 下载并解压出少量数据用于离线量化的校准
+pushd ./data/k400
+wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar
+tar -xf k400_rawframes_small.tar
+popd
+
+# 然后进入deploy/slim目录下
+cd deploy/slim
+
+# 执行离线量化命令
+python3.7 quant_post_static.py \
+-c ../../configs/recognition/pptsm/pptsm_k400_frames_uniform_quantization.yaml \
+--use_gpu=True
+```
+
+除`use_gpu`外,所有的量化环境参数都在`pptsm_k400_frames_uniform_quantization.yaml`文件中进行配置
+其中`inference_model_dir`表示上一步导出的`inference model`目录路径,`quant_output_dir`表示量化模型的输出目录路径
+
+执行成功后,在`quant_output_dir`的目录下生成了`__model__`文件和`__params__`文件,这二者用于存储生成的离线量化模型
+类似`inference model`的使用方法,接下来可以直接用这两个文件进行预测部署,无需再重新导出模型。
+
+```bash
+# 使用PP-TSM离线量化模型进行预测
+# 回到PaddleVideo目录下
+cd ../../
+
+# 使用量化模型进行预测
+python3.7 tools/predict.py \
+--input_file data/example.avi \
+--config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \
+--model_file ./inference/ppTSM/quant_model/__model__ \
+--params_file ./inference/ppTSM/quant_model/__params__ \
+--use_gpu=True \
+--use_tensorrt=False
+```
+
+输出如下:
+```bash
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9997928738594055
+```
+#### 3.2 模型剪枝
+TODO
+
+
+### 4. 导出模型
+TODO
+
+
+### 5. 模型部署
+
+上述步骤导出的模型可以通过PaddleLite的opt模型转换工具完成模型转换。
+模型部署的可参考
+[Serving Python部署](../python_serving/readme.md)
+[Serving C++部署](../cpp_serving/readme.md)
+
+
+## 训练超参数建议
+
+* 量化训练时,建议加载常规训练得到的预训练模型,加速量化训练收敛。
+* 量化训练时,建议初始学习率修改为常规训练的`1/20~1/10`,同时将训练epoch数修改为常规训练的`1/5~1/2`,学习率策略方面,加上Warmup,其他配置信息不建议修改。
diff --git a/deploy/slim/readme_en.md b/deploy/slim/readme_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..c7f6ba20ad2ecc7bc62807b3efc0c09bbe62d6ea
--- /dev/null
+++ b/deploy/slim/readme_en.md
@@ -0,0 +1,132 @@
+## Slim function introduction
+Complex models help improve performance, but they also introduce some redundancy. This section provides model slimming functions in two parts: model quantization (quantization training and offline quantization) and model pruning.
+
+Model quantization reduces full-precision parameters to fixed-point numbers to remove this redundancy, lowering the computational complexity of the model and improving inference performance.
+It can convert FP32 model parameters to Int8 precision with almost no loss of accuracy, reducing the parameter size and speeding up computation, which gives the quantized model a speed advantage when deployed on mobile devices.
+
+Model pruning cuts out the unimportant convolution kernels in the CNN, reduces the amount of model parameters, and thus reduces the computational complexity of the model.
+
+This tutorial will introduce how to use PaddleSlim, a paddle model compression library, to compress PaddleVideo models.
+[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) integrates model pruning, quantization (including quantization training and offline quantization), distillation and neural network search and other commonly used and leading model compression functions in the industry. If you are interested, you can follow and understand.
+
+Before starting this tutorial, it is recommended to read the [PaddleVideo model training guide](../../docs/zh-CN/usage.md) and the [PaddleSlim documentation](https://paddleslim.readthedocs.io/zh_CN/latest/index.html)
+
+
+## Quick start
+After training a model, if you want to further compress the model size and speed up prediction, you can use quantization or pruning to compress the model.
+
+Model compression mainly includes five steps:
+1. Install PaddleSlim
+2. Prepare the trained model
+3. Model Compression
+4. Export the quantized inference model
+5. Deploy the quantized model for prediction
+
+### 1. Install PaddleSlim
+
+* It can be installed by pip install.
+
+```bash
+python3.7 -m pip install paddleslim -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+
+* If you want the latest features of PaddleSlim, you can install it from source.
+
+```bash
+git clone https://github.com/PaddlePaddle/PaddleSlim.git
+cd PaddleSlim
+python3.7 setup.py install
+```
+
+### 2. Prepare the trained model
+
+PaddleVideo provides a series of trained [models](../../docs/zh-CN/model_zoo/README.md). If the model to be quantized is not in the list, you need to follow the [regular training](../../docs/zh-CN/usage.md) method to get a trained model.
+
+### 3. Model Compression
+
+Go to PaddleVideo root directory
+
+```bash
+cd PaddleVideo
+```
+
+The offline quantization code is located in `deploy/slim/quant_post_static.py`.
+
+#### 3.1 Model Quantization
+
+Model quantization includes offline quantization and online quantization training (TODO). Online quantization training gives better results: it requires loading a pre-trained model, and the model can be quantized once the quantization strategy is defined.
+
+##### 3.1.1 Online quantization training
+TODO
+
+##### 3.1.2 Offline Quantization
+
+**Note**: Currently, offline quantization must be performed on an `inference model` exported from a trained model. To export an `inference model`, please refer to the [tutorial](../../docs/zh-CN/usage.md#5-Model Inference).
+
+Generally speaking, offline quantization loses more accuracy than online quantization training.
+
+Taking the PP-TSM model as an example, after generating the `inference model`, the offline quantization operation is as follows
+
+```bash
+# download a small amount of data for calibration
+pushd ./data/k400
+wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar
+tar -xf k400_rawframes_small.tar
+popd
+
+# then switch to deploy/slim
+cd deploy/slim
+
+# execute quantization script
+python3.7 quant_post_static.py \
+-c ../../configs/recognition/pptsm/pptsm_k400_frames_uniform_quantization.yaml \
+--use_gpu=True
+```
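+
+Internally, `deploy/slim/quant_post_static.py` builds a small calibration dataloader from the `DATASET.quant` / `PIPELINE.quant` sections of the yaml and hands it to PaddleSlim. A trimmed sketch of the core call is shown below; the paths, the input shape and the random calibration samples are placeholders — in the real script the samples come from the PaddleVideo dataloader:
+
+```python
+import numpy as np
+import paddle
+from paddleslim.quant import quant_post_static
+
+paddle.enable_static()
+exe = paddle.static.Executor(paddle.CUDAPlace(0))  # or paddle.CPUPlace()
+
+def sample_generator(num_batches=10, shape=(1, 8, 3, 224, 224)):
+    # quant_post_static expects a reader that yields plain numpy arrays
+    def __reader__():
+        for _ in range(num_batches):
+            yield np.random.rand(*shape).astype("float32")
+    return __reader__
+
+quant_post_static(executor=exe,
+                  model_dir="./inference/ppTSM",                  # exported inference model
+                  quantize_model_path="./inference/ppTSM/quant_model",
+                  sample_generator=sample_generator(),
+                  model_filename="ppTSM.pdmodel",
+                  params_filename="ppTSM.pdiparams",
+                  batch_size=1,
+                  batch_nums=10,
+                  algo="KL")
+```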
+
+All quantization parameters except `use_gpu` are configured in the `pptsm_k400_frames_uniform_quantization.yaml` file,
+where `inference_model_dir` is the directory of the `inference model` exported in the previous step, and `quant_output_dir` is the output directory of the quantized model.
+
+After successful execution, a `__model__` file and a `__params__` file are generated in the `quant_output_dir` directory; together they store the offline-quantized model.
+Similar to an `inference model`, these two files can be used directly for prediction deployment without re-exporting the model.
+
+```bash
+# Use PP-TSM offline quantization model for prediction
+# Go back to the PaddleVideo directory
+cd ../../
+
+# Use the quantized model to make predictions
+python3.7 tools/predict.py \
+--input_file data/example.avi \
+--config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \
+--model_file ./inference/ppTSM/quant_model/__model__ \
+--params_file ./inference/ppTSM/quant_model/__params__ \
+--use_gpu=True \
+--use_tensorrt=False
+```
+
+The output is as follows:
+```bash
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9997928738594055
+```
+#### 3.2 Model pruning
+TODO
+
+
+### 4. Export the model
+TODO
+
+
+### 5. Model Deployment
+
+The model exported in the above steps can be converted with PaddleLite's opt model conversion tool.
+For model deployment, refer to
+[Serving Python Deployment](../python_serving/readme.md)
+[Serving C++ Deployment](../cpp_serving/readme.md)
+
+
+## Training hyperparameter suggestions
+
+* For online quantization training, it is recommended to load the pre-trained model obtained from regular training to speed up convergence.
+* For online quantization training, it is recommended to reduce the initial learning rate to `1/20~1/10` of that used in regular training, and the number of training epochs to `1/5~1/2`; for the learning rate schedule, add warmup. Other configuration items are best left unchanged.
diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..353a649f5d2bb4f8d23d834288dc7b08046a62df
--- /dev/null
+++ b/docs/CONTRIBUTING.md
@@ -0,0 +1,3 @@
+# Contribution Guidelines
+
+We appreciate all contributions. If you are planning to contribute bug fixes or documentation fixes, please do so without any further discussion. If you plan to contribute new features, utility functions or extensions, please first open an issue and discuss the feature with us.
diff --git a/docs/en/benchmark.md b/docs/en/benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..bc41dfe12c2ee3b6f6383dd7782274860efd7cb1
--- /dev/null
+++ b/docs/en/benchmark.md
@@ -0,0 +1,69 @@
+[简体中文](../zh-CN/benchmark.md) | English
+# Benchmark
+
+We compare our results with some popular frameworks and official releases in terms of speed.
+
+## Environment
+
+### Hardware
+
+- 8 NVIDIA Tesla V100 (16G) GPUs
+- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+
+### Software
+
+- Python 3.7
+- PaddlePaddle2.0
+- CUDA 10.1
+- CUDNN 7.6.3
+- NCCL 2.1.15
+- GCC 8.2.0
+
+## Experiments and Statistics
+The statistics report the average training time, including data processing and model computation, and training speed is measured in ips (instances per second). Note that we skip the first 50 iterations as they may contain device warmup time.
+
+Here we compare PaddleVideo with the other video understanding toolkits in the same data and model settings.
+
+To ensure a fair comparison, all experiments were conducted in the same hardware environment and on the same dataset. The dataset is generated following the [data preparation](dataset/k400.md) guide, and in each model setting the same data preprocessing is applied to guarantee identical feature input.
+
+A significant improvement can be observed when comparing with other video understanding frameworks, as shown in the table below. In particular, the [Slowfast](../../configs/recognition/slowfast/slowfast.yaml) model is nearly 2x faster than its counterparts.
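+
+For reference, the ips numbers follow the convention above; the bookkeeping is roughly the following sketch (the dummy step below is a placeholder for a real training iteration):
+
+```python
+import time
+
+def measure_ips(run_one_iter, batch_size, total_iters=500, skip_iters=50):
+    """Average instances-per-second, skipping warmup iterations."""
+    times = []
+    for _ in range(total_iters):
+        start = time.time()
+        run_one_iter()            # one step: data loading + forward/backward
+        times.append(time.time() - start)
+    useful = times[skip_iters:]   # drop the first iters containing device warmup
+    return batch_size / (sum(useful) / len(useful))
+
+print(measure_ips(lambda: time.sleep(0.01), batch_size=16))
+```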
+
+
+
+## Results
+### Recognizers
+
+| Model | batch size x gpus | PaddleVideo(ips) | Reference(ips) | MMAction2 (ips) | PySlowFast (ips)|
+| :------: | :-------------------:|:---------------:|:---------------: | :---------------: |:---------------: |
+| [TSM](../../configs/recognition/tsm/tsm.yaml) | 16x8 | 58.1 | 46.04(temporal-shift-module) | To do | X |
+| [PPTSM](../../configs/recognition/tsm/pptsm.yaml) | 16x8 | 57.6 | X | X | X |
+| [TSN](../../configs/recognition/tsn/tsn.yaml) | 16x8 | 841.1 | To do (tsn-pytorch) | To do | X |
+| [Slowfast](../../configs/recognition/slowfast/slowfast.yaml)| 16x8 | 99.5 | X | To do | 43.2 |
+| [Attention_LSTM](../../configs/recognition/attention_lstm/attention_lstm.yaml) | 128x8 | 112.6 | X | X | X |
+
+
+### Localizers
+
+| Model | PaddleVideo(ips) |MMAction2 (ips) |BMN(boundary matching network) (ips)|
+| :--- | :---------------: | :-------------------------------------: | :-------------------------------------: |
+| [BMN](../../configs/localization/bmn.yaml) | 43.84 | x | x |
+
+
+### Segmenters
+
+This repo provides a performance and accuracy comparison between classical and popular temporal action segmentation models
+
+| Model | Metrics | Value | Flops(M) |Params(M) | test time(ms) bs=1 | test time(ms) bs=2 | inference time(ms) bs=1 | inference time(ms) bs=2 |
+| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+| MS-TCN | F1@0.5 | 38.8% | 791.360 | 0.8 | 170 | - | 10.68 | - |
+| ASRF | F1@0.5 | 55.7% | 1,283.328 | 1.3 | 190 | - | 16.34 | - |
+
+* Model: model name, for example PP-TSM
+* Metrics: the metric used in the model test; the dataset used is **breakfast**
+* Value: the value of the metric, generally kept to two decimal places
+* Flops(M): the floating-point computation required for one forward pass of the model, which can be computed with the `paddlevideo/tools/summary.py` script (different models may need slight modifications), kept to one decimal place and measured with an **input tensor of shape (1, 2048, 1000)**
+* Params(M): the number of model parameters, computed by the same script together with Flops, kept to one decimal place
+* test time(ms) bs=1: the time required per sample when the Python script runs the test with batch size 1, kept to two decimal places. The dataset used in the test is **breakfast**.
+* test time(ms) bs=2: the time required per sample when the Python script runs the test with batch size 2, kept to two decimal places. Temporal action segmentation models are generally fully convolutional networks, so the batch size for training, testing and inference is 1 and this column is left empty. The dataset used in the test is **breakfast**.
+* inference time(ms) bs=1: the time required per sample when the inference model is tested on GPU (V100 by default) with batch size 1, kept to two decimal places. The dataset used for inference is **breakfast**.
+* inference time(ms) bs=2: the time required per sample when the inference model is tested on GPU (V100 by default) with batch size 2, kept to two decimal places. Temporal action segmentation models are generally fully convolutional networks, so the batch size for training, testing and inference is 1 and this column is left empty. The dataset used for inference is **breakfast**.
diff --git a/docs/en/dataset/AVA.md b/docs/en/dataset/AVA.md
new file mode 100644
index 0000000000000000000000000000000000000000..ee95a57755cba7261cb9e34e665c27d6c1e37845
--- /dev/null
+++ b/docs/en/dataset/AVA.md
@@ -0,0 +1,113 @@
+[简体中文](../../zh-CN/dataset/k400.md) | English
+# AVA Data Preparation
+This document mainly introduces the preparation process of the AVA dataset.
+It includes five parts: video data download, annotation preparation, video cutting,
+RGB frame extraction, and proposal file fetching.
+Before we start, please make sure that the working directory is `$PaddleVideo/data/ava/script`.
+
+
+---
+
+
+## 1. Video data Download
+For basic dataset information, you can refer to the official website [AVA](https://research.google.com/ava/index.html).
+For the dataset download, you can refer to [AVA Download](https://github.com/cvdfoundation/ava-dataset),
+which introduces how to download the dataset. We also provide a shell script for downloading the video files:
+
+```shell
+bash download_videos.sh
+```
+
+Furthermore, considering the difficulty of downloading, we will also upload the video files to a Baidu cloud disk as zip packages, so users can download them as needed.
+[Link]() coming soon.
+
+
+**Note: the video files should be placed in `data/ava/videos`**
+
+---
+## 2.Prepare Annotations
+
+Next, you can run the following script to prepare annotations.
+
+```shell
+bash download_annotations.sh
+```
+
+This command will download `ava_v2.1.zip` for AVA `v2.1` annotation. If you need the AVA `v2.2` annotation, you can try the following script.
+
+```shell
+VERSION=2.2 bash download_annotations.sh
+```
+
+**Note: we will also provide the annotation zip files on Baidu cloud disk.**
+
+---
+## 3. Cut Video Files
+
+Cut each video from its 15th to its 30th minute and convert it to 30 fps.
+
+```shell
+bash cut_videos.sh
+```
+---
+
+## 4. Extract RGB Frames
+
+You can use ffmpeg to extract RGB frames with the following script.
+
+```shell
+bash extract_rgb_frames.sh
+```
+
+---
+
+## 5.Pulling Proposal Files
+
+The scripts are adapted from FAIR's [Long-Term Feature Banks](https://github.com/facebookresearch/video-long-term-feature-banks).
+
+Run the following script to fetch the pre-computed proposal lists.
+
+```shell
+bash fetch_ava_proposals.sh
+```
+
+---
+## 6.Folder Structure
+
+After finishing the whole AVA preparation pipeline,
+you will have the raw frames (RGB), videos and annotation files for AVA.
+
+In the context of the whole project (for AVA only), the folder structure will look like:
+
+```
+PaddleVideo
+├── configs
+├── paddlevideo
+├── docs
+├── tools
+├── data
+│ ├── ava
+│ │ ├── annotations
+│ │ | ├── ava_dense_proposals_train.FAIR.recall_93.9.pkl
+│ │ | ├── ava_dense_proposals_val.FAIR.recall_93.9.pkl
+│ │ | ├── ava_dense_proposals_test.FAIR.recall_93.9.pkl
+│ │ | ├── ava_train_v2.1.csv
+│ │ | ├── ava_val_v2.1.csv
+│ │ | ├── ava_train_excluded_timestamps_v2.1.csv
+│ │ | ├── ava_val_excluded_timestamps_v2.1.csv
+│ │ | ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt
+│ │ ├── videos
+│ │ │ ├── 053oq2xB3oU.mkv
+│ │ │ ├── 0f39OWEqJ24.mp4
+│ │ │ ├── ...
+│ │ ├── videos_15min
+│ │ │ ├── 053oq2xB3oU.mkv
+│ │ │ ├── 0f39OWEqJ24.mp4
+│ │ │ ├── ...
+│ │ ├── rawframes
+│ │ │ ├── 053oq2xB3oU
+| │ │ │ ├── img_00001.jpg
+| │ │ │ ├── img_00002.jpg
+| │ │ │ ├── ...
+```
diff --git a/docs/en/dataset/ActivityNet.md b/docs/en/dataset/ActivityNet.md
new file mode 100644
index 0000000000000000000000000000000000000000..006a93670f76d533e91732e5c1d2b4cb15a56efb
--- /dev/null
+++ b/docs/en/dataset/ActivityNet.md
@@ -0,0 +1,80 @@
+[简体中文](../../zh-CN/dataset/ActivityNet.md) | English
+
+# ActivityNet data preparation
+
+- [Introduction](#Introduction)
+- [Download](#Download)
+
+## Introduction
+
+ActivityNet is a dataset for large-scale video understanding tasks, which can be used for tasks such as action localization, action recognition, etc.
+
+
+## Download
+1. The BMN model uses the processed ActivityNet 1.3 dataset. There are two ways to use it:
+ - Using our processed ActivityNet 1.3 dataset (compressed package is about 5.5G), each video has corresponding action labels, duration intervals, duration frames, duration seconds and other information
+ Download with the following command:
+ ```bash
+ wget https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz # Download the processed video feature data
+ wget https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json # Download the processed label data
+ ```
+
+ Or click the following hyperlinks to download:
+
+ [Video feature data](https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz)
+    [Label data](https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json)
+
+    Then decompress `bmn_feat.tar.gz`:
+ ```bash
+ tar -xf bmn_feat.tar.gz
+ ```
+
+ - Extract features by yourself
+
+    First refer to the [Download Instructions](https://github.com/activitynet/ActivityNet/tree/master/Crawler) to download the original dataset. When training this model, you need to use TSN to extract features from the source files first. You can [extract](https://github.com/yjxiong/temporal-segment-networks) video frames and optical flow information yourself, and the pre-trained TSN model can be downloaded from [here](https://github.com/yjxiong/anet2016-cuhk).
+
+
+    The information in the `activitynet_1.3_annotations.json` label file is as follows:
+ ```json
+ {
+ "v_QOlSCBRmfWY": {
+ "duration_second": 82.73,
+ "subset": "training",
+ "duration_frame": 2067,
+ "annotations": [{
+ "segment": [6.195294851794072, 77.73085420904837],
+ "label": "Ballet"
+ }],
+ "feature_frame": 2064
+ },
+ "v_ehGHCYKzyZ8": {
+ "duration_second": 61.7189999999999994,
+ "subset": "training",
+ "duration_frame": 1822,
+ "annotations": [{
+ "segment": [43.95990729267573, 45.401932082395355],
+ "label": "Doing crunches"
+ }],
+ "feature_frame": 1808
+ },
+ ...,
+ ...
+ }
+ ```
+
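+    A short sketch for inspecting this label file (the path is a placeholder; point it at the downloaded `activitynet_1.3_annotations.json`):
+
+    ```python
+    import json
+
+    with open("activitynet_1.3_annotations.json") as f:
+        annos = json.load(f)
+
+    print(len(annos), "videos")  # expected: 19228 entries
+    for name, info in list(annos.items())[:3]:
+        print(name, info["subset"], info["duration_second"])
+        for seg in info["annotations"]:
+            print("  ", seg["label"], seg["segment"])
+    ```
+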
+ In the end, `19228` video feature npy files are obtained, corresponding to the `19228` label information in the `activitynet_1.3_annotations.json` file.
+
+2. Create a new `data/bmn_data` folder, unzip the downloaded video feature data into it, and finally organize it into the following form:
+ ```
+ PaddleVideo
+ ├── data
+ │ ├── bmn_data
+ │ │ ├── fix_feat_100
+ │ │ │ ├── v___c8enCfzqw.npy
+ │ │ │ ├── v___dXUJsj3yo.npy
+ │ │ │ ├── ...
+ │ │ │
+ │ │ └── activitynet_1.3_annotations.json
+ ```
+
+3. Finally, modify the `feat_path` field in the configuration file configs/localization/bmn.yaml to specify the feature directory path, and the `file_path` field to specify the label file path.
diff --git a/docs/en/dataset/Oxford_RobotCar.md b/docs/en/dataset/Oxford_RobotCar.md
new file mode 100644
index 0000000000000000000000000000000000000000..c02b54a0132f08b923c1c7ec5093ae225e75d15d
--- /dev/null
+++ b/docs/en/dataset/Oxford_RobotCar.md
@@ -0,0 +1,162 @@
+[简体中文](../../zh-CN/dataset/Oxford_RobotCar.md) | English
+
+# Oxford-RobotCar-for-ADDS data preparation
+
+- [Introduction](#Introduction)
+- [Data Set Download](#Download)
+- [Preprocessing](#Preprocessing)
+- [1. Image De-distortion](#1-Image-de-distortion)
+- [2. Dynamic frame filter](#2-Dynamic-frame-filter)
+- [3. Image Rename](#3-Image-Rename)
+- [4. Preparation for Day-Pseudo Night Image Pair](#4-Day-Pseudo-Night-Image-Pair-Preparation)
+
+
+## Introduction
+
+[Oxford RobotCar Dataset](https://robotcar-dataset.robots.ox.ac.uk/) is a large-scale autonomous driving data set that contains a large amount of data in different autonomous driving scenarios.
+
+The data used here is a subset of the original Oxford RobotCar dataset filtered for day-night depth estimation, namely Oxford-RobotCar-for-ADDS.
+
+If you want to use Oxford-RobotCar-for-ADDS, please cite the following papers:
+```latex
+@article{maddern20171,
+ title={1 year, 1000 km: The oxford robotcar dataset},
+ author={Maddern, Will and Pascoe, Geoffrey and Linegar, Chris and Newman, Paul},
+ journal={The International Journal of Robotics Research},
+ volume={36},
+ number={1},
+ pages={3--15},
+ year={2017},
+ publisher={SAGE Publications Sage UK: London, England}
+}
+```
+```latex
+@inproceedings{liu2021self,
+ title={Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation},
+ author={Liu, Lina and Song, Xibin and Wang, Mengmeng and Liu, Yong and Zhang, Liangjun},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={12737--12746},
+ year={2021}
+}
+```
+
+## Download
+
+1. Download the left-eye images of Bumblebee XB3 in the sequence [2014-12-09](https://robotcar-dataset.robots.ox.ac.uk/datasets/2014-12-09-13-21-02/) as the training set for the daytime scenes, and decompress the downloaded images into a single folder.
+2. Download the left-eye images of Bumblebee XB3 in the sequence [2014-12-16](https://robotcar-dataset.robots.ox.ac.uk/datasets/2014-12-16-18-44-24/) as the training set for the night scenes, and decompress the downloaded images into a single folder.
+3. The images and depth ground truth of the validation set are filtered from the original dataset and can be downloaded from the links we provide below.
+ ```shell
+ https://videotag.bj.bcebos.com/Data/ADDS/1209_all_files.txt
+ https://videotag.bj.bcebos.com/Data/ADDS/1216_all_files.txt
+ https://videotag.bj.bcebos.com/Data/ADDS/day_train_all.7z.001
+ https://videotag.bj.bcebos.com/Data/ADDS/day_train_all.7z.002
+ https://videotag.bj.bcebos.com/Data/ADDS/day_train_all_fake_night.7z.001
+ https://videotag.bj.bcebos.com/Data/ADDS/day_train_all_fake_night.7z.002
+ https://videotag.bj.bcebos.com/Data/ADDS/day_val_451.7z
+ https://videotag.bj.bcebos.com/Data/ADDS/day_val_451_gt.7z
+ https://videotag.bj.bcebos.com/Data/ADDS/night_val_411.7z
+ https://videotag.bj.bcebos.com/Data/ADDS/night_val_411_gt.7z
+ ```
+ the original raw data download links:
+ ```shell
+ # data in day
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.001
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.002
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.003
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.004
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.005
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.006
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.007
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.008
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.009
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.010
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.011
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.012
+
+ # data in night
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.001
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.002
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.003
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.004
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.005
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.006
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.007
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.008
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.009
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.010
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.011
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.012
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.013
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.014
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.015
+ ```
+
+## Preprocessing
+
+### 1-Image-de-distortion
+
+Use the official toolbox [robotcar-dataset-sdk](https://github.com/ori-mrg/robotcar-dataset-sdk/tree/master/python) to de-distort the images of the sequences 2014-12-09 and 2014-12-16.
+
+
+### 2-Dynamic-frame-filter
+
+Since we use a self-supervised method, we need to select dynamic frames for training: a frame is considered dynamic if its inter-frame pose change is greater than 0.1 m. After filtering, the training set sequences are obtained.
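+
+As an illustration of this criterion only, a tiny sketch (poses are assumed to be given as an N×3 array of translations in meters):
+
+```python
+import numpy as np
+
+def select_dynamic_frames(translations, threshold=0.1):
+    """Return indices whose translation w.r.t. the previous frame exceeds the threshold."""
+    deltas = np.linalg.norm(np.diff(translations, axis=0), axis=1)
+    return [i + 1 for i, d in enumerate(deltas) if d > threshold]
+
+# toy example: a near-stationary frame followed by a moving one
+poses = np.array([[0.0, 0.0, 0.0], [0.02, 0.0, 0.0], [0.5, 0.0, 0.0]])
+print(select_dynamic_frames(poses))  # -> [2]
+```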
+
+
+### 3-Image-Rename
+
+Rename the original image timestamp to a continuous number sequence. For daytime scene correspondence, see [1209_all_files.txt](https://videotag.bj.bcebos.com/Data/ADDS/1209_all_files.txt), for night scene correspondence, see [1216_all_files.txt](https://videotag.bj.bcebos.com/Data/ADDS/1216_all_files.txt). The renamed data format is as follows:
+```
+├── oxford_processing
+ ├── day_train_all #Day training image folder (day_train_all.7z.001 ~ day_train_all.7z.012)
+ ├── night_train_all #Night training image folder (night_train_all.7z.001 ~ day_train_all.7z.015)
+ ├── day_val_451 #Daytime verification image folder (day_val_451.7z)
+ ├── day_val_451_gt #Daytime verification depth truth value folder (day_val_451_gt.7z)
+ ├── night_val_411 #night verification image folder (night_val_411.7z)
+ └── night_val_411_gt #Night verification depth truth value folder (night_val_411_gt.7z)
+```
+
+annotation files download links are below:
+```shell
+https://videotag.bj.bcebos.com/Data/ADDS/train_files.txt
+https://videotag.bj.bcebos.com/Data/ADDS/val_day_files.txt
+https://videotag.bj.bcebos.com/Data/ADDS/val_night_files.txt
+```
+
+The sequence used for training and verification is as follows:
+
+```
+splits/oxford_day/train_files.txt # training sequence during the day
+splits/oxford_night/train_files.txt # training sequence at night
+splits/oxford_day_451/val_files.txt # verification sequence during the day
+splits/oxford_night_411/val_files.txt # night verification sequence
+```
+
+### 4-Day-Pseudo-Night-Image-Pair-Preparation
+
+In order to use our framework to extract information common to day and night images, we use [CycleGAN](https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix) to generate day/pseudo-night image pairs, where the pseudo-night images are the night-style images generated by CycleGAN from the daytime images. All images are scaled to 192x640, the night images are enhanced with histogram equalization, and CycleGAN is trained for 75 epochs to finally obtain Oxford-RobotCar-for-ADDS. The generated day/pseudo-night image pairs are organized as follows and can be used directly for training and validation of ADDS-DepthNet:
+```
+├── oxford_processing_forADDS
+ ├── day_train_all #Day training image folder (day_train_all.7z.001 ~ day_train_all.7z.002)
+ ├── night_train_all #Night training image folder (night_train_all.7z.001 ~ day_train_all.7z.002)
+ ├── day_val_451 #Daytime verification image folder (day_val_451.7z)
+ ├── day_val_451_gt #Daytime verification depth truth value folder (day_val_451_gt.7z)
+ ├── night_val_411 #night verification image folder (night_val_411.7z)
+ └── night_val_411_gt #Night verification depth truth value folder (night_val_411_gt.7z)
+data
+└── oxford
+ ├── splits
+ ├── train_files.txt
+ ├── val_day_files.txt
+ └── val_night_files.txt
+ └── oxford_processing_forADDS
+ ├── day_train_all/ #Day training image folder (from day_train_all.7z.001 ~ day_train_all.7z.002)
+ ├── night_train_all/ #Night training image folder (from night_train_all.7z.001 ~ day_train_all.7z.002)
+ ├── day_val_451/ #Daytime verification image folder (from day_val_451.7z)
+ ├── day_val_451_gt/ #Daytime verification depth truth value folder (from day_val_451_gt.7z)
+ ├── night_val_411/ #night verification image folder (from night_val_411.7z)
+ └── night_val_411_gt/ #Night verification depth truth value folder (from night_val_411_gt.7z)
+
+```
+
+The sequences used for training and verification are consistent with the foregoing.
diff --git a/docs/en/dataset/SegmentationDataset.md b/docs/en/dataset/SegmentationDataset.md
new file mode 100644
index 0000000000000000000000000000000000000000..3f67fb2725c7557525e2944ac54a13b749b6bdb1
--- /dev/null
+++ b/docs/en/dataset/SegmentationDataset.md
@@ -0,0 +1,35 @@
+English | [简体中文](../../zh-CN/dataset/SegmentationDataset.md)
+
+# Video Action Segmentation Dataset
+
+The video action segmentation models use the breakfast, 50salads and gtea datasets. They take as input the features extracted by a pre-trained model, which can be obtained from the MS-TCN official code base: [feat](https://zenodo.org/record/3625992#.Xiv9jGhKhPY)
+
+- Dataset tree
+```txt
+─── gtea
+ ├── features
+ │ ├── S1_Cheese_C1.npy
+ │ ├── S1_Coffee_C1.npy
+ │ ├── S1_CofHoney_C1.npy
+ │ └── ...
+ ├── groundTruth
+ │ ├── S1_Cheese_C1.txt
+ │ ├── S1_Coffee_C1.txt
+ │ ├── S1_CofHoney_C1.txt
+ │ └── ...
+ ├── splits
+ │ ├── test.split1.bundle
+ │ ├── test.split2.bundle
+ │ ├── test.split3.bundle
+ │ └── ...
+ └── mapping.txt
+```
+
+- data tree
+```txt
+─── data
+ ├── 50salads
+ ├── breakfast
+ ├── gtea
+ └── ...
+```
diff --git a/docs/en/dataset/fsd.md b/docs/en/dataset/fsd.md
new file mode 100644
index 0000000000000000000000000000000000000000..2f16a485ad0adb0d2b020de0a75ce12805c4db08
--- /dev/null
+++ b/docs/en/dataset/fsd.md
@@ -0,0 +1,55 @@
+[简体中文](../../zh-CN/dataset/fsd.md) | English
+
+# Figure Skating Dataset
+
+- [Introduction](#Introduction)
+- [Download](#Download)
+
+---
+
+
+## Introduction
+
+Compared with other sports, human posture and trajectory in figure skating are highly complex, which makes it useful for research on fine-grained action recognition tasks.
+
+For the FSD dataset, all video materials are collected from the Figure Skating Championships in 2017-2018. The frame rate of the videos is uniformly standardized to 30 frames per second, and the image size is 1080x720 to ensure the relative consistency of the dataset. After that, we use the 2D pose estimation algorithm OpenPose to extract key points frame by frame from the videos, and finally save the data in `.npy` format.
+
+The directory structure of training dataset and test dataset is as follows:
+
+```txt
+train_data.npy # 2922
+train_label.npy # 2922
+test_A_data.npy # 628
+test_B_data.npy # 634
+```
+
+`train_label.npy` can be read using `np.load()`; each element is an integer in the range 0-29, representing the label of the action. `data.npy` can be read using `np.load()` and returns an array with shape `N×C×T×V×M`; the specific meaning of each dimension is as follows:
+
+| Dimension | Size | Meaning | Notes |
+| :---- | :----: | :----: | :---- |
+| N | N | Number of samples | - |
+| C | 3 | The coordinates and confidence of each joint point respectively | rescale to -1~1 |
+| T | 1500 | The duration of the action | The actual length of some actions may be less than 1500, in such case we will pad 0 to ensure the unity of T dimension. |
+| V | 25 | Number of joint points | See the skeleton example below for the meaning of specific joint points. |
+| M | 1 | Number of athletes | - |
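+
+A minimal loading sketch for the layout above (paths are relative to wherever the `.npy` files were downloaded):
+
+```python
+import numpy as np
+
+data = np.load("train_data.npy")     # shape (N, C, T, V, M) = (2922, 3, 1500, 25, 1)
+label = np.load("train_label.npy")   # N integer labels in the range 0-29
+
+print(data.shape, label.shape)
+print("first sample label:", int(label[0]))
+```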
+
+
+skeleton example:
+
+
+

+
+
+
+
+## Download
+
+You can get the download link after registering on the [competition homepage](https://www.datafountain.cn/competitions/519).
+
+| Set | Data | Label |
+| :---- | :----: | :----: |
+| Train | [train_data.npy](https://videotag.bj.bcebos.com/Data/FSD_train_data.npy) | [train_label.npy](https://videotag.bj.bcebos.com/Data/FSD_train_label.npy) |
+| TestA | coming soon | coming soon |
+
+
+> The RGB dataset will not be provided for copyright reasons.
diff --git a/docs/en/dataset/k400.md b/docs/en/dataset/k400.md
new file mode 100644
index 0000000000000000000000000000000000000000..539735513fec907f34b23fcb6d72ca9fbf023e89
--- /dev/null
+++ b/docs/en/dataset/k400.md
@@ -0,0 +1,78 @@
+[简体中文](../../zh-CN/dataset/k400.md) | English
+
+# Kinetics-400 Preparation
+
+- [Introduction](#Introduction)
+- [Download](#Download)
+- [Frames](#Frames)
+
+---
+
+
+## Introduction
+
+Kinetics-400 is a commonly used benchmark dataset in the video field. Please refer to its official website [Kinetics](https://deepmind.com/research/open-source/kinetics) for details. You can refer to the official address [ActivityNet](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics), and use the download script provided to download the dataset.
+
+## Download
+
+Considering the difficulty of downloading the K400 data set, we provide two download methods: (1) Baidu network disk download (2) Script download
+
+### Baidu Netdisk Download
+
+Netdisk link: https://pan.baidu.com/s/1S_CGBjWOUAuxL_cCX5kMPg
+Extraction code: `ppvi`
+
+### Script download
+
+- Download the training set link list file [train_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list) and the validation set link list file [val_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list).
+
+Write the download script `download.sh` as follows:
+
+```bash
+file=$1
+
+while read line
+do
+ wget "$line"
+done <$file
+```
+
+Download training set command:
+```bash
+bash download.sh train_link.list
+```
+
+Download validation set command:
+```bash
+bash download.sh val_link.list
+```
+
+---
+
+|category | Number of data | list file |
+| :------: | :----------: | :----: |
+|Training set | 234619 | [train.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list)|
+|Validation set | 19761 | [val.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list)|
+
+- After downloading, unzip the videos and add the data path to the list file.
+
+- Due to some broken video links, part of the original data is missing. This copy needs about 135GB of storage space.
+
+> This copy is only for academic research. If it is helpful to you, welcome to star [our project](https://github.com/PaddlePaddle/PaddleVideo)
+
+
+## Frames
+In order to speed up network training, we first extract frames from the video files (K400 video files are in mp4 format). Compared with training directly from video files, training from frames greatly speeds up the process.
+
+Enter the following command to extract the frames of the K400 video file
+
+```bash
+python extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4
+```
+
+After the video file frames are extracted, they will be stored in the specified `./rawframes` path, and the size is about 2T.
+
+|category | Number of data | list file |
+| :------: | :----------: | :----: |
+|Training set | 234619 | [train_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list)|
+|Validation set | 19761 | [val_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list)|
diff --git a/docs/en/dataset/msrvtt.md b/docs/en/dataset/msrvtt.md
new file mode 100644
index 0000000000000000000000000000000000000000..390ba9d0090126e50c89584eebbe5a3352bfaa23
--- /dev/null
+++ b/docs/en/dataset/msrvtt.md
@@ -0,0 +1,79 @@
+[简体中文](../../zh-CN/dataset/msrvtt.md) | English
+
+# MSR-VTT Preparation
+
+- [Introduction](#1.1)
+- [Download for T2VLAD](#1.2)
+- [Download for ActBERT](#1.3)
+- [Reference](#1.4)
+
+
+
+## Introduction
+
+MSR-VTT(Microsoft Research Video to Text) is a large-scale dataset containing videos and subtitles, which is composed of 10000 video clips from 20 categories, and each video clip is annotated with 20 English sentences. We used 9000 video clips for training and 1000 for testing. For more details, please refer to the website: [MSRVTT](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/)
+
+
+## Download for T2VLAD
+
+[T2VLAD doc](../../../applications/T2VLAD/README_en.md)
+
+For ease of use, we provide pre-extracted video features.
+
+First, make sure to enter the following command in the `applications/T2VLAD/data` directory to download the dataset.
+
+```bash
+bash download_features.sh
+```
+
+After downloading, the files in the data directory are organized as follows:
+
+```
+├── data
+| ├── MSR-VTT
+| │ ├── raw-captions.pkl
+| │ ├── train_list_jsfusion.txt
+| │ ├── val_list_jsfusion.txt
+| │ ├── aggregated_text_feats
+| | | ├── w2v_MSRVTT_openAIGPT.pickle
+| | ├── mmt_feats
+| │ │ ├── features.audio.pkl
+| │ │ ├── features.face_agg.pkl
+| │ │ ├── features.flos_agg.pkl
+| │ │ ├── features.ocr.pkl
+| │ │ ├── features.rgb_agg.pkl
+| │ │ ├── features.s3d.pkl
+| │ │ ├── features.scene.pkl
+| │ │ ├── features.speech.pkl
+
+```
+
+## Download for ActBERT
+
+[ActBERT doc](../model_zoo/multimodal/actbert.md)
+
+Download data features:
+```
+wget https://videotag.bj.bcebos.com/Data/ActBERT/msrvtt_test.lmdb.tar
+wget https://videotag.bj.bcebos.com/Data/ActBERT/MSRVTT_JSFUSION_test.csv
+```
+
+Decompress the `msrvtt_test.lmdb.tar`:
+```
+tar -zxvf msrvtt_test.lmdb.tar
+```
+
+The files in the data directory are organized as follows:
+
+```
+├── data
+| ├── MSR-VTT
+| │ ├── MSRVTT_JSFUSION_test.csv
+| │ ├── msrvtt_test.lmdb
+| │ ├── data.mdb
+| │ ├── lock.mdb
+```
+
+
+## Reference
+- Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. Multi-modal transformer for video retrieval. In ECCV, 2020.
diff --git a/docs/en/dataset/ntu-rgbd.md b/docs/en/dataset/ntu-rgbd.md
new file mode 100644
index 0000000000000000000000000000000000000000..0b0056b7d040fce85ecff8d766967bd9eaca447d
--- /dev/null
+++ b/docs/en/dataset/ntu-rgbd.md
@@ -0,0 +1,158 @@
+[简体中文](../../zh-CN/dataset/ntu-rgbd.md) | English
+
+# NTU-RGB+D Preparation
+
+- [Introduction](#Introduction)
+- [ST-GCN Data Prepare](#ST-GCN_Data_Prepare)
+- [CTR-GCN Data Prepare](#CTR-GCN_Data_Prepare)
+
+---
+
+
+## Introduction
+
+NTU-RGB+D contains 60 action classes and 56,880 video samples for skeleton-based action recognition. Please refer to its official website [NTU-RGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/) for more details.
+
+The dataset provides two splits into training and test sets. For Cross-subject, the dataset is divided by subject id, with 40320 samples in the training set and 16560 samples in the test set. For Cross-view, the dataset is divided by camera: the samples collected by cameras 2 and 3 form the training set (37930 samples), and the samples collected by camera 1 form the test set (18960 samples).
+
+## ST-GCN_Data_Prepare
+
+The data preparation procedure for ST-GCN is described below.
+
+### Download
+We provide a download link for the processed dataset [NTU-RGB-D.tar](https://videotag.bj.bcebos.com/Data/NTU-RGB-D.tar) (~3.1 GB). Please download and extract it with ```tar -zxvf NTU-RGB-D.tar```; the directory structure is as follows:
+
+```txt
+─── NTU-RGB-D
+ ├── xsub
+ │ ├── train_data.npy
+ │ ├── train_label.pkl
+ │ ├── val_data.npy
+ │ └── val_label.pkl
+ └── xview
+ ├── train_data.npy
+ ├── train_label.pkl
+ ├── val_data.npy
+ └── val_label.pkl
+```
+
+> The processed data is a copy from [st-gcn](https://github.com/open-mmlab/mmskeleton/blob/master/doc/SKELETON_DATA.md).
+
+## CTR-GCN_Data_Prepare
+
+The data preparation procedure for CTR-GCN is described below.
+
+### Download
+
+The script `download_dataset.sh` in the directory `data/ntu-rgb-d` downloads the dataset from the official website [NTU-RGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/):
+
+```bash
+sh data/ntu-rgb-d/download_dataset.sh
+```
+
+File tree:
+```txt
+─── ntu-rgb-d
+ ├── download_dataset.sh
+ ├── nturgb+d_skeletons
+ │ ├── S001C001P001R001A001.skeleton
+ │ ├── S001C001P001R001A002.skeleton
+ │ ├── S001C001P001R001A003.skeleton
+ │ ├── S001C001P001R001A004.skeleton
+ │ ├── S001C001P001R001A005.skeleton
+ │ ├── S001C001P001R001A006.skeleton
+ │ ├── S001C001P001R001A007.skeleton
+ │ ├── ....
+ │ └── S017C003P020R002A060.skeleton
+ ├── get_raw_denoised_data.py
+ ├── get_raw_skes_data.py
+ ├── seq_transformation.py
+ └── statistics
+ ├── camera.txt
+ ├── label.txt
+ ├── performer.txt
+ ├── replication.txt
+ ├── setup.txt
+ └── skes_available_name.txt
+
+```
+
+### Prepare
+
+Run the scripts below to process the data into the format required by CTR-GCN.
+
+> Note: if you build the dataset yourself, please prepare `data/ntu-rgb-d/statistics/skes_available_name.txt`, the list of skeleton files to be processed (a minimal sketch for generating it is shown below).
+
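+A minimal sketch for generating that list, assuming it simply contains one skeleton file name per line without the `.skeleton` extension:
+
+```bash
+# Run from data/ntu-rgb-d after the raw skeleton files have been downloaded
+ls nturgb+d_skeletons | sed 's/\.skeleton$//' | sort > statistics/skes_available_name.txt
+```
+
+With the list in place, run the processing scripts:
+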
+```bash
+cd ./data/ntu-rgb-d
+# Get skeleton of each performer
+python get_raw_skes_data.py
+# Remove the bad skeleton
+python get_raw_denoised_data.py
+# Transform the skeleton to the center of the first frame
+python seq_transformation.py
+```
+
+File tree:
+
+```txt
+─── ntu-rgb-d
+ ├── download_dataset.sh
+ ├── nturgb+d_skeletons
+ │ ├── S001C001P001R001A001.skeleton
+ │ ├── S001C001P001R001A002.skeleton
+ │ ├── S001C001P001R001A003.skeleton
+ │ ├── S001C001P001R001A004.skeleton
+ │ ├── S001C001P001R001A005.skeleton
+ │ ├── S001C001P001R001A006.skeleton
+ │ ├── S001C001P001R001A007.skeleton
+ │ ├── ....
+ │ └── S017C003P020R002A060.skeleton
+ ├── denoised_data
+ │ ├── actors_info
+ │ │ ├── S001C001P001R001A024.txt
+ │ │ ├── S001C001P001R001A025.txt
+ │ │ ├── S001C001P001R001A026.txt
+ │ │ ├── ....
+ │ │ ├── S017C003P020R002A059.txt
+ │ │ └── S017C003P020R002A060.txt
+ │ ├── denoised_failed_1.log
+ │ ├── denoised_failed_2.log
+ │ ├── frames_cnt.txt
+ │ ├── missing_skes_1.log
+ │ ├── missing_skes_2.log
+ │ ├── missing_skes.log
+ │ ├── noise_length.log
+ │ ├── noise_motion.log
+ │ ├── noise_spread.log
+ │ ├── raw_denoised_colors.pkl
+ │ ├── raw_denoised_joints.pkl
+ │ └── rgb+ske
+ ├── raw_data
+ │ ├── frames_cnt.txt
+ │ ├── frames_drop.log
+ │ ├── frames_drop_skes.pkl
+ │ └── raw_skes_data.pkl
+ ├── get_raw_denoised_data.py
+ ├── get_raw_skes_data.py
+ ├── seq_transformation.py
+ ├── statistics
+ │ ├── camera.txt
+ │ ├── label.txt
+ │ ├── performer.txt
+ │ ├── replication.txt
+ │ ├── setup.txt
+ │ └── skes_available_name.txt
+ ├── xview
+ │ ├── train_data.npy
+ │ ├── train_label.pkl
+ │ ├── val_data.npy
+ │ └── val_label.pkl
+ └── xsub
+ ├── train_data.npy
+ ├── train_label.pkl
+ ├── val_data.npy
+ └── val_label.pkl
+```
+
+> Note: the directories `denoised_data`, `raw_data` and `nturgb+d_skeletons` only contain intermediate files and can be deleted once `xview` and `xsub` have been extracted.
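+
+For example:
+
+```bash
+# Optional clean-up once xsub/ and xview/ have been generated
+rm -rf nturgb+d_skeletons raw_data denoised_data
+```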
diff --git a/docs/en/dataset/ucf101.md b/docs/en/dataset/ucf101.md
new file mode 100644
index 0000000000000000000000000000000000000000..478a306c47fef3a82998211f9d8dc3517054a23c
--- /dev/null
+++ b/docs/en/dataset/ucf101.md
@@ -0,0 +1,86 @@
+# UCF101 Data Preparation
+This document describes how to prepare the UCF101 data, including downloading the UCF101 video files, extracting frames from the videos, and generating the path list files.
+
+---
+## 1. Data Download
+Detailed information about the UCF101 dataset can be found on the website [UCF101](https://www.crcv.ucf.edu/data/UCF101.php). For convenience, we provide download scripts for the annotation files and the video files.
+
+### Download the annotation files
+First, make sure you are in the `./data/dataset/ucf101/` directory, then run the following command to download the annotation files of the UCF101 dataset.
+```shell
+bash download_annotations.sh
+```
+
+### Download the UCF101 video files
+Still in the `./data/dataset/ucf101/` directory, run the following command to download the video files.
+```shell
+bash download_videos.sh
+```
+After downloading, the video files are stored in the `./data/dataset/ucf101/videos/` folder and take about 6.8 GB of disk space.
+
+---
+## 2. Extract frames from the video files
+To speed up network training, we first extract frames from the video files (UCF101 videos are in avi format). Compared with training directly on video files, training on extracted frames is faster.
+
+Run the following command to extract frames from the UCF101 videos:
+```shell
+python extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext avi
+```
+After extraction, the frames are stored in the `./rawframes` folder and take about 56 GB of disk space.
+
+---
+## 3. Generate the path lists for frames and videos
+To generate the path list for the video files, run:
+
+```shell
+python build_ucf101_file_list.py videos/ --level 2 --format videos --out_list_path ./
+```
+To generate the path list for the frame files, run:
+```shell
+python build_ucf101_file_list.py rawframes/ --level 2 --format rawframes --out_list_path ./
+```
+
+**Argument description**
+
+`videos/` or `rawframes/`: path where the video files or frame files are stored
+
+`--level 2`: directory structure depth of the stored files (class folder / file)
+
+`--format`: whether to generate the path list for videos or for rawframes
+
+`--out_list_path`: output directory for the generated path list files
+
+
+After all the above steps are completed, the files are organized as follows:
+
+```
+├── data
+| ├── dataset
+| │ ├── ucf101
+| │ │ ├── ucf101_{train,val}_split_{1,2,3}_rawframes.txt
+| │ │ ├── ucf101_{train,val}_split_{1,2,3}_videos.txt
+| │ │ ├── annotations
+| │ │ ├── videos
+| │ │ │ ├── ApplyEyeMakeup
+| │ │ │ │ ├── v_ApplyEyeMakeup_g01_c01.avi
+|
+| │ │ │ ├── YoYo
+| │ │ │ │ ├── v_YoYo_g25_c05.avi
+| │ │ ├── rawframes
+| │ │ │ ├── ApplyEyeMakeup
+| │ │ │ │ ├── v_ApplyEyeMakeup_g01_c01
+| │ │ │ │ │ ├── img_00001.jpg
+| │ │ │ │ │ ├── img_00002.jpg
+| │ │ │ │ │ ├── ...
+| │ │ │ │ │ ├── flow_x_00001.jpg
+| │ │ │ │ │ ├── flow_x_00002.jpg
+| │ │ │ │ │ ├── ...
+| │ │ │ │ │ ├── flow_y_00001.jpg
+| │ │ │ │ │ ├── flow_y_00002.jpg
+| │ │ │ ├── ...
+| │ │ │ ├── YoYo
+| │ │ │ │ ├── v_YoYo_g01_c01
+| │ │ │ │ ├── ...
+| │ │ │ │ ├── v_YoYo_g25_c05
+
+```
diff --git a/docs/en/dataset/youtube8m.md b/docs/en/dataset/youtube8m.md
new file mode 100644
index 0000000000000000000000000000000000000000..77c6860422a15f152a26c722ba2d85f391c8993a
--- /dev/null
+++ b/docs/en/dataset/youtube8m.md
@@ -0,0 +1,56 @@
+English | [简体中文](../../zh-CN/dataset/youtube8m.md)
+
+# YouTube-8M Data Preparation
+
+- [Introduction](#Introduction)
+- [Download](#Download)
+- [Conversion](#Conversion)
+
+
+## Introduction
+
+YouTube-8M is a large-scale video classification dataset containing more than 8 million video URLs. Its label vocabulary covers more than 3800 knowledge graph entities. Each video corresponds to multiple labels (3-4 on average), and the labels are machine-generated.
+
+**Each video is between 120s and 500s long. Because the raw video data is very large, frame-level features were extracted in advance with an image classification model and reduced with PCA to 1024-dimensional features per frame; similarly, an audio model was used to extract 128-dimensional audio features per frame.**
+> The dataset used here is the updated 2018 version of YouTube-8M (May 2018 version: 6.1M videos, 3862 classes, 3.0 labels/video, 2.6B audio-visual features).
+
+
+## Download
+1. Create a new directory for storing features (take the PaddleVideo directory as an example)
+ ```bash
+ cd data/yt8m
+ mkdir frame
+ cd frame
+ ```
+2. Download the training and validation set to the frame folder
+ ```bash
+ curl data.yt8m.org/download.py | partition=2/frame/train mirror=asia python
+ curl data.yt8m.org/download.py | partition=2/frame/validate mirror=asia python
+ ```
+ The download process is shown in the figure
+ 
+
+   After the download is complete, you will get 3844 training data files and 3844 validation data files (TFRecord format).
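+
+   As an optional sanity check (a minimal sketch, run inside the `frame` directory):
+
+   ```bash
+   # Both counts should be 3844
+   ls train*.tfrecord | wc -l
+   ls validate*.tfrecord | wc -l
+   ```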
+
+## Conversion
+1. Install tensorflow to read tfrecord data
+ ```bash
+ python3.7 -m pip install tensorflow-gpu==1.14.0
+ ```
+2. Convert the downloaded TFRecord file into a pickle file for PaddlePaddle to use
+ ```bash
+ cd .. # From the frame directory back to the yt8m directory
+ python3.7 tf2pkl.py ./frame ./pkl_frame/ # Convert train*.tfrecord and validate*.tfrecord in the frame folder to pkl format
+ ```
+3. Generate the list of pkl file paths, split each pkl into multiple smaller pkl files based on this list, and then generate the final path lists of the split pkl files
+ ```bash
+ ls pkl_frame/train*.pkl> train.list # Write the path of train*.pkl to train.list
+ ls pkl_frame/validate*.pkl> val.list # Write the path of validate*.pkl into val.list
+
+ python3.7 split_yt8m.py train.list # Split each train*.pkl into multiple train*_split*.pkl
+ python3.7 split_yt8m.py val.list # Split each validate*.pkl into multiple validate*_split*.pkl
+
+ ls pkl_frame/train*_split*.pkl> train.list # Rewrite the path of train*_split*.pkl into train.list
+ ls pkl_frame/validate*_split*.pkl> val.list # Rewrite the path of validate*_split*.pkl into val.list
+ ```
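+
+   As a final check (a sketch), confirm that both lists are non-empty before training:
+
+   ```bash
+   wc -l train.list val.list   # each file should list many *_split*.pkl paths
+   ```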
diff --git a/docs/en/install.md b/docs/en/install.md
new file mode 100644
index 0000000000000000000000000000000000000000..b0d3bb239c5b01e48aaf6ad3007222cea4035b56
--- /dev/null
+++ b/docs/en/install.md
@@ -0,0 +1,71 @@
+[简体中文](../zh-CN/install.md) | English
+
+# Installation
+
+---
+
+- [Introduction](#Introduction)
+- [Install PaddlePaddle](#Install-PaddlePaddle)
+- [Install PaddleVideo](#Install-PaddleVideo)
+
+## Introduction
+
+This document introduces how to install PaddlePaddle, PaddleVideo and their requirements.
+
+## Install PaddlePaddle
+
+Python 3.7, CUDA 10.1, cuDNN 7.6.4 and NCCL 2.1.2 (or later versions) are required. For now, PaddleVideo only supports training on GPU devices. Please follow the instructions in the [Installation](http://www.paddlepaddle.org.cn/install/quick) guide if the PaddlePaddle version on your device is lower than v2.0.
+
+**Install PaddlePaddle**
+
+```bash
+pip3 install paddlepaddle-gpu --upgrade
+```
+
+or compile from source code, please refer to [Installation](http://www.paddlepaddle.org.cn/install/quick).
+
+Verify Installation
+
+```python
+import paddle
+paddle.utils.run_check()
+```
+
+Check PaddlePaddle version:
+
+```bash
+python3 -c "import paddle; print(paddle.__version__)"
+```
+
+Note:
+- Make sure the installed or compiled version is PaddlePaddle 2.0 or later.
+- Specify **WITH_DISTRIBUTE=ON** when compiling from source. Please refer to the [Instruction](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/Tables.html#id3) for more details.
+- When running in Docker, set `--shm-size=32g` (or a larger value if conditions permit) when creating the container so that it has enough shared memory for Paddle's data loading acceleration; see the example below.
+
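+A minimal sketch of creating such a container (the image name, tag and mounts are illustrative placeholders; pick the official PaddlePaddle GPU image that matches your CUDA version):
+
+```bash
+# --shm-size enlarges /dev/shm so Paddle's DataLoader workers have enough shared memory
+docker run --name paddlevideo_dev --gpus all --shm-size=32g -it \
+    -v $PWD:/workspace \
+    paddlepaddle/paddle:2.2.0-gpu-cuda10.2-cudnn7 /bin/bash
+```
+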
+---
+
+## Install PaddleVideo
+
+**Clone PaddleVideo:**
+
+```bash
+cd path_to_clone_PaddleVideo
+git clone https://github.com/PaddlePaddle/PaddleVideo.git
+```
+
+**Install requirements**
+
+```bash
+python3.7 -m pip install --upgrade pip
+pip3.7 install --upgrade -r requirements.txt
+```
+
+**Install python package**
+
+Install PaddleVideo via pip WIP
+
+**Install docker**
+
+Install PaddleVideo via docker WIP
+
+
diff --git a/docs/en/model_zoo/README.md b/docs/en/model_zoo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..24ad7badafb550fff862b435b98e20c211cf143e
--- /dev/null
+++ b/docs/en/model_zoo/README.md
@@ -0,0 +1,40 @@
+[简体中文](../../zh-CN/model_zoo/README.md) | English
+
+# Introduction
+
+We implement action recognition models and action localization models in this repo.
+
+## Model list
+
+| Field | Model | Config | Dataset | Metrics | ACC% | Download |
+| :--------------- | :--------: | :------------: | :------------: | :------------: | :------------: | :------------: |
+| action recognition | [**PP-TSM**](./recognition/pp-tsm.md) | [pptsm.yaml](../../../configs/recognition/pptsm/pptsm_k400_frames_dense.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 76.16 | [PPTSM.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense_distill.pdparams) |
+| action recognition | [**PP-TSN**](./recognition/pp-tsn.md) | [pptsn.yaml](../../../configs/recognition/pptsn/pptsn_k400_videos.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 75.06 | [PPTSN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400_8.pdparams) |
+| action recognition | [**PP-TimeSformer**](./recognition/pp-timesformer.md) | [pptimesformer.yaml](../../../configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 79.44 | [ppTimeSformer_k400_16f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_16f_distill.pdparams) |
+| action recognition | [AGCN](./recognition/agcn.md) | [agcn.yaml](../../../configs/recognition/agcn/agcn_fsd.yaml) | [FSD](../dataset/fsd.md) | Top-1 | 62.29 | [AGCN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_fsd.pdparams) |
+| action recognition | [ST-GCN](./recognition/stgcn.md) | [stgcn.yaml](../../../configs/recognition/stgcn/stgcn_fsd.yaml) | [FSD](../dataset/fsd.md) | Top-1 | 59.07 | [STGCN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_fsd.pdparams) |
+| action recognition | [VideoSwin](./recognition/videoswin.md) | [videoswin.yaml](../../../configs/recognition/videoswin/videoswin_k400_videos.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 82.40 | [VideoSwin.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_k400.pdparams) |
+| action recognition | [TimeSformer](./recognition/timesformer.md) | [timesformer.yaml](../../../configs/recognition/timesformer/timesformer_k400_videos.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 77.29 | [TimeSformer.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TimeSformer_k400.pdparams) |
+| action recognition | [SlowFast](./recognition/slowfast.md) | [slowfast_multigrid.yaml](../../../configs/recognition/slowfast/slowfast_multigrid.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 75.84 | [SlowFast.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) |
+| action recognition | [TSM](./recognition/tsm.md) | [tsm.yaml](../../../configs/recognition/tsm/tsm_k400_frames.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 70.86 | [TSM.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams) |
+| action recognition | [TSN](./recognition/tsn.md) | [tsn.yaml](../../../configs/recognition/tsn/tsn_k400_frames.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 69.81 | [TSN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400.pdparams) |
+| action recognition | [AttentionLSTM](./recognition/attention_lstm.md) | [attention_lstm.yaml](../../../configs/recognition/attention_lstm/attention_lstm.yaml) | [Youtube-8M](../dataset/youtube8m.md) | Hit@1 | 89.0 | [AttentionLstm.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/AttentionLstm/AttentionLstm.pdparams) |
+| action detection| [BMN](./localization/bmn.md) | [bmn.yaml](../../../configs/localization/bmn.yaml) | [ActivityNet](../dataset/ActivityNet.md) | AUC | 67.23 | [BMN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/BMN/BMN.pdparams) |
+| shot boundary detection | [TransNetV2](./partition/transnetv2.md) | [transnetv2.yaml](../../../configs/partitioners/transnetv2/transnetv2.yaml) | ClipShots | F1 scores | 76.1 | |
+| monocular depth estimation | [ADDS](./estimation/adds.md) | [adds.yaml](../../../configs/estimation/adds/adds.yaml) | Oxford_RobotCar | Abs Rel | 0.209 | [ADDS_car.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ADDS_car.pdparams) |
+
+
+# Reference
+
+- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen
+- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen.
+- [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982), Feichtenhofer C, Fan H, Malik J, et al.
+- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool
+- [Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/abs/1811.08383v1), Ji Lin, Chuang Gan, Song Han
+- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf) Gedas Bertasius, Heng Wang, Lorenzo Torresani
+- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin
+- [Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1805.07694), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu
+- [Skeleton-Based Action Recognition with Multi-Stream Adaptive Graph Convolutional Networks](https://arxiv.org/abs/1912.06971), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu
+- [TransNet V2: An effective deep network architecture for fast shot transition detection](https://arxiv.org/abs/2008.04838), Tomáš Souček, Jakub Lokoč
+- [Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628), Lina Liu, Xibin Song, Mengmeng Wang
+
diff --git a/docs/en/model_zoo/detection/SlowFast_FasterRCNN_en.md b/docs/en/model_zoo/detection/SlowFast_FasterRCNN_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..b100f442859748f5441c77b80713fb9681344a28
--- /dev/null
+++ b/docs/en/model_zoo/detection/SlowFast_FasterRCNN_en.md
@@ -0,0 +1,129 @@
+[简体中文](../../../zh-CN/model_zoo/detection/SlowFast_FasterRCNN.md) | English
+
+# SlowFast_FasterRCNN
+
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+
+Before getting started, you need to install additional dependencies as follows:
+```bash
+python -m pip install moviepy
+python -m pip install et_xmlfile
+python -m pip install paddledet
+```
+
+## Introduction
+
+The [SlowFast](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/slowfast.md) model is one of the high-accuracy models in the video field. For the action detection task, it is also necessary to detect the persons in the current frame. Therefore, the SlowFast_FasterRCNN model takes human detection results and video frames as input, extracts spatiotemporal features with the SlowFast backbone, and then uses a Faster R-CNN head to predict the actions and positions of the humans in the frame.
+
+The corresponding AI Studio notebook: [基于SlowFast+FasterRCNN的动作识别 (Action recognition based on SlowFast+FasterRCNN)](https://aistudio.baidu.com/aistudio/projectdetail/3267637?contributionType=1)
+
+For details, please refer to the paper [SlowFast Networks for Video Recognition](https://arxiv.org/pdf/1812.03982.pdf).
+
+## Data
+
+We use [AVA dataset](https://research.google.com/ava/download.html) for action detection. The AVA v2.2 dataset contains 430 videos split into 235 for training, 64 for validation, and 131 for test. Each video has 15 minutes annotated in 1 second intervals.
+
+### 1 Download Videos
+```
+bash download_videos.sh
+```
+
+### 2 Download Annotations
+```
+bash download_annotations.sh
+```
+
+### 3 Download Proposals
+
+```
+bash fetch_ava_proposals.sh
+```
+
+### 4 Cut Videos
+
+```
+bash cut_videos.sh
+```
+
+### 5 Extract Frames
+
+```
+bash extract_rgb_frames.sh
+```
+
+For AVA v2.1, here is a brief introduction to some key files:
+* the `ava_videos_15min_frames` directory stores the video frames extracted at the chosen FPS;
+* `ava_train_v2.1.csv` stores the training annotations;
+* `ava_train_excluded_timestamps_v2.1.csv` stores the excluded timestamps;
+* `ava_dense_proposals_train.FAIR.recall_93.9.pkl` stores the human bounding boxes and scores of the key frames;
+* `ava_action_list_v2.1_for_activitynet_2018.pbtxt` stores the action list.
+
+## Train
+
+* `-c`: config file path;
+* `-w`: weights of model. The pretrained model can be downloaded from the table below;
+* `--validate`: evaluate model during training.
+
+```
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+python -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=logdir.ava main.py --validate -w paddle.init_param.pdparams -c configs/detection/ava/ava.yaml
+```
+
+## Test
+
+Test with the best model:
+```
+python main.py --test \
+ -w output/AVA_SlowFast_FastRcnn/AVA_SlowFast_FastRcnn_best.pdparams \
+ -c configs/detection/ava/ava.yaml
+```
+
+
+| architecture | depth | Pretrain Model | frame length x sample rate | MAP | AVA version | model |
+| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |------------- |
+| SlowFast | R50 | [Kinetics 400](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) | 8 x 8 | 23.2 | 2.1 | [`link`](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/SlowFastRCNN_AVA.pdparams) |
+
+
+## Inference
+
+Action detection in this project has two stages: in the first stage, human proposals are obtained; they are then fed, together with the video frames, into the SlowFast+FasterRCNN model for action recognition.
+
+For human detection, you can use a trained model from [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection).
+
+Install PaddleDetection:
+```
+cd PaddleDetection/
+pip install -r requirements.txt
+python setup.py install
+```
+
+Download detection model:
+```
+# faster_rcnn_r50_fpn_1x_coco as an example
+wget https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_1x_coco.pdparams
+```
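+
+As a hedged sketch (the config path, weight file and image path follow PaddleDetection's standard layout and are assumptions), the downloaded detector can be run on extracted key frames to obtain human boxes:
+
+```bash
+# Run Faster R-CNN on a single extracted frame; visualized detections are written to --output_dir
+python tools/infer.py \
+    -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml \
+    -o weights=faster_rcnn_r50_fpn_1x_coco.pdparams \
+    --infer_img=path/to/key_frame.jpg \
+    --output_dir=./detection_results
+```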
+
+Export the model:
+```
+python tools/export_model.py \
+ -c configs/detection/ava/ava.yaml \
+ -o inference_output \
+ -p output/AVA_SlowFast_FastRcnn/AVA_SlowFast_FastRcnn_best.pdparams
+```
+
+Run inference with the exported model:
+```
+python tools/predict.py \
+ -c configs/detection/ava/ava.yaml \
+ --input_file "data/-IELREHXDEMO.mp4" \
+ --model_file "inference_output/AVA_SlowFast_FastRcnn.pdmodel" \
+ --params_file "inference_output/AVA_SlowFast_FastRcnn.pdiparams" \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
diff --git a/docs/en/model_zoo/estimation/adds.md b/docs/en/model_zoo/estimation/adds.md
new file mode 100644
index 0000000000000000000000000000000000000000..c055db59d066357f5f16e99f163d60be33e2e3a3
--- /dev/null
+++ b/docs/en/model_zoo/estimation/adds.md
@@ -0,0 +1,133 @@
+[Simplified Chinese](../../../zh-CN/model_zoo/estimation/adds.md) | English
+
+# ADDS-DepthNet model
+
+## content
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+Before getting started, you need to install additional dependencies as follows:
+```bash
+python -m pip install scikit-image
+python -m pip install matplotlib
+```
+
+## Introduction
+
+This model reproduces the self-supervised monocular depth estimation method from the ICCV 2021 paper **[Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628)** by Baidu's Robotics and Autonomous Driving Laboratory.
+It exploits the complementary nature of day and night image data to mitigate the large day-night domain shift and the impact of lighting changes on depth estimation accuracy, and achieves state-of-the-art all-day depth estimation results on the challenging Oxford RobotCar dataset.
+
+
+## Data
+
+For data download and preparation of Oxford RobotCar dataset, please refer to [Oxford RobotCar dataset data preparation](../../dataset/Oxford_RobotCar.md)
+
+
+## Train
+
+### Oxford RobotCar dataset training
+
+#### Download and add pre-trained models
+
+1. Download the image pre-training model [resnet18.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/Resnet18_Imagenet.pdparams) as Backbone initialization parameters, or download through the wget command
+
+ ```bash
+ wget -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/Resnet18_Imagenet.pdparams
+ ```
+
+2. Open `PaddleVideo/configs/estimation/adds/adds.yaml`, and fill in the downloaded weight storage path below `pretrained:`
+
+ ```yaml
+ MODEL: #MODEL field
+ framework: "DepthEstimator" #Mandatory, indicate the type of network, associate to the'paddlevideo/modeling/framework/'.
+ backbone: #Mandatory, indicate the type of backbone, associate to the'paddlevideo/modeling/backbones/'.
+ name: 'ADDS_DepthNet'
+ pretrained: fill in the path here
+ ```
+
+#### Start training
+
+- The Oxford RobotCar dataset uses a single card for training, and the starting command for the training method is as follows:
+
+ ```bash
+ python3.7 main.py --validate -c configs/estimation/adds/adds.yaml --seed 20
+ ```
+
+
+## Test
+
+- The ADDS-DepthNet model is verified synchronously during training (only the day or night data is verified). You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows:
+
+ ```bash
+ Already save the best model (rmse)8.5531
+ ```
+
+- Because each run can only test the day or night subset specified by the data path in the yaml file, you need to run 4 test commands and record their metrics to obtain the complete results (daytime 40m, daytime 60m, night 40m, night 60m):
+
+- Download URL of the trained model: [ADDS_car.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ADDS_car.pdparams)
+
+- The test commands are as follows:
+
+ ```bash
+ # Night 40m
+ python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_night_files.txt" -o MODEL.head.max_gt_depth=40
+
+ # Night 60m
+ python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_night_files.txt" -o MODEL.head.max_gt_depth=60
+
+ # Daytime 40m
+ python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_day_files.txt" -o MODEL.head.max_gt_depth=40
+
+ # Daytime 60m
+ python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_day_files.txt" -o MODEL.head.max_gt_depth=60
+ ```
+
+ The test indicators on the validation dataset of Oxford RobotCar dataset are as follows:
+
+ | version | Max Depth | Abs Rel | Sq Rel | RMSE | RMSE log | δ<1.25 | δ<1.25² | δ<1.25³ |
+ | ----------- | --------- | ------- | ------ | ----- | ------- | ----------------- |------------------- | ------------------- |
+ | ours(night) | 40 | 0.209 | 1.741 | 6.031 | 0.243 | 0.708 | 0.923 | 0.975 |
+ | ours(night) | 60 | 0.207 | 2.052 | 7.888 | 0.258 | 0.686 | 0.909 | 0.970 |
+ | ours(day) | 40 | 0.114 | 0.574 | 3.411 | 0.157 | 0.860 | 0.977 | 0.993 |
+ | ours(day) | 60 | 0.119 | 0.793 | 4.842 | 0.173 | 0.838 | 0.967 | 0.991 |
+
+## Inference
+
+### Export inference model
+
+```bash
+python3.7 tools/export_model.py -c configs/estimation/adds/adds.yaml -p data/ADDS_car.pdparams -o inference/ADDS
+```
+
+The above command will generate the model structure file `ADDS.pdmodel` and the model weight files `ADDS.pdiparams` and `ADDS.pdiparams.info` needed for prediction, all of which are stored in the `inference/ADDS/` directory.
+
+For the meaning of each parameter in the above bash command, please refer to [Model Inference Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/en/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### Use the prediction engine for inference
+
+```bash
+python3.7 tools/predict.py --input_file data/example.png \
+ --config configs/estimation/adds/adds.yaml \
+ --model_file inference/ADDS/ADDS.pdmodel \
+ --params_file inference/ADDS/ADDS.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+At the end of the inference, the depth map estimated by the model will be saved in pseudo-color by default.
+
+The following is a sample picture and the corresponding predicted depth map:
+
+
+
+
+
+
+## Reference
+
+- [Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628), Liu, Lina and Song, Xibin and Wang, Mengmeng and Liu, Yong and Zhang, Liangjun
diff --git a/docs/en/model_zoo/localization/bmn.md b/docs/en/model_zoo/localization/bmn.md
new file mode 100644
index 0000000000000000000000000000000000000000..eb64c670607bc384aac27a05dfbe00c76de7e726
--- /dev/null
+++ b/docs/en/model_zoo/localization/bmn.md
@@ -0,0 +1,104 @@
+[简体中文 ](../../../zh-CN/model_zoo/localization/bmn.md) | English
+
+# BMN
+
+---
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+
+## Introduction
+
+The BMN model contains three modules: the Base Module handles the input feature sequence and outputs a feature sequence shared by the following two modules; the Temporal Evaluation Module evaluates the starting and ending probabilities of each location in the video to generate boundary probability sequences; the Proposal Evaluation Module contains the BM layer to transfer the feature sequence to a BM feature map, plus a series of 3D and 2D convolutional layers to generate the BM confidence map.
+
+
+
+BMN Overview
+
+
+
+## Data
+
+We use ActivityNet dataset to train this model,data preparation please refer to [ActivityNet dataset](../../dataset/ActivityNet.md).
+
+
+## Train
+
+You can start training by such command:
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_bmn main.py --validate -c configs/localization/bmn.yaml
+```
+
+
+## Test
+
+You can start testing by such command:
+
+```bash
+python main.py --test -c configs/localization/bmn.yaml -w output/BMN/BMN_epoch_00009.pdparams -o DATASET.test_batch_size=1
+```
+
+- For now, we only support testing with **single card** and `batch_size=1`.
+
+- Please download [activity\_net\_1\_3\_new.json](https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json) label file and specify the path to `METRIC.ground_truth_filename` in config file.
+
+- The `-w` argument specifies the model path. You can download our trained model from [BMN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/BMN/BMN.pdparams).
+
+
+Test accuracy in ActivityNet1.3:
+
+| AR@1 | AR@5 | AR@10 | AR@100 | AUC |
+| :---: | :---: | :---: | :---: | :---: |
+| 33.26 | 49.48 | 56.86 | 75.19 | 67.23% |
+
+
+## Inference
+
+### export inference model
+
+ To get model architecture file `BMN.pdmodel` and parameters file `BMN.pdiparams`, use:
+
+```bash
+python3.7 tools/export_model.py -c configs/localization/bmn.yaml \
+ -p data/BMN.pdparams \
+ -o inference/BMN
+```
+
+- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).
+
+### infer
+
+```bash
+python3.7 tools/predict.py --input_file data/example_feat.list \
+ --config configs/localization/bmn.yaml \
+ --model_file inference/BMN/BMN.pdmodel \
+ --params_file inference/BMN/BMN.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+example of logs:
+
+```
+BMN Inference results of data/example_feat.npy :
+{'score': 0.7968077063560486, 'segment': [0.0, 122.9877]}
+{'score': 0.49097609519958496, 'segment': [12.423000000000002, 124.23]}
+{'score': 0.21395835280418396, 'segment': [39.7536, 122.9877]}
+{'score': 0.2106524258852005, 'segment': [0.0, 109.3224]}
+{'score': 0.06876271963119507, 'segment': [23.6037, 114.2916]}
+```
+
+Inference results are saved in `data/bmn/BMN_INFERENCE_results`.
+
+## Reference
+
+- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen.
diff --git a/docs/en/model_zoo/multimodal/actbert.md b/docs/en/model_zoo/multimodal/actbert.md
new file mode 100644
index 0000000000000000000000000000000000000000..f884a5e8f33667ddfdbc17591b96fc73caa822d0
--- /dev/null
+++ b/docs/en/model_zoo/multimodal/actbert.md
@@ -0,0 +1,98 @@
+[简体中文](../../../zh-CN/model_zoo/multimodal/actbert.md) | English
+
+# ActBERT
+
+---
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Reference](#Reference)
+
+Before getting started, you need to install additional dependencies as follows:
+```bash
+python -m pip install paddlenlp
+python -m pip install lmdb
+```
+
+## Introduction
+
+ActBERT was proposed by Baidu in CVPR 2020 for multimodal pre-training. It leverages global action information to catalyze mutual interactions between linguistic texts and local regional objects. The method introduces a TaNgled Transformer block (TNT) to encode three sources of information, i.e., global actions, local regional objects, and linguistic descriptions. ActBERT significantly outperforms the state-of-the-art in five downstream video-and-language tasks, i.e., text-video clip retrieval, video captioning, video question answering, action segmentation, and action step localization.
+
+
+
+
+
+
+## Data
+
+Please refer to the HowTo100M data download and preparation doc [HowTo100M-data](../../dataset/howto100m.md)
+
+Please refer to the MSR-VTT data download and preparation doc [MSR-VTT-data](../../dataset/msrvtt.md)
+
+
+## Train
+
+### Train on HowTo100M
+
+#### download pretrain-model
+
+Please download [bert-base-uncased](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/bert-base-uncased.pdparams) as the pretrained model:
+
+```bash
+wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/bert-base-uncased.pdparams
+```
+
+and add path to `MODEL.framework.backbone.pretrained` in config file as:
+
+```yaml
+MODEL:
+ framework: "ActBert"
+ backbone:
+ name: "BertForMultiModalPreTraining"
+ pretrained: your weight path
+```
+
+- We provide a training option on a small subset of the data; the config file is for reference only.
+
+#### Start training
+
+- Train ActBERT on HowTo100M scripts:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_actbert main.py --validate -c configs/multimodal/actbert/actbert.yaml
+```
+
+- AMP is useful for speeding up training:
+
+```bash
+export FLAGS_conv_workspace_size_limit=800 #MB
+export FLAGS_cudnn_exhaustive_search=1
+export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_actbert main.py --amp --validate -c configs/multimodal/actbert/actbert.yaml
+```
+
+
+## Test
+
+- Evaluation is performed on a downstream task, i.e. text-video clip retrieval on the MSR-VTT dataset. Test accuracy can be obtained using:
+
+```bash
+python3.7 main.py --test -c configs/multimodal/actbert/actbert_msrvtt.yaml -w Actbert.pdparams
+```
+
+
+Metrics on MSR-VTT:
+
+| R@1 | R@5 | R@10 | Median R | Mean R | checkpoints |
+| :------: | :----------: | :----: | :----: | :----: | :----: |
+| 8.6 | 31.2 | 45.5 | 13.0 | 28.5 | [ActBERT.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ActBERT.pdparams) |
+
+
+## Reference
+
+- [ActBERT: Learning Global-Local Video-Text Representations
+](https://arxiv.org/abs/2011.07231), Linchao Zhu, Yi Yang
diff --git a/docs/en/model_zoo/partition/transnetv2.md b/docs/en/model_zoo/partition/transnetv2.md
new file mode 100644
index 0000000000000000000000000000000000000000..e98e2c5702ebb2bbc90021bd9707d110e65c5dc0
--- /dev/null
+++ b/docs/en/model_zoo/partition/transnetv2.md
@@ -0,0 +1,80 @@
+[简体中文](../../../zh-CN/model_zoo/partition/transnetv2.md) | English
+
+# TransNetV2
+
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Details](#Details)
+- [Reference](#Reference)
+
+Before getting started, you need to install additional dependencies as follows:
+```bash
+python -m pip install ffmpeg-python==0.2.0
+```
+
+## Introduction
+
+TransNetV2 is a deep-learning-based shot segmentation model. It learns features through the DDCNN V2 structure and additionally uses RGB color histograms and frame similarity for more effective feature extraction, finally predicting the probability that each frame is a shot boundary and thereby splitting the video into shots. The algorithm is both accurate and computationally efficient, which makes it well suited for industrial applications.
+
+
+
+This code currently only supports model inference, and model training and testing will be provided in the future.
+
+Please refer to the paper for details. [TransNet V2: An effective deep network architecture for fast shot transition detection](https://arxiv.org/abs/2008.04838)
+
+## Data
+
+coming soon
+
+
+## Train
+
+coming soon
+
+
+## Test
+
+coming soon
+
+
+## Inference
+
+
+Download the TransNetV2 weights trained on the ClipShots and TRECVID IACC.3 datasets, [TransNetV2_shots.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TransNetV2_shots.pdparams), or fetch them from the command line:
+
+```bash
+wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TransNetV2_shots.pdparams
+```
+
+### export inference model
+
+```bash
+python3.7 tools/export_model.py -c configs/partitioners/transnetv2/transnetv2.yaml -p data/TransNetV2_shots.pdparams -o inference/TransNetV2
+```
+
+The above command will generate the model structure file `TransNetV2.pdmodel` and the model weight file `TransNetV2.pdiparams` required for prediction.
+
+For the meaning of each parameter, please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### infer
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/partitioners/transnetv2/transnetv2.yaml \
+ --model_file inference/TransNetV2/TransNetV2.pdmodel \
+ --params_file inference/TransNetV2/TransNetV2.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+By setting the `output_path` parameter in `transnetv2.yaml`, the predicted probability of each frame is written to `{output_path}/example_predictions.txt` and the predicted shot boundaries to `{output_path}/example_scenes.txt`.
+By setting the `visualize` parameter in `transnetv2.yaml`, the predictions can be visualized and the visualization is saved to `{output_path}/example_vis.png`.
+
+## Reference
+
+- [TransNet V2: An effective deep network architecture for fast shot transition detection](https://arxiv.org/abs/2008.04838), Tomáš Souček, Jakub Lokoč
diff --git a/docs/en/model_zoo/recognition/agcn.md b/docs/en/model_zoo/recognition/agcn.md
new file mode 100644
index 0000000000000000000000000000000000000000..85677c729c07ba32908cd8cf5aac7664d5441b01
--- /dev/null
+++ b/docs/en/model_zoo/recognition/agcn.md
@@ -0,0 +1,129 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/agcn.md) | English
+
+# AGCN
+
+---
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+
+## Introduction
+
+We implemented the Adaptive Graph Convolutional Network (AGCN) to improve the accuracy of [ST-GCN](./stgcn.md).
+
+## Data
+
+Please refer to FSD-10 data download and preparation doc [FSD](../../dataset/fsd.md)
+
+Please refer to NTU-RGBD data download and preparation doc [NTU-RGBD](../../dataset/ntu-rgbd.md)
+
+## Train
+
+### Train on FSD
+
+- Train AGCN on FSD scripts:
+
+```bash
+python3.7 main.py -c configs/recognition/agcn/agcn_fsd.yaml
+```
+
+- Turn off `valid` when training, as validation dataset is not available for the competition.
+
+### Train on NTU-RGBD
+
+- Train AGCN on NTU-RGBD scripts:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_agcn main.py --validate -c configs/recognition/agcn/agcn_ntucs.yaml
+```
+
+- config file `agcn_ntucs.yaml` corresponding to the config of AGCN on NTU-RGB+D dataset with cross-subject splits.
+
+
+## Test
+
+### Test on FSD
+
+- Test scripts:
+
+```bash
+python3.7 main.py --test -c configs/recognition/agcn/agcn_fsd.yaml -w output/AGCN/AGCN_epoch_00100.pdparams
+```
+
+- Specify the config file with `-c`, specify the weight path with `-w`.
+
+- Evaluation results will be saved in `submission.csv` file, final score can be obtained in [competition website](https://aistudio.baidu.com/aistudio/competition/detail/115).
+
+Accuracy on FSD dataset:
+
+| Test_Data | Top-1 | checkpoints |
+| :----: | :----: | :---- |
+| Test_A | 62.29 | [AGCN_fsd.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_fsd.pdparams)|
+
+
+### Test on NTU-RGB+D
+
+- Test scripts:
+
+```bash
+python3.7 main.py --test -c configs/recognition/agcn/agcn_ntucs.yaml -w output/AGCN/AGCN_best.pdparams
+```
+
+- Specify the config file with `-c`, specify the weight path with `-w`.
+
+Accuracy on NTU-RGB+D dataset:
+
+| split | Top-1 | checkpoints |
+| :----: | :----: | :---- |
+| cross-subject | 83.27 | [AGCN_ntucs.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_ntucs.pdparams)|
+
+
+## Inference
+
+### export inference model
+
+ To get model architecture file `AGCN.pdmodel` and parameters file `AGCN.pdiparams`, use:
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/agcn/agcn_fsd.yaml \
+ -p data/AGCN_fsd.pdparams \
+ -o inference/AGCN
+```
+
+- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).
+
+### infer
+
+```bash
+python3.7 tools/predict.py --input_file data/fsd10/example_skeleton.npy \
+ --config configs/recognition/agcn/agcn_fsd.yaml \
+ --model_file inference/AGCN/AGCN.pdmodel \
+ --params_file inference/AGCN/AGCN.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+example of logs:
+
+```
+Current video file: data/fsd10/example_skeleton.npy
+ top-1 class: 27
+ top-1 score: 0.8965644240379333
+```
+
+
+## Reference
+
+- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin
+
+- [Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1805.07694), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu
+
+- [Skeleton-Based Action Recognition with Multi-Stream Adaptive Graph Convolutional Networks](https://arxiv.org/abs/1912.06971), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu
+
+- Many thanks to [li7819559](https://github.com/li7819559) and [ZhaoJingjing713](https://github.com/ZhaoJingjing713) for contributing the code.
diff --git a/docs/en/model_zoo/recognition/attention_lstm.md b/docs/en/model_zoo/recognition/attention_lstm.md
new file mode 100644
index 0000000000000000000000000000000000000000..42fb5d90e0abf6a1194a5e6adcf188c49e3294af
--- /dev/null
+++ b/docs/en/model_zoo/recognition/attention_lstm.md
@@ -0,0 +1,84 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/attention_lstm.md) | English
+
+# AttentionLSTM
+
+## content
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+## Introduction
+
+Recurrent Neural Networks (RNNs) are widely used for sequence data: they can model the temporal information across consecutive video frames and are a common approach in video classification.
+This model uses a bidirectional Long Short-Term Memory network (LSTM) to encode all frame features of a video in sequence. Unlike the traditional method that directly uses the LSTM output at the last time step, this model adds an Attention layer: the hidden state at each time step receives an adaptive weight, and the final feature vector is the weighted sum of all hidden states. The reference paper implements a two-layer LSTM, while **this model implements a bidirectional LSTM with Attention**.
+
+The Attention layer can refer to the paper [AttentionCluster](https://arxiv.org/abs/1711.09550)
+
+## Data
+
+PaddleVideo provides training and testing scripts on the Youtube-8M dataset. For Youtube-8M data download and preparation, please refer to [YouTube-8M data preparation](../../dataset/youtube8m.md)
+
+## Train
+
+### Youtube-8M data set training
+
+#### Start training
+
+- The Youtube-8M dataset is trained with 8 cards. In the feature format, video and audio features are used as input. The training start command is as follows:
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_attetion_lstm main.py --validate -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml
+ ```
+
+## Test
+
+The command is as follows:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_attetion_lstm main.py --test -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml -w "output/AttentionLSTM/AttentionLSTM_best.pdparams"
+```
+
+When the test configuration uses the following parameters, the test indicators on the validation data set of Youtube-8M are as follows:
+
+| Hit@1 | PERR | GAP | checkpoints |
+| :-----: | :---------: | :---: | ----- |
+| 89.05 | 80.49 | 86.30 | [AttentionLSTM_yt8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AttentionLSTM_yt8.pdparams) |
+
+## Inference
+
+### Export inference model
+```bash
+python3.7 tools/export_model.py -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml \
+ -p data/AttentionLSTM_yt8.pdparams \
+ -o inference/AttentionLSTM
+```
+
+The above command will generate the model structure file `AttentionLSTM.pdmodel` and the model weight file `AttentionLSTM.pdiparams` required for prediction.
+
+For the meaning of each parameter, please refer to [Model Reasoning Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0.0/docs/en/start.md#2-infer)
+
+### Use prediction engine inference
+
+```bash
+python3.7 tools/predict.py --input_file data/example.pkl \
+ --config configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml \
+ --model_file inference/AttentionLSTM/AttentionLSTM.pdmodel \
+ --params_file inference/AttentionLSTM/AttentionLSTM.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+An example of the output is as follows:
+```bash
+Current video file: data/example.pkl
+ top-1 class: 11
+ top-1 score: 0.9841002225875854
+```
+It can be seen that using the AttentionLSTM model trained on Youtube-8M to predict data/example.pkl, the output top1 category id is 11, and the confidence is 0.98.
+## Reference paper
+
+- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen
+- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan
diff --git a/docs/en/model_zoo/recognition/ctrgcn.md b/docs/en/model_zoo/recognition/ctrgcn.md
new file mode 100644
index 0000000000000000000000000000000000000000..bdec0aacc7e2c7fbf389c2b8a402e5cab5c1a325
--- /dev/null
+++ b/docs/en/model_zoo/recognition/ctrgcn.md
@@ -0,0 +1,128 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/ctrgcn.md) | English
+
+# CTR-GCN
+
+---
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+
+## Introduction
+
+[CTR-GCN](https://github.com/Uason-Chen/CTR-GCN.git) is a skeleton-based action recognition model proposed at ICCV 2021. By refining the channel-wise topology used in the graph convolution over human skeleton data and applying spatio-temporal graph convolutions to extract spatio-temporal features, it greatly improves the accuracy of skeleton-based action recognition.
+
+
+
+
+
+
+## Data
+
+Please refer to NTU-RGBD data download and preparation doc [NTU-RGBD](../../dataset/ntu-rgbd.md)
+
+
+## Train
+
+
+### Train on NTU-RGBD
+
+- Train CTR-GCN on NTU-RGBD with a single GPU:
+
+```bash
+# joint modality
+python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml --seed 1
+
+# bone modality
+python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone.yaml --seed 1
+
+# motion modality
+python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_motion.yaml --seed 1
+
+# bone motion modality
+python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone_motion.yaml --seed 1
+```
+
+- Train CTR-GCN on NTU-RGBD with multiple GPUs:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_ctrgcn main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml
+```
+
+- config file `ctrgcn_ntucs_joint.yaml` corresponding to the config of CTR-GCN on NTU-RGB+D dataset with cross-subject splits.
+
+
+## Test
+
+### Test on NTU-RGB+D
+
+- Test scripts:
+
+```bash
+# joint modality
+python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml -w data/CTRGCN_ntucs_joint.pdparams
+
+# bone modality
+python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone.yaml -w data/CTRGCN_ntucs_bone.pdparams
+
+# motion modality
+python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_motion.yaml -w data/CTRGCN_ntucs_motion.pdparams
+
+# bone motion modality
+python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone_motion.yaml -w data/CTRGCN_ntucs_bone_motion.pdparams
+```
+
+- Specify the config file with `-c`, specify the weight path with `-w`.
+
+
+Accuracy on NTU-RGB+D dataset:
+
+| split | modality | Top-1 | checkpoints |
+| :----: | :----: | :----: | :----: |
+| cross-subject | joint | 89.93 | [CTRGCN_ntucs_joint.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_joint.pdparams) |
+| cross-subject | bone | 85.24 | [CTRGCN_ntucs_bone.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_bone.pdparams) |
+| cross-subject | motion | 85.33 | [CTRGCN_ntucs_motion.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_motion.pdparams) |
+| cross-subject | bone motion | 84.53 | [CTRGCN_ntucs_bone_motion.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_bone_motion.pdparams) |
+
+
+## Inference
+
+### export inference model
+
+To get model architecture file `CTRGCN.pdmodel` and parameters file `CTRGCN.pdiparams`, use:
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml \
+                                -p data/CTRGCN_ntucs_joint.pdparams \
+                                -o inference/CTRGCN
+```
+
+- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).
+
+### infer
+
+```bash
+python3.7 tools/predict.py --input_file data/example_NTU-RGB-D_sketeton.npy \
+ --config configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml \
+                           --model_file inference/CTRGCN/CTRGCN.pdmodel \
+                           --params_file inference/CTRGCN/CTRGCN.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+example of logs:
+
+```
+Current video file: data/example_NTU-RGB-D_sketeton.npy
+ top-1 class: 4
+ top-1 score: 0.999988317489624
+```
+
+## Reference
+
+- [Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition](https://arxiv.org/abs/2107.12213), Chen, Yuxin and Zhang, Ziqi and Yuan, Chunfeng and Li, Bing and Deng, Ying and Hu, Weiming
diff --git a/docs/en/model_zoo/recognition/movinet.md b/docs/en/model_zoo/recognition/movinet.md
new file mode 100644
index 0000000000000000000000000000000000000000..317501938d580cf1cfe435fc0133d80fb6f18481
--- /dev/null
+++ b/docs/en/model_zoo/recognition/movinet.md
@@ -0,0 +1,91 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/movinet.md) | English
+
+# MoViNet
+
+---
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+## Introduction
+
+MoViNet is a mobile video network developed by Google Research. It uses causal convolutions with a stream buffer and temporal ensembles to improve accuracy. It is a lightweight and efficient video model that supports online inference on streaming video.
+
+
+## Data
+
+Please refer to Kinetics400 data download and preparation doc [k400-data](../../dataset/k400.md)
+
+
+## Train
+
+- Train MoViNet on kinetics-400 scripts:
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_movinet main.py --validate -c configs/recognition/movinet/movinet_k400_frame.yaml
+```
+
+## Test
+
+- For uniform sampling, the test accuracy can be found in the training logs by searching for the keyword `best`, for example:
+
+```txt
+Already save the best model (top1 acc)0.6489
+```
+
+- Test scripts:
+
+```bash
+python3.7 main.py --test -c configs/recognition/movinet/movinet_k400_frame.yaml -w output/MoViNet/MoViNet_best.pdparams
+```
+
+
+Accuracy on Kinetics400:
+
+| Config | Sampling method | num_seg | target_size | Top-1 | checkpoints |
+| :------: | :--------: | :-------: | :-------: | :-----: | :-----: |
+| A0 | Uniform | 50 | 172 | 66.62 | [MoViNetA0_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/MoViNetA0_k400.pdparams) |
+
+## Inference
+
+### export inference model
+
+ To get model architecture file `MoViNetA0.pdmodel` and parameters file `MoViNetA0.pdiparams`, use:
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/movinet/movinet_k400_frame.yaml \
+ -p data/MoViNetA0_k400.pdparams \
+ -o inference/MoViNetA0
+```
+
+- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).
+
+### infer
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/movinet/movinet_k400_frame.yaml \
+ --model_file inference/MoViNetA0/MoViNet.pdmodel \
+ --params_file inference/MoViNetA0/MoViNet.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+example of logs:
+
+```
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.7667049765586853
+```
+
+## Reference
+
+- [MoViNets: Mobile Video Networks for Efficient Video Recognition](https://arxiv.org/abs/2103.11511)
diff --git a/docs/en/model_zoo/recognition/pp-timesformer.md b/docs/en/model_zoo/recognition/pp-timesformer.md
new file mode 100644
index 0000000000000000000000000000000000000000..9acbc6487e7c83b7f416469450b3373674009728
--- /dev/null
+++ b/docs/en/model_zoo/recognition/pp-timesformer.md
@@ -0,0 +1,156 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/pp-timesformer.md) | English
+
+# PP-TimeSformer Video Classification Model
+
+## Content
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+
+## Introduction
+
+We improved the [TimeSformer model](./timesformer.md) and obtained a more accurate and practical video classification model, **PP-TimeSformer**. Without increasing the number of parameters or the amount of computation, its accuracy on UCF-101, Kinetics-400 and other datasets significantly exceeds the original version. The accuracy on the Kinetics-400 dataset is shown in the table below.
+
+| Version | Top1 |
+| :------ | :----: |
+| Ours ([swa](#refer-anchor-1)+distill+16frame) | 79.44 |
+| Ours ([swa](#refer-anchor-1)+distill) | 78.87 |
+| Ours ([swa](#refer-anchor-1)) | **78.61** |
+| [mmaction2](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/timesformer#kinetics-400) | 77.92 |
+
+
+## Data
+
+K400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)
+
+UCF101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md)
+
+
+## Train
+
+### Kinetics-400 data set training
+
+#### Download and add pre-trained models
+
+1. Download the image pre-training model [ViT_base_patch16_224_miil_21k.pdparams](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams) as Backbone initialization parameters, or download through wget command
+
+ ```bash
+ wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams
+ ```
+
+2. Open `PaddleVideo/configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:`
+
+ ```yaml
+ MODEL:
+ framework: "RecognizerTransformer"
+ backbone:
+ name: "VisionTransformer_tweaks"
+ pretrained: fill in the path here
+ ```
+
+#### Start training
+
+- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows:
+
+ ```bash
+ # videos data format
+    python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --validate -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml
+ ```
+
+- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:
+
+ ```bash
+ export FLAGS_conv_workspace_size_limit=800 # MB
+ export FLAGS_cudnn_exhaustive_search=1
+ export FLAGS_cudnn_batchnorm_spatial_persistent=1
+ # videos data format
+    python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --amp --validate -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml
+ ```
+
+- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage.
+
+
+## Test
+
+- The PP-TimeSformer model is validated during training. You can search the training log for the keyword `best` to obtain the validation accuracy. A log example is as follows:
+
+ ```
+ Already save the best model (top1 acc)0.7258
+ ```
+
+- The test mode of the PP-TimeSformer model uses **UniformCrop** sampling, which is slightly slower but more accurate than the **RandomCrop** used for validation during training. The `topk Acc` recorded in the training log therefore does not represent the final test score. After training finishes, run the best model in test mode to obtain the final metric. The commands are as follows:
+
+ ```bash
+ # 8-frames testing script
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --test -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml -w "output/ppTimeSformer/ppTimeSformer_best.pdparams"
+
+ # 16-frames testing script
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --test \
+ -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \
+ -o MODEL.backbone.num_seg=16 \
+ -o MODEL.runtime_cfg.test.num_seg=16 \
+ -o PIPELINE.test.decode.num_seg=16 \
+ -o PIPELINE.test.sample.num_seg=16 \
+ -w "data/ppTimeSformer_k400_16f_distill.pdparams"
+ ```
+
+
+ When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:
+
+ | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints |
+ | :----------------: | :-------------: | :-----: | :---------: | :---- | :----------------------------------------------------------: |
+ | Vision Transformer | UniformCrop | 8 | 224 | 78.61 | [ppTimeSformer_k400_8f.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_8f.pdparams) |
+ | Vision Transformer | UniformCrop | 8 | 224 | 78.87 | [ppTimeSformer_k400_8f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_8f_distill.pdparams) |
+ | Vision Transformer | UniformCrop | 16 | 224 | 79.44 | [ppTimeSformer_k400_16f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_16f_distill.pdparams) |
+
+
+- During testing, PP-TimeSformer uses linspace sampling: temporally, `num_seg` sparse sampling points (endpoints included) are generated uniformly from the first to the last frame of the video; spatially, 3 regions are cropped at the two ends and the middle of the longer side (left/middle/right, or top/middle/bottom). In total, 1 clip is sampled per video. A rough sketch of this scheme is given below.
+
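+The following sketch only illustrates the index computation described above; the helper names are made up and the real implementation lives in the PaddleVideo pipeline ops:
+
+```python
+import numpy as np
+
+def linspace_frame_indices(num_frames, num_seg=8):
+    # num_seg evenly spaced sampling points from the first to the last frame,
+    # endpoints included
+    return np.linspace(0, num_frames - 1, num_seg).astype(int)
+
+def three_crop_offsets(height, width, crop_size=224):
+    # top-left corners of the 3 crops taken along the longer side:
+    # left/middle/right for landscape frames, top/middle/bottom for portrait
+    if width >= height:
+        return [(0, 0), (0, (width - crop_size) // 2), (0, width - crop_size)]
+    return [(0, 0), ((height - crop_size) // 2, 0), (height - crop_size, 0)]
+
+# e.g. linspace_frame_indices(250, 8) -> array([  0,  35,  71, 106, 142, 177, 213, 249])
+```
+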
+## Inference
+
+### Export inference model
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \
+ -p data/ppTimeSformer_k400_8f.pdparams \
+ -o inference/ppTimeSformer
+```
+
+The above command will generate the model structure file `ppTimeSformer.pdmodel` and the model weight file `ppTimeSformer.pdiparams` required for prediction.
+
+- For the meaning of each parameter, please refer to [Model Inference](../../start.md#2-infer)
+
+### Use prediction engine inference
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \
+ --model_file inference/ppTimeSformer/ppTimeSformer.pdmodel \
+ --params_file inference/ppTimeSformer/ppTimeSformer.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+The output example is as follows:
+
+```
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9997474551200867
+```
+
+It can be seen that using the ppTimeSformer model trained on Kinetics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confidence is 0.99. By referring to the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be known that the predicted category name is `archery`.
+
+## Reference
+
+- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani
+- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean
+
+
+- [Averaging Weights Leads to Wider Optima and Better Generalization](https://arxiv.org/abs/1803.05407v3), Pavel Izmailov, Dmitrii Podoprikhin, Timur Garipov
+- [ImageNet-21K Pretraining for the Masses](https://arxiv.org/pdf/2104.10972v4.pdf), Tal Ridnik, Emanuel Ben-Baruch, Asaf Noy
diff --git a/docs/en/model_zoo/recognition/pp-tsm.md b/docs/en/model_zoo/recognition/pp-tsm.md
new file mode 100644
index 0000000000000000000000000000000000000000..b1ae1aa591b289e1b67518ea8301deff0dbf63a5
--- /dev/null
+++ b/docs/en/model_zoo/recognition/pp-tsm.md
@@ -0,0 +1,167 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/pp-tsm.md) | English
+
+# PP-TSM
+
+---
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+## Introduction
+
+We optimized the TSM model and propose **PP-TSM** in this repo. Without increasing the number of parameters, the accuracy of TSM is significantly improved on the UCF-101 and Kinetics-400 datasets. Please refer to [**Tricks on PP-TSM**](https://zhuanlan.zhihu.com/p/382134297) for more details.
+
+| Version | Sampling method | Top1 |
+| :------ | :----------: | :----: |
+| Ours (distill) | Dense | **76.16** |
+| Ours | Dense | 75.69 |
+| [mmaction2](https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/tsm/README.md) | Dense | 74.55 |
+| [mit-han-lab](https://github.com/mit-han-lab/temporal-shift-module) | Dense | 74.1 |
+
+
+| Version | Sampling method | Top1 |
+| :------ | :----------: | :----: |
+| Ours (distill) | Uniform | **75.11** |
+| Ours | Uniform | 74.54 |
+| [mmaction2](https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/tsm/README.md) | Uniform | 71.90 |
+| [mit-han-lab](https://github.com/mit-han-lab/temporal-shift-module) | Uniform | 71.16 |
+
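+The `distill` rows in the tables above are trained with knowledge distillation, i.e. a stronger teacher model provides soft targets in addition to the ground-truth labels. The sketch below shows a generic form of the distillation loss from Hinton et al. (see References); the temperature, weighting and teacher used for the released PP-TSM weights are not specified here, so every value should be read as an assumption:
+
+```python
+import paddle
+import paddle.nn.functional as F
+
+def distillation_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.5):
+    # Temperature-scaled soft-target KL term plus the usual hard-label
+    # cross-entropy; illustrative only, not the exact PP-TSM recipe.
+    soft = F.kl_div(F.log_softmax(student_logits / T, axis=-1),
+                    F.softmax(teacher_logits / T, axis=-1),
+                    reduction='mean') * (T * T)
+    hard = F.cross_entropy(student_logits, labels)
+    return alpha * soft + (1.0 - alpha) * hard
+```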
+
+## Data
+
+Please refer to Kinetics-400 data download and preparation doc [k400-data](../../dataset/k400.md)
+
+Please refer to UCF101 data download and preparation doc [ucf101-data](../../dataset/ucf101.md)
+
+
+## Train
+
+### Train on kinetics-400
+
+#### download pretrain-model
+
+Please download [ResNet50_vd_ssld_v2](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams) as the pretrained model:
+
+```bash
+wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams
+```
+
+and add path to `MODEL.framework.backbone.pretrained` in config file as:
+
+```yaml
+MODEL:
+ framework: "Recognizer2D"
+ backbone:
+ name: "ResNetTweaksTSM"
+ pretrained: your weight path
+```
+
+- If using ResNet101 as the backbone, please download [ResNet101_vd_ssld_pretrained.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ResNet101_vd_ssld_pretrained.pdparams) as the pretrained model.
+
+#### Start training
+
+- Train PP-TSM on kinetics-400 scripts:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml
+```
+
+- Train PP-TSM on kinetics-400 video data using scripts:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_videos_uniform.yaml
+```
+
+- AMP is useful for speeding up training:
+
+```bash
+export FLAGS_conv_workspace_size_limit=800 #MB
+export FLAGS_cudnn_exhaustive_search=1
+export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --amp --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml
+```
+
+- Train PP-TSM on kinetics-400 with dense sampling:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_dense.yaml
+```
+
+- Train PP-TSM on kinetics-400 with ResNet101 as backbone using dense sampling:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_dense_r101.yaml
+```
+
+
+## Test
+
+- For uniform sampling, the test accuracy can be found in the training logs by searching for the keyword `best`, for example:
+
+```txt
+Already save the best model (top1 acc)0.7454
+```
+
+- For dense sampling, the test accuracy can be obtained with the test script:
+
+```bash
+python3 main.py --test -c configs/recognition/pptsm/pptsm_k400_frames_dense.yaml -w output/ppTSM/ppTSM_best.pdparams
+```
+
+
+Accuracy on Kinetics400:
+
+| backbone | distill | Sampling method | num_seg | target_size | Top-1 | checkpoints |
+| :------: | :----------: | :----: | :----: | :----: | :----: | :---- |
+| ResNet50 | False | Uniform | 8 | 224 | 74.54 | [ppTSM_k400_uniform.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams) |
+| ResNet50 | False | Dense | 8 | 224 | 75.69 | [ppTSM_k400_dense.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense.pdparams) |
+| ResNet50 | True | Uniform | 8 | 224 | 75.11 | [ppTSM_k400_uniform_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform_distill.pdparams) |
+| ResNet50 | True | Dense | 8 | 224 | 76.16 | [ppTSM_k400_dense_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense_distill.pdparams) |
+| ResNet101 | True | Uniform | 8 | 224 | 76.35 | [ppTSM_k400_uniform_distill_r101.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSM_k400_uniform_distill_r101.pdparams) |
+| ResNet101 | False | Dense | 8 | 224 | 77.15 | [ppTSM_k400_dense_r101.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSM_k400_dense_r101.pdparams) |
+
+## Inference
+
+### export inference model
+
+ To get model architecture file `ppTSM.pdmodel` and parameters file `ppTSM.pdiparams`, use:
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \
+ -p data/ppTSM_k400_uniform.pdparams \
+ -o inference/ppTSM
+```
+
+- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).
+
+### infer
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \
+ --model_file inference/ppTSM/ppTSM.pdmodel \
+ --params_file inference/ppTSM/ppTSM.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+example of logs:
+
+```
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9907386302947998
+```
+
+We can get the class name from the class id using the mapping file `data/k400/Kinetics-400_label_list.txt`. The top-1 prediction of `data/example.avi` is `archery`.
+
+## Reference
+
+- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han
+- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean
diff --git a/docs/en/model_zoo/recognition/pp-tsn.md b/docs/en/model_zoo/recognition/pp-tsn.md
new file mode 100644
index 0000000000000000000000000000000000000000..68d9215fd92fc843ef5200f90bad859284797683
--- /dev/null
+++ b/docs/en/model_zoo/recognition/pp-tsn.md
@@ -0,0 +1,146 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/pp-tsn.md) | English
+
+# PP-TSN
+
+## Content
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+
+## Introduction
+
+We have improved the [TSN model](./tsn.md) and obtained **PP-TSN**, a more accurate and practical 2D video classification model. Without increasing the number of parameters or the amount of computation, its accuracy on UCF-101, Kinetics-400 and other datasets clearly exceeds the original version. The accuracy on the Kinetics-400 dataset is shown in the following table.
+
+| Version | Top1 |
+| :------ | :----: |
+| Ours (distill) | 75.06 |
+| Ours | **73.68** |
+| [mmaction2](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn#kinetics-400) | 71.80 |
+
+
+## Data
+
+K400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)
+
+UCF101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md)
+
+
+## Train
+
+### Kinetics-400 data set training
+
+#### Download and add pre-trained models
+
+1. Download the image distillation pre-training model [ResNet50_vd_ssld_v2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams) as the Backbone initialization parameter, or download it through wget
+
+ ```bash
+ wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams
+ ```
+
+2. Open `PaddleVideo/configs/recognition/pptsn/pptsn_k400_frames.yaml`, and fill in the downloaded weight storage path below `pretrained:`
+
+ ```yaml
+ MODEL:
+ framework: "Recognizer2D"
+ backbone:
+ name: "ResNetTweaksTSN"
+ pretrained: fill in the path here
+ ```
+
+#### Start training
+
+- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows:
+
+ ```bash
+ # frames data format
+    python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --validate -c configs/recognition/pptsn/pptsn_k400_frames.yaml
+
+ # videos data format
+    python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --validate -c configs/recognition/pptsn/pptsn_k400_videos.yaml
+ ```
+
+- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:
+
+ ```bash
+ export FLAGS_conv_workspace_size_limit=800 # MB
+ export FLAGS_cudnn_exhaustive_search=1
+ export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+ # frames data format
+    python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --amp --validate -c configs/recognition/pptsn/pptsn_k400_frames.yaml
+
+ # videos data format
+    python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --amp --validate -c configs/recognition/pptsn/pptsn_k400_videos.yaml
+ ```
+
+- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage.
+
+
+## Test
+
+- The PP-TSN model is validated during training. You can search the training log for the keyword `best` to obtain the validation accuracy. A log example is as follows:
+
+ ```
+ Already save the best model (top1 acc)0.7004
+ ```
+
+- The test mode of the PP-TSN model uses **TenCrop** sampling, which is slightly slower but more accurate than the **CenterCrop** used for validation during training. The `topk Acc` recorded in the training log therefore does not represent the final test score. After training finishes, run the best model in test mode to obtain the final metric. The command is as follows:
+
+ ```bash
+    python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --test -c configs/recognition/pptsn/pptsn_k400_frames.yaml -w "output/ppTSN/ppTSN_best.pdparams"
+ ```
+
+ When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:
+
+ | backbone | Sampling method | distill | num_seg | target_size | Top-1 | checkpoints |
+ | :------: | :-------------: | :-----: | :-----: | :---------: | :---- | :---------------------: |
+ | ResNet50 | TenCrop | False | 3 | 224 | 73.68 | [ppTSN_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams) |
+ | ResNet50 | TenCrop | True | 8 | 224 | 75.06 | [ppTSN_k400_8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400_8.pdparams) |
+
+- The PP-TSN test-time sampling strategy is TenCrop: temporally, the input video is evenly divided into `num_seg` segments and 1 frame is sampled from the middle of each segment; spatially, a 224x224 area is cropped at each of 5 sub-regions (top-left, top-right, centre, bottom-left, bottom-right) and their horizontal flips are added, giving 10 crops in total. 1 clip is sampled per video. A minimal TenCrop sketch is given after this list.
+
+- Distill being `True` means that the pre-trained model obtained by distillation is used. For the specific distillation scheme, please refer to [ppTSM Distillation Scheme]().
+
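+The sketch below illustrates TenCrop with NumPy on an HWC image; the helper name is made up and the real implementation is the crop op in the PaddleVideo pipeline:
+
+```python
+import numpy as np
+
+def ten_crop(img, crop_size=224):
+    # 5 fixed crops (4 corners + centre) plus their horizontal flips
+    h, w = img.shape[:2]
+    c = crop_size
+    corners = [(0, 0), (0, w - c), (h - c, 0), (h - c, w - c),
+               ((h - c) // 2, (w - c) // 2)]
+    crops = [img[y:y + c, x:x + c] for y, x in corners]
+    crops += [crop[:, ::-1] for crop in crops]  # add the mirrored versions
+    return np.stack(crops)                      # shape: (10, c, c, channels)
+```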
+
+## Inference
+
+### Export inference model
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_frames.yaml -p data/ppTSN_k400.pdparams -o inference/ppTSN
+```
+
+The above command will generate the model structure file `ppTSN.pdmodel` and the model weight files `ppTSN.pdiparams` and `ppTSN.pdiparams.info` required for prediction, all stored in the `inference/ppTSN/` directory.
+
+For the meaning of each parameter in the above bash command, please refer to [Model Inference](https://github.com/HydrogenSulfate/PaddleVideo/blob/PPTSN-v1/docs/en/start.md#2-infer)
+
+### Use prediction engine inference
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/pptsn/pptsn_k400_frames.yaml \
+ --model_file inference/ppTSN/ppTSN.pdmodel \
+ --params_file inference/ppTSN/ppTSN.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+The output example is as follows:
+
+```bash
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.998979389667511
+```
+
+It can be seen that using the PP-TSN model trained on Kinetics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confidence is 0.99. By consulting the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be known that the predicted category name is `archery`.
+
+## Reference
+
+- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/pdf/1608.00859.pdf), Limin Wang, Yuanjun Xiong, Zhe Wang
+- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean
diff --git a/docs/en/model_zoo/recognition/slowfast.md b/docs/en/model_zoo/recognition/slowfast.md
new file mode 100644
index 0000000000000000000000000000000000000000..45259f0f7ac34ccd95dd115ab7b0d81dcf2510ed
--- /dev/null
+++ b/docs/en/model_zoo/recognition/slowfast.md
@@ -0,0 +1,120 @@
+[简体中文 ](../../../zh-CN/model_zoo/recognition/slowfast.md) | English
+
+# SlowFast
+
+---
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+
+## Introduction
+
+SlowFast involves (i) a Slow pathway, operating at a low frame rate, to capture spatial semantics, and (ii) a Fast pathway, operating at a high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet it still learns useful temporal information for video recognition.
+
+
+
+SlowFast Overview
+
+
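+As a rough illustration of the two sampling rates, the sketch below picks the frame indices for both pathways; the function and its default values are illustrative, the actual numbers come from the config files:
+
+```python
+import numpy as np
+
+def slowfast_frame_indices(num_frames, alpha=8, fast_frames=32):
+    # The Fast pathway samples fast_frames frames densely across the clip;
+    # the Slow pathway keeps every alpha-th of them, i.e. alpha-times fewer
+    # frames at an alpha-times lower frame rate.
+    fast = np.linspace(0, num_frames - 1, fast_frames).astype(int)
+    slow = fast[::alpha]
+    return slow, fast
+
+# e.g. slow, fast = slowfast_frame_indices(64)  # 4 slow frames, 32 fast frames
+```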
+
+## Data
+
+We use Kinetics-400 to train this model,data preparation please refer to [Kinetics-400 dataset](../../dataset/k400.md).
+
+
+## Train
+
+You can start training by:
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+python -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_slowfast main.py --validate -c configs/recognition/slowfast/slowfast.yaml
+```
+
+- Training is efficient with our code: the training speed is 2x faster than the original implementation. Details can be found in the [benchmark](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/en/benchmark.md).
+
+### Speed up training
+
+It is time-consuming to train the SlowFast model, so we implement the [Multigrid training strategy](https://arxiv.org/abs/1912.00998) to speed up training. Training script:
+
+```bash
+python -B -m paddle.distributed.launch --selected_gpus="0,1,2,3,4,5,6,7" --log_dir=log-slowfast main.py --validate --multigrid -c configs/recognition/slowfast/slowfast_multigrid.yaml
+```
+
+Performance evaluation:
+
+| training strategy | time cost of one epoch/min | total training time/min | speed-up |
+| :------ | :-----: | :------: |:------: |
+| Multigrid | 27.25 | 9758 (6.7 days) | 2.89x |
+| Normal | 78.76 | 15438 (10.7 days) | base |
+
+For more details, please refer to [accelerate doc](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/tutorials/accelerate.md#%E8%AE%AD%E7%BB%83%E7%AD%96%E7%95%A5%E5%8A%A0%E9%80%9F).
+
+
+## Test
+
+You can start testing by:
+
+```bash
+python -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_slowfast_test main.py --test -c configs/recognition/slowfast/slowfast.yaml -w output/SlowFast/SlowFast_epoch_000196.pdparams
+```
+
+- The `-w` argument is used to specify the model path. You can download our model from [SlowFast.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams).
+
+
+Test accuracy in Kinetics-400:
+
+| Configs | Acc1 | Acc5 | Weights |
+| :---: | :---: | :---: | :---: |
+| [slowfast.yaml](../../../../configs/recognition/slowfast/slowfast.yaml) | 74.35 | 91.33 | [slowfast_4x16.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams) |
+| [slowfast_multigrid.yaml](../../../../configs/recognition/slowfast/slowfast_multigrid.yaml) | 75.84 | 92.33 | [slowfast_8x8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) |
+
+- Acc1 may be lower than that reported in the paper, as ~5% of the Kinetics-400 data is missing. Experiments have verified that training with the same data gives the same accuracy.
+
+
+## Inference
+
+### export inference model
+
+ To get model architecture file `SlowFast.pdmodel` and parameters file `SlowFast.pdiparams`, use:
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/slowfast/slowfast.yaml \
+ -p data/SlowFast.pdparams \
+ -o inference/SlowFast
+```
+
+- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).
+
+### infer
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/slowfast/slowfast.yaml \
+ --model_file inference/SlowFast/SlowFast.pdmodel \
+ --params_file inference/SlowFast/SlowFast.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+example of logs:
+
+```
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 1.0
+```
+
+We can get the class name from the class id using the mapping file `data/k400/Kinetics-400_label_list.txt`. The top-1 prediction of `data/example.avi` is `archery`.
+
+
+## Reference
+
+- [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982), Feichtenhofer C, Fan H, Malik J, et al.
diff --git a/docs/en/model_zoo/recognition/stgcn.md b/docs/en/model_zoo/recognition/stgcn.md
new file mode 100644
index 0000000000000000000000000000000000000000..14585e5e4d491394fc0ab11692e9631ccfec9a82
--- /dev/null
+++ b/docs/en/model_zoo/recognition/stgcn.md
@@ -0,0 +1,129 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/stgcn.md) | English
+
+# ST-GCN
+
+---
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+
+## Introduction
+
+ST-GCN is a skeleton-based action recognition model proposed at AAAI 2018.
+
+
+
+
+
+
+## Data
+
+Please refer to FSD data download and preparation doc [FSD](../../dataset/fsd.md)
+
+Please refer to NTU-RGBD data download and preparation doc [NTU-RGBD](../../dataset/ntu-rgbd.md)
+
+
+## Train
+
+### Train on FSD
+
+- Train ST-GCN on FSD scripts:
+
+```bash
+python3.7 main.py -c configs/recognition/stgcn/stgcn_fsd.yaml
+```
+
+- Turn off validation (`valid`) when training, as the validation dataset is not available for the competition.
+
+### Train on NTU-RGBD
+
+- Train ST-GCN on NTU-RGBD scripts:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_stgcn main.py --validate -c configs/recognition/stgcn/stgcn_ntucs.yaml
+```
+
+- The config file `stgcn_ntucs.yaml` corresponds to ST-GCN on the NTU-RGB+D dataset with the cross-subject split.
+
+
+## Test
+
+### Test on FSD
+
+- Test scripts:
+
+```bash
+python3.7 main.py --test -c configs/recognition/stgcn/stgcn_fsd.yaml -w output/STGCN/STGCN_epoch_00090.pdparams
+```
+
+- Specify the config file with `-c`, specify the weight path with `-w`.
+
+- Evaluation results will be saved in `submission.csv` file, final score can be obtained in [competition website](https://aistudio.baidu.com/aistudio/competition/detail/115).
+
+Accuracy on FSD-10 dataset:
+
+| Test_Data | Top-1 | checkpoints |
+| :----: | :----: | :---- |
+| Test_A | 59.07 | [STGCN_fsd.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_fsd.pdparams) |
+
+
+### Test on NTU-RGB+D
+
+- Test scripts:
+
+```bash
+python3.7 main.py --test -c configs/recognition/stgcn/stgcn_ntucs.yaml -w output/STGCN/STGCN_best.pdparams
+```
+
+- Specify the config file with `-c`, specify the weight path with `-w`.
+
+
+Accuracy on NTU-RGB+D dataset:
+
+| split | Top-1 | checkpoints |
+| :----: | :----: | :---- |
+| cross-subject | 82.28 | [STGCN_ntucs.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_ntucs.pdparams) |
+
+
+## Inference
+
+### export inference model
+
+ To get model architecture file `STGCN.pdmodel` and parameters file `STGCN.pdiparams`, use:
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/stgcn/stgcn_fsd.yaml \
+ -p data/STGCN_fsd.pdparams \
+ -o inference/STGCN
+```
+
+- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).
+
+### infer
+
+```bash
+python3.7 tools/predict.py --input_file data/fsd10/example_skeleton.npy \
+ --config configs/recognition/stgcn/stgcn_fsd.yaml \
+ --model_file inference/STGCN/STGCN.pdmodel \
+ --params_file inference/STGCN/STGCN.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+example of logs:
+
+```
+Current video file: data/fsd10/example_skeleton.npy
+ top-1 class: 27
+ top-1 score: 0.9912770986557007
+```
+
+## Reference
+
+- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin
diff --git a/docs/en/model_zoo/recognition/timesformer.md b/docs/en/model_zoo/recognition/timesformer.md
new file mode 100644
index 0000000000000000000000000000000000000000..c004b9b0457b6e8006c7561bde80795d42dd4ed9
--- /dev/null
+++ b/docs/en/model_zoo/recognition/timesformer.md
@@ -0,0 +1,137 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/timesformer.md) | English
+
+# TimeSformer
+
+## Content
+
+- [Introduction](#Introduction)
+- [Data](#DATA)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+
+## Introduction
+
+TimeSformer is a video classification model based on the vision transformer. It is convolution-free, has a global receptive field, and has strong temporal modeling ability. It achieves SOTA accuracy on the Kinetics-400 dataset, surpassing the classic CNN-based video classification models TSN, TSM and SlowFast, while requiring a shorter training time (39 hours on the Kinetics-400 dataset). **This code implements the divided space-time attention (cascaded time and space attention) variant from the paper.**
+
+
+
+
+
+
+## Data
+
+K400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)
+
+UCF101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md)
+
+
+## Train
+
+### Kinetics-400 data set training
+
+#### Download and add pre-trained models
+
+1. Download the image pre-training model [ViT_base_patch16_224](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams) as Backbone initialization parameters, or download through the wget command
+
+ ```bash
+ wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams
+ ```
+
+2. Open `PaddleVideo/configs/recognition/timesformer/timesformer_k400_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:`
+
+ ```yaml
+ MODEL:
+ framework: "RecognizerTransformer"
+ backbone:
+ name: "VisionTransformer"
+ pretrained: fill in the path here
+ ```
+
+#### Start training
+
+- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows:
+
+```bash
+# videos data format
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --validate -c configs/recognition/timesformer/timesformer_k400_videos.yaml
+```
+
+- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:
+
+```bash
+export FLAGS_conv_workspace_size_limit=800 # MB
+export FLAGS_cudnn_exhaustive_search=1
+export FLAGS_cudnn_batchnorm_spatial_persistent=1
+# videos data format
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --amp --validate -c configs/recognition/timesformer/timesformer_k400_videos.yaml
+```
+
+- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage.
+
+
+## Test
+
+- The TimeSformer model is validated during training. You can search the training log for the keyword `best` to obtain the validation accuracy. A log example is as follows:
+
+ ```
+ Already save the best model (top1 acc)0.7258
+ ```
+
+- The test mode of the TimeSformer model uses **UniformCrop** sampling, which is slower but more accurate than the **RandomCrop** used for validation during training. The `topk Acc` recorded in the training log therefore does not represent the final test score. After training finishes, run the best model in test mode to obtain the final metric. The command is as follows:
+
+ ```bash
+    python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --test -c configs/recognition/timesformer/timesformer_k400_videos.yaml -w "output/TimeSformer/TimeSformer_best.pdparams"
+ ```
+
+
+ When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:
+
+
+ | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints |
+ | :----------------: | :-----: | :-----: | :---------: | :----: | :----------------------------------------------------------: |
+ | Vision Transformer | UniformCrop | 8 | 224 | 77.29 | [TimeSformer_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TimeSformer_k400.pdparams) |
+
+
+- During testing, TimeSformer uses linspace sampling: temporally, `num_seg` sparse sampling points are generated uniformly over the video sequence; spatially, 3 regions are cropped at the two ends and the middle of the longer side (left/middle/right, or top/middle/bottom). In total, 1 clip is sampled per video.
+
+## Inference
+
+### Export inference model
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/timesformer/timesformer_k400_videos.yaml \
+ -p data/TimeSformer_k400.pdparams \
+ -o inference/TimeSformer
+```
+
+The above command will generate the model structure file `TimeSformer.pdmodel` and the model weight file `TimeSformer.pdiparams` required for prediction.
+
+- For the meaning of each parameter, please refer to [Model Reasoning Method](../../start.md#2-infer)
+
+### Use prediction engine inference
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/timesformer/timesformer_k400_videos.yaml \
+ --model_file inference/TimeSformer/TimeSformer.pdmodel \
+ --params_file inference/TimeSformer/TimeSformer.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+The output example is as follows:
+
+```
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9999722242355347
+```
+
+It can be seen that using the TimeSformer model trained on Kinetics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confidence is 0.99. By consulting the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be seen that the predicted category name is `archery`.
+
+## Reference
+
+- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani
diff --git a/docs/en/model_zoo/recognition/tsm.md b/docs/en/model_zoo/recognition/tsm.md
new file mode 100644
index 0000000000000000000000000000000000000000..e44ea6be5d976f0437971add6550e8c0cac68de0
--- /dev/null
+++ b/docs/en/model_zoo/recognition/tsm.md
@@ -0,0 +1,221 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/tsm.md) | English
+
+# TSM
+
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Details](#Details)
+- [Reference](#Reference)
+
+## Introduction
+
+Temporal Shift Module (TSM) is a popular model that currently attracts wide attention. Its channel-shifting method greatly improves the use of temporal information without adding any parameters or computation. Moreover, thanks to its lightweight and efficient design, it is well suited to industrial deployment.
+
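+A minimal sketch of the channel-shift idea follows; this is not the repo's backbone implementation, only an illustration of the operation:
+
+```python
+import paddle
+
+def temporal_shift(x, num_seg, shift_div=8):
+    # x: feature map of shape [N * num_seg, C, H, W], as used by TSM
+    nt, c, h, w = x.shape
+    n = nt // num_seg
+    x = x.reshape([n, num_seg, c, h, w])
+    fold = c // shift_div
+    pad = paddle.zeros([n, 1, fold, h, w], dtype=x.dtype)
+    # the first `fold` channels move one segment back in time, the next `fold`
+    # move one segment forward, and the remaining channels are left untouched
+    left = paddle.concat([x[:, 1:, :fold], pad], axis=1)
+    right = paddle.concat([pad, x[:, :-1, fold:2 * fold]], axis=1)
+    keep = x[:, :, 2 * fold:]
+    out = paddle.concat([left, right, keep], axis=2)
+    return out.reshape([nt, c, h, w])
+```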
+
+
+
+
+
+This code implements the **single RGB stream** version of the TSM network, with ResNet-50 as the backbone.
+
+For details, please refer to the ICCV 2019 paper [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf)
+
+## Data
+
+Please refer to Kinetics-400 data download and preparation [k400 data preparation](../../dataset/k400.md)
+
+Please refer to UCF101 data download and preparation [ucf101 data preparation](../../dataset/ucf101.md)
+
+
+## Train
+
+### Train on the Kinetics-400 dataset
+
+#### download pretrain-model
+
+1. Please download [ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams) as the pretrained model:
+
+ ```bash
+ wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams
+ ```
+
+2. Open `PaddleVideo/configs/recognition/tsm/tsm_k400_frames.yaml`, and fill in the downloaded weight path below `pretrained:`
+
+    ```yaml
+ MODEL:
+ framework: "Recognizer2D"
+ backbone:
+ name: "ResNetTSM"
+ pretrained: your weight path
+ ```
+
+#### Start training
+
+- By specifying different configuration files, you can train on different data formats/datasets. Taking the Kinetics-400 dataset + 8 cards + frames format as an example, the start command is as follows (more training commands can be found in `PaddleVideo/run.sh`).
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_frames.yaml
+ ```
+
+- Training Kinetics-400 dataset of videos format using scripts.
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_videos.yaml
+ ```
+
+- AMP is useful for speeding up training, scripts as follows:
+
+```bash
+export FLAGS_conv_workspace_size_limit=800 #MB
+export FLAGS_cudnn_exhaustive_search=1
+export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames.yaml
+```
+
+- AMP works better with `NHWC` data format, scripts as follows:
+
+```bash
+export FLAGS_conv_workspace_size_limit=800 #MB
+export FLAGS_cudnn_exhaustive_search=1
+export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames_nhwc.yaml
+```
+
+- For the config file usage,please refer to [config](../../tutorials/config.md).
+
+### Train on UCF-101 dataset
+
+#### download pretrain-model
+
+- Load the TSM model we trained on Kinetics-400 [TSM_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams), or download it through the command line
+
+ ```bash
+ wget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams
+ ```
+
+- Open `PaddleVideo/configs/recognition/tsm/tsm_ucf101_frames.yaml`, and fill in the downloaded weight path below `pretrained:`
+
+    ```yaml
+ MODEL:
+ framework: "Recognizer2D"
+ backbone:
+ name: "ResNetTSM"
+ pretrained: your weight path
+ ```
+
+#### Start training
+
+- By specifying different configuration files, you can train on different data formats/datasets. Taking the UCF-101 dataset + 4 cards + frames format as an example, the start command is as follows (more training commands can be found in `PaddleVideo/run.sh`).
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_ucf101_frames.yaml
+ ```
+
+- Training UCF-101 dataset of videos format using scripts.
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_ucf101_videos.yaml
+ ```
+
+- AMP is useful for speeding up training, scripts as follows:
+
+ ```bash
+ export FLAGS_conv_workspace_size_limit=800 #MB
+ export FLAGS_cudnn_exhaustive_search=1
+ export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_ucf101_frames.yaml
+ ```
+
+- AMP works better with `NHWC` data format, scripts as follows:
+
+ ```bash
+ export FLAGS_conv_workspace_size_limit=800 #MB
+ export FLAGS_cudnn_exhaustive_search=1
+ export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_ucf101_frames_nhwc.yaml
+ ```
+
+## Test
+
+Put the weights of the model to be tested into the `output/TSM/` directory. The test command is as follows:
+
+```bash
+python3 main.py --test -c configs/recognition/tsm/tsm.yaml -w output/TSM/TSM_best.pdparams
+```
+
+---
+
+When the test configuration uses the following parameters, the evaluation accuracy on the validation data set of Kinetics-400 is as follows:
+
+| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints |
+| :--------: | :---------------: | :-------: | :-----------: | :-----: | :-----------: | :-----------: |
+| ResNet50 | Uniform | NCHW | 8 | 224 | 71.06 | [TSM_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams) |
+
+When the test configuration uses the following parameters, the evaluation accuracy on the validation data set of UCF-101 is as follows:
+
+| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints |
+| :------: | :-------------: | :-----------------: | :-----: | :---------: | :---: | :---------: |
+| ResNet50 | Uniform | NCHW | 8 | 224 | 94.42 | [TSM_ucf101_nchw.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_nchw.pdparams) |
+| ResNet50 | Uniform | NCHW+AMP | 8 | 224 | 94.40 | [TSM_ucf101_amp_nchw.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_amp_nchw.pdparams) |
+| ResNet50 | Uniform | NHWC+AMP | 8 | 224 | 94.55 | [TSM_ucf101_amp_nhwc.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_amp_nhwc.pdparams) |
+
+## Inference
+
+### export inference model
+
+To get model architecture file `TSM.pdmodel` and parameters file `TSM.pdiparams`, use:
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/tsm/tsm_k400_frames.yaml \
+ -p data/TSM_k400.pdparams \
+ -o inference/TSM
+```
+
+- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).
+
+### infer
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/tsm/tsm_k400_frames.yaml \
+ --model_file inference/TSM/TSM.pdmodel \
+ --params_file inference/TSM/TSM.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+## Details
+
+### data processing
+
+- The model reads the `mp4` data of the Kinetics-400 dataset. Each video is first divided into `num_seg` segments, and 1 frame is uniformly sampled from each segment, giving `num_seg` sparsely sampled frames. The same random data augmentation is then applied to these frames, including multi-scale random cropping, random horizontal flipping and data normalization, and finally they are resized to `target_size`.
+
+### Training strategy
+
+* Momentum optimizer with momentum=0.9 (a minimal sketch of this setup is given after this list)
+* L2_Decay with a weight decay coefficient of 1e-4
+* Global gradient clipping with a clipping factor of 20.0
+* 50 epochs in total; the learning rate is decayed by a factor of 0.1 at epochs 20 and 40
+* The learning rates of the FC layer's weight and bias are 5x and 10x the base learning rate respectively, and no L2_Decay is applied to the bias
+* Dropout_ratio=0.5
+
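+The following sketch shows this optimizer setup with the Paddle API; the model below is a stand-in, the base learning rate is illustrative, and the per-parameter FC multipliers are configured in the yaml config rather than here:
+
+```python
+import paddle
+
+model = paddle.nn.Linear(2048, 400)  # stand-in for the real recognizer
+
+# 0.1x decay at epochs 20 and 40 (call scheduler.step() once per epoch)
+scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[20, 40],
+                                               values=[0.01, 0.001, 0.0001])
+clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=20.0)  # global gradient clipping
+optimizer = paddle.optimizer.Momentum(
+    learning_rate=scheduler,
+    momentum=0.9,
+    parameters=model.parameters(),
+    weight_decay=paddle.regularizer.L2Decay(1e-4),
+    grad_clip=clip)
+# The 5x/10x FC learning rates and the decay-free bias are set per-parameter
+# (e.g. via paddle.ParamAttr) in the actual config and are not shown here.
+```
+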
+### Parameter initialization
+
+- The weight of the FC layer is initialized with a Normal(mean=0, std=0.001) distribution, and the bias of the FC layer is initialized to the constant 0
+
+
+## Reference
+
+- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han
diff --git a/docs/en/model_zoo/recognition/tsn.md b/docs/en/model_zoo/recognition/tsn.md
new file mode 100644
index 0000000000000000000000000000000000000000..cc157445939b29665e44caf8b64c7cc7da41c968
--- /dev/null
+++ b/docs/en/model_zoo/recognition/tsn.md
@@ -0,0 +1,123 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/tsn.md) | English
+
+# TSN
+
+## Content
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Details](#Details)
+- [Reference](#Reference)
+
+## Introduction
+
+Temporal Segment Network (TSN) is a classic 2D-CNN-based solution for video classification. It mainly addresses long-range action recognition in video: by replacing dense sampling with sparse sampling of video frames, it captures the global information of the video while removing redundancy and reducing computation. Its core idea is to average the per-frame features as the overall feature of the video and then feed it to the classifier. The model implemented in this code is a TSN network based on a single RGB stream, with ResNet-50 as the backbone.
+
+
+
+
+
+
+For details, please refer to the ECCV 2016 paper [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859)
+
+## Data
+
+PaddleVideo provides training and testing scripts on the Kinetics-400 dataset. For Kinetics-400 data download and preparation, please refer to [Kinetics-400 data preparation](../../dataset/k400.md)
+
+## Train
+
+### Kinetics-400 data set training
+
+#### Download and add pre-trained models
+
+1. Load the ResNet50 weights trained on ImageNet1000, [ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams), as the Backbone initialization parameters, or download them through the command line
+
+ ```bash
+ wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams
+ ```
+
+2. Open `PaddleVideo/configs/recognition/tsn/tsn_k400_frames.yaml`, and fill in the downloaded weight path below `pretrained:`
+
+ ```yaml
+ MODEL:
+ framework: "Recognizer2D"
+ backbone:
+ name: "ResNet"
+ pretrained: fill in the path here
+ ```
+
+#### Start training
+
+- Kinetics-400 data set uses 8 cards for training, the training start command for frames format data is as follows
+
+ ```bash
+    python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsn main.py --validate -c configs/recognition/tsn/tsn_k400_frames.yaml
+ ```
+
+## Test
+
+The test mode of the TSN model uses **TenCrop** sampling, which is slower but more accurate than the **CenterCrop** used for validation during training. The `topk Acc` recorded in the training log therefore does not represent the final test score. After training finishes, run the best model in test mode to obtain the final metric. The command is as follows:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsn main.py --test -c configs/recognition/tsn/tsn_k400_frames.yaml -w "output/TSN/TSN_best.pdparams"
+```
+
+When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:
+
+| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints |
+| :------: | :-------------: | :---------------: | :-----: | :---------: | :---: | :----------------------------------------------------------: |
+| ResNet50 | TenCrop | NCHW | 3 | 224 | 69.81 | [TSN_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400.pdparams) |
+| ResNet50 | TenCrop | NCHW | 8 | 224 | 71.70 | [TSN_k400_8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400_8.pdparams) |
+
+## Inference
+
+### export inference model
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/tsn/tsn_k400_frames.yaml \
+ -p data/TSN_k400.pdparams \
+ -o inference/TSN
+```
+
+The above command will generate the model structure file `TSN.pdmodel` and the model weight file `TSN.pdiparams` required for prediction.
+
+For the meaning of each parameter, please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### infer
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/tsn/tsn_k400_frames.yaml \
+ --model_file inference/TSN/TSN.pdmodel \
+ --params_file inference/TSN/TSN.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+## Details
+
+**data processing:**
+
+- The model reads the `mp4` data of the Kinetics-400 dataset. Each video is first divided into `num_seg` segments, and 1 frame is sampled from each segment, giving `num_seg` sparsely sampled frames (see the sampling sketch below). The same random data augmentation is then applied to these frames, including multi-scale random cropping, random horizontal flipping and data normalization, and finally they are resized to `target_size`.
+
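+The sketch below only shows the index computation for this segment-based sparse sampling; the helper name and its defaults are made up for illustration:
+
+```python
+import numpy as np
+
+def sample_frame_indices(num_frames, num_seg=8, training=True):
+    # Split the video into num_seg equal segments and pick one frame per
+    # segment: a random offset during training, the segment centre otherwise.
+    seg_len = num_frames // num_seg
+    base = np.arange(num_seg) * seg_len
+    if training and seg_len > 0:
+        offsets = np.random.randint(seg_len, size=num_seg)
+    else:
+        offsets = np.full(num_seg, max(seg_len // 2, 0))
+    return base + offsets
+
+# e.g. sample_frame_indices(300, 8, training=False) -> centre frame of each of the 8 segments
+```
+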
+**training strategy:**
+
+- Momentum optimizer with momentum=0.9
+
+- L2_Decay with a weight decay coefficient of 1e-4
+
+- Global gradient clipping with a clipping factor of 40.0
+
+- 100 epochs in total; the learning rate is decayed by a factor of 0.1 at epochs 40 and 80
+
+- Dropout_ratio=0.4
+
+**parameter initialization**
+
+- The convolutional layers of the TSN model use Paddle's default [KaimingNormal](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/nn/initializer/KaimingNormal_cn.html#kaimingnormal) and [Constant](https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/Constant_cn.html#constant) initializers; the weight of the FC layer is initialized with a Normal(mean=0, std=0.01) distribution and its bias with the constant 0 (a short initialization sketch follows)
+
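+The sketch below expresses this initialization with the Paddle API; the layer sizes are illustrative and not taken from the actual network definition:
+
+```python
+import paddle
+from paddle import nn
+
+# FC head: Normal(0, 0.01) weight, constant-0 bias (2048 -> 400 is illustrative)
+fc = nn.Linear(
+    2048, 400,
+    weight_attr=paddle.ParamAttr(
+        initializer=nn.initializer.Normal(mean=0.0, std=0.01)),
+    bias_attr=paddle.ParamAttr(initializer=nn.initializer.Constant(value=0.0)))
+
+# An explicit KaimingNormal for a conv layer would look like:
+conv = nn.Conv2D(
+    3, 64, kernel_size=7,
+    weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingNormal()))
+```
+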
+## Reference
+
+- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool
diff --git a/docs/en/model_zoo/recognition/tsn_dali.md b/docs/en/model_zoo/recognition/tsn_dali.md
new file mode 100644
index 0000000000000000000000000000000000000000..affaf0ad5f7effae429e851f4740b953f09c4996
--- /dev/null
+++ b/docs/en/model_zoo/recognition/tsn_dali.md
@@ -0,0 +1,98 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/tsn_dali.md) | English
+
+# TSN DALI
+
+- [Introduction](#Introduction)
+- [Requirement](#Requirement)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+## Introduction
+
+We aim to speed up TSN model training using DALI in this code. As [NVIDIA DALI](https://github.com/NVIDIA/DALI) does not support the TSN sampling scheme, we re-implemented segment sampling in VideoReader.
+
+### Performance
+
+Test Environment:
+```
+Card: Tesla V100
+Memory: 4 * 16G
+CUDA: 9.0
+batch_size per card: 32
+```
+
+| Training way | batch cost/s | reader cost/s | ips:instance/sec | Speed up |
+| :--------------- | :--------: | :------------: | :------------: | :------------: |
+| DALI | 2.083 | 1.804 | 15.36597 | 1.41x |
+| Dataloader: num_workers=4 | 2.943 | 2.649 | 10.87460| base |
+| PyTorch implementation | TODO | TODO | TODO | TODO |
+
+
+## Requirement
+
+docker image:
+
+```
+ huangjun12/paddlevideo:tsn_dali_cuda9_0
+```
+
+To build container, you can use:
+
+```bash
+nvidia-docker run --name tsn-DALI -v /home:/workspace --network=host -it --shm-size 64g -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,video huangjun12/paddlevideo:tsn_dali_cuda9_0 /bin/bash
+```
+
+## Data
+
+- Kinetics400 dataset please refer to [K400 data](../../dataset/k400.md)
+
+- UCF101 dataset please refer to [UCF101 data](../../dataset/ucf101.md)
+
+## Train
+
+### download pretrain-model
+
+- Please download [ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams) as the pretrained model:
+
+```bash
+wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams
+```
+
+and add the path to `MODEL.framework.backbone.pretrained` in the config file as:
+
+```yaml
+MODEL:
+ framework: "Recognizer2D"
+ backbone:
+ name: "ResNet"
+ pretrained: your weight path
+```
+
+### Start training
+
+You can start training by:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsn main.py --train_dali -c configs/recognition/tsn/tsn_dali.yaml -o log_level="INFO"
+```
+
+- The `-c` argument is used to specify the config file; the default is `configs/recognition/tsn/tsn_dali.yaml`.
+
+- For finetuning, please download our trained model [TSN.pdparams]() (coming soon) and specify its file path with `--weights`.
+
+- For the config file usage,please refer to [config](../../tutorials/config.md).
+
+## Test
+
+Please refer to [TSN Test](./tsn.md)
+
+## Inference
+
+Please refer to [TSN Inference](./tsn.md)
+
+## Reference
+
+- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool
diff --git a/docs/en/model_zoo/recognition/videoswin.md b/docs/en/model_zoo/recognition/videoswin.md
new file mode 100644
index 0000000000000000000000000000000000000000..5ba3ab645d178f4690953d145b8a44201cb30bba
--- /dev/null
+++ b/docs/en/model_zoo/recognition/videoswin.md
@@ -0,0 +1,130 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/videoswin.md) | English
+
+# Video-Swin-Transformer Video Classification Model
+
+## Content
+
+- [Introduction](#Introduction)
+- [Data](#DATA)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+
+## Introduction
+
+Video-Swin-Transformer is a video classification model based on the Swin Transformer. It utilizes the Swin Transformer's multi-scale modeling and efficient local attention. It currently achieves SOTA accuracy on the Kinetics-400 dataset, surpassing the TimeSformer model, which uses a similar transformer structure.
+
+
+
+
+## DATA
+
+K400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)
+
+
+## Train
+
+### Kinetics-400 data set training
+
+#### Download and add pre-trained models
+
+1. Download the image pre-training model [SwinTransformer_imagenet.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/SwinTransformer_imagenet.pdparams) as the Backbone initialization parameter, or download it through the wget command
+
+ ```bash
+ wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/SwinTransformer_imagenet.pdparams
+ ```
+
+2. Open `configs/recognition/videoswin/videoswin_k400_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:`
+
+ ```yaml
+ MODEL:
+ framework: "RecognizerTransformer"
+ backbone:
+ name: "SwinTransformer3D"
+ pretrained: fill in the path here
+ ```
+
+#### Start training
+
+- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows:
+
+ ```bash
+ # videos data format
+    python3.7 -u -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin main.py --validate -c configs/recognition/videoswin/videoswin_k400_videos.yaml
+ ```
+
+- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:
+
+ ```bash
+ export FLAGS_conv_workspace_size_limit=800 # MB
+ export FLAGS_cudnn_exhaustive_search=1
+ export FLAGS_cudnn_batchnorm_spatial_persistent=1
+ # videos data format
+  python3.7 -u -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin main.py --amp --validate -c configs/recognition/videoswin/videoswin_k400_videos.yaml
+ ```
+
+- In addition, you can customize the parameter configuration to train/test on different datasets. It is recommended to name configuration files as `model_dataset_fileformat_dataformat_samplingmethod.yaml`. Please refer to [config](../../tutorials/config.md) for parameter usage.
+
+
+## Test
+
+- The Video-Swin-Transformer model is validated during training. You can search for the keyword `best` in the training log to obtain the model accuracy. A log example is as follows:
+
+ ```
+ Already save the best model (top1 acc)0.7258
+ ```
+
+- The test mode of the Video-Swin-Transformer model uses **UniformCrop** sampling, which is slower but more accurate than the **CenterCrop** used for validation during training. Therefore, the validation metric `topk Acc` recorded in the training log does not represent the final test score. After training is completed, run the test mode on the best model to obtain the final metric. The command is as follows:
+
+ ```bash
+  python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin main.py --test -c configs/recognition/videoswin/videoswin_k400_videos.yaml -w "output/VideoSwin/VideoSwin_best.pdparams"
+ ```
+
+
+  When the test configuration uses the following parameters, the test metrics on the Kinetics-400 validation set are as follows:
+
+ | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints |
+ | :----------------: | :-------------: | :-----: | :---------: | :---- | :----------------------------------------------------------: |
+ | Swin Transformer | UniformCrop | 32 | 224 | 82.40 | [SwinTransformer_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_k400.pdparams) |
+
+
+## Inference
+
+### Export inference model
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/videoswin/videoswin_k400_videos.yaml \
+ -p data/VideoSwin_k400.pdparams \
+ -o inference/VideoSwin
+```
+
+The above command will generate the model structure file `VideoSwin.pdmodel` and the model weight file `VideoSwin.pdiparams` required for prediction.
+
+- For the meaning of each parameter, please refer to [Model Inference](../../start.md#2-Model Reasoning)
+
+### Infer with the prediction engine
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/videoswin/videoswin_k400_videos.yaml \
+ --model_file inference/VideoSwin/VideoSwin.pdmodel \
+ --params_file inference/VideoSwin/VideoSwin.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+The output example is as follows:
+
+```
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9999829530715942
+```
+
+It can be seen that, using the Video-Swin-Transformer model trained on Kinetics-400 to predict `data/example.avi`, the output top-1 class id is `5` with a confidence of 0.99. Looking it up in the class id to name mapping file `data/k400/Kinetics-400_label_list.txt`, the predicted class name is `archery`.
+
+## Reference
+
+- [Video Swin Transformer](https://arxiv.org/pdf/2106.13230.pdf), Ze Liu, Jia Ning, Yue Cao, Yixuan Wei
diff --git a/docs/en/model_zoo/segmentation/asrf.md b/docs/en/model_zoo/segmentation/asrf.md
new file mode 100644
index 0000000000000000000000000000000000000000..18f7d016a1dcabc18c3de50a833be5693cbfd2fd
--- /dev/null
+++ b/docs/en/model_zoo/segmentation/asrf.md
@@ -0,0 +1,139 @@
+[简体中文](../../../zh-CN/model_zoo/segmentation/asrf.md) | English
+
+# ASRF : Video Action Segmentation Model
+
+---
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+## Introduction
+
+ASRF is an improvement on the video action segmentation model MS-TCN and was published at WACV 2021. We reproduce the official PyTorch implementation and obtain comparable results in PaddleVideo.
+
+
+
+ASRF Overview
+
+
+## Data
+
+ASRF can be trained on the 50salads, breakfast, or gtea dataset. For dataset download and preparation, please refer to the [Video Action Segmentation dataset](../../dataset/SegmentationDataset.md) doc.
+
+Unlike MS-TCN, the ASRF model requires additional data construction. Run the following script:
+```bash
+python data/50salads/prepare_asrf_data.py --dataset_dir data/
+```
+
+## Train
+
+After preparing the dataset, we can run the training script:
+
+```bash
+# gtea dataset
+export CUDA_VISIBLE_DEVICES=3
+python3.7 main.py --validate -c configs/segmentation/asrf/asrf_gtea.yaml
+```
+
+- Start training with the above command line or script. No pre-trained model is needed. Video action segmentation models are usually fully convolutional networks; because videos have different lengths, `DATASET.batch_size` is usually set to `1`, i.e. no batching. At present, only **single-sample** training is supported.
+
+## Test
+
+Test ASRF with the following script:
+
+```bash
+python main.py --test -c configs/segmentation/asrf/asrf_gtea.yaml --weights=./output/ASRF/ASRF_split_1.pdparams
+```
+
+- The metrics (Acc, Edit and F1 scores) are computed by referring to the test script [eval.py](https://github.com/yabufarha/ms-tcn/blob/master/eval.py) provided by the MS-TCN authors.
+
+The PyTorch reproduction results come from the official [code base](https://github.com/yiskw713/asrf).
+
+- The datasets are evaluated with the cross-validation protocol from the MS-TCN paper, using the same fold splits as the paper.
+
+Accuracy on the Breakfast dataset (4-fold cross-validation):
+
+| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| paper | 67.6% | 72.4% | 74.3% | 68.9% | 56.1% |
+| pytorch | 65.8% | 71.0% | 72.3% | 66.5% | 54.9% |
+| paddle | 66.1% | 71.9% | 73.3% | 67.9% | 55.7% |
+
+Accuracy on the 50salads dataset (5-fold cross-validation):
+
+| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| paper | 84.5% | 79.3% | 82.9% | 83.5% | 77.3% |
+| pytorch | 81.4% | 75.6% | 82.7% | 81.2% | 77.2% |
+| paddle | 81.6% | 75.8% | 83.0% | 81.5% | 74.8% |
+
+Accuracy on the gtea dataset (4-fold cross-validation):
+
+| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| paper | 77.3% | 83.7% | 89.4% | 87.8% | 79.8% |
+| pytorch | 76.3% | 79.6% | 87.3% | 85.8% | 74.9% |
+| paddle | 77.1% | 83.3% | 88.9% | 87.5% | 79.1% |
+
+Model weights for gtea:
+
+| Test_Data | F1@0.5 | checkpoints |
+| :----: | :----: | :----: |
+| gtea_split1 | 72.4409 | [ASRF_gtea_split_1.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_1.pdparams) |
+| gtea_split2 | 76.6666 | [ASRF_gtea_split_2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_2.pdparams) |
+| gtea_split3 | 84.5528 | [ASRF_gtea_split_3.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_3.pdparams) |
+| gtea_split4 | 82.6771 | [ASRF_gtea_split_4.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_4.pdparams) |
+
+## Infer
+
+### Export inference model
+
+```bash
+python3.7 tools/export_model.py -c configs/segmentation/asrf/asrf_gtea.yaml \
+ -p data/ASRF_gtea_split_1.pdparams \
+ -o inference/ASRF
+```
+
+The above command will generate the model architecture file `ASRF.pdmodel` and the parameters file `ASRF.pdiparams`.
+
+- For argument usage, please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).
+
+### Infer
+
+The input file is a file list for inference, for example:
+```
+S1_Cheese_C1.npy
+S1_CofHoney_C1.npy
+S1_Coffee_C1.npy
+S1_Hotdog_C1.npy
+...
+```
+
+```bash
+python3.7 tools/predict.py --input_file data/gtea/splits/test.split1.bundle \
+ --config configs/segmentation/asrf/asrf_gtea.yaml \
+ --model_file inference/ASRF/ASRF.pdmodel \
+ --params_file inference/ASRF/ASRF.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+Example of logs:
+
+```bash
+result write in : ./inference/infer_results/S1_Cheese_C1.txt
+result write in : ./inference/infer_results/S1_CofHoney_C1.txt
+result write in : ./inference/infer_results/S1_Coffee_C1.txt
+result write in : ./inference/infer_results/S1_Hotdog_C1.txt
+result write in : ./inference/infer_results/S1_Pealate_C1.txt
+result write in : ./inference/infer_results/S1_Peanut_C1.txt
+result write in : ./inference/infer_results/S1_Tea_C1.txt
+```
+
+
+## Reference
+
+- [Alleviating Over-segmentation Errors by Detecting Action Boundaries](https://arxiv.org/pdf/2007.06866v1.pdf), Yuchi Ishikawa, Seito Kasai, Yoshimitsu Aoki, Hirokatsu Kataoka
diff --git a/docs/en/model_zoo/segmentation/cfbi.md b/docs/en/model_zoo/segmentation/cfbi.md
new file mode 100644
index 0000000000000000000000000000000000000000..1b5eefe229290b6e6718067687f7bdc41ae862d7
--- /dev/null
+++ b/docs/en/model_zoo/segmentation/cfbi.md
@@ -0,0 +1,46 @@
+[简体中文](../../../zh-CN/model_zoo/recognition/cfbi.md) | English
+
+# CFBI
+
+---
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Test](#Test)
+- [Reference](#Reference)
+
+## Introduction
+
+CFBI is a video object segmentation model proposed by Baidu at ECCV 2020. This method considers that the background should be treated equally with the foreground, and thus proposes the Collaborative video object segmentation by Foreground-Background Integration (CFBI) approach. CFBI implicitly imposes a contrastive constraint between the feature embeddings of the target foreground object and its corresponding background, which improves the segmentation results. Given the images and target segmentations of the reference frame (the first frame) and the previous frame, the model predicts the segmentation of the current frame.
+
+
+
+
+
+
+## Data
+
+Please refer to DAVIS data download and preparation doc [DAVIS-data](../../dataset/davis.md)
+
+
+## Test
+
+- Test scripts:
+
+```bash
+python3.7 main.py --test -c configs/segmentation/cfbip_davis.yaml -w CFBIp_davis.pdparams
+```
+
+- Predicted results will be saved in `result_root`. To get evaluation metrics, please use [davis2017-evaluation tools](https://github.com/davisvideochallenge/davis2017-evaluation).
+
+Metrics on DAVIS:
+
+| J&F-Mean | J-Mean | J-Recall | J-Decay | F-Mean | F-Recall | F-Decay | checkpoints |
+| :------: | :-----: | :----: | :----: | :----: | :----: | :----: | :----: |
+| 0.823 | 0.793 | 0.885 | 0.083 | 0.852 | 0.932 | 0.100 | [CFBIp_r101_davis.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/CFBIp_r101_davis.pdparams) |
+
+
+## Reference
+
+- [Collaborative Video Object Segmentation by Foreground-Background Integration](https://arxiv.org/abs/2003.08333), Zongxin Yang, Yunchao Wei, Yi Yang
diff --git a/docs/en/model_zoo/segmentation/mstcn.md b/docs/en/model_zoo/segmentation/mstcn.md
new file mode 100644
index 0000000000000000000000000000000000000000..7c619f2b79bb0eebac734bd9d8e4a6b5a1cfa02b
--- /dev/null
+++ b/docs/en/model_zoo/segmentation/mstcn.md
@@ -0,0 +1,130 @@
+[简体中文](../../../zh-CN/model_zoo/segmentation/mstcn.md) | English
+
+# MS-TCN : Video Action Segmentation Model
+
+---
+## Contents
+
+- [Introduction](#Introduction)
+- [Data](#Data)
+- [Train](#Train)
+- [Test](#Test)
+- [Inference](#Inference)
+- [Reference](#Reference)
+
+## Introduction
+
+MS-TCN is a classic video action segmentation model published at CVPR 2019. We optimized the official PyTorch implementation and obtained higher-precision results in PaddleVideo.
+
+
+
+MS-TCN Overview
+
+
+## Data
+
+MS-TCN can be trained on the 50salads, breakfast, or gtea dataset. For dataset download and preparation, please refer to the [Video Action Segmentation dataset](../../dataset/SegmentationDataset.md) doc.
+
+## Train
+
+After preparing the dataset, we can run the training script:
+
+```bash
+# gtea dataset
+export CUDA_VISIBLE_DEVICES=3
+python3.7 main.py --validate -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml --seed 1538574472
+```
+
+- Start training with the above command line or script. No pre-trained model is needed. Video action segmentation models are usually fully convolutional networks; because videos have different lengths, `DATASET.batch_size` is usually set to `1`, i.e. no batching. At present, only **single-sample** training is supported.
+
+## Test
+
+Test MS-TCN with the following script:
+
+```bash
+python main.py --test -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml --weights=./output/MSTCN/MSTCN_split_1.pdparams
+```
+
+- The metrics (Acc, Edit and F1 scores) are computed by referring to the test script [eval.py](https://github.com/yabufarha/ms-tcn/blob/master/eval.py) provided by the MS-TCN authors.
+
+- The datasets are evaluated with the cross-validation protocol from the MS-TCN paper, using the same fold splits as the paper.
+
+Accuracy on the Breakfast dataset (4-fold cross-validation):
+
+| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| paper | 66.3% | 61.7% | 48.1% | 48.1% | 37.9% |
+| paddle | 65.2% | 61.5% | 53.7% | 49.2% | 38.8% |
+
+Accuracy on the 50salads dataset (5-fold cross-validation):
+
+| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| paper | 80.7% | 67.9% | 76.3% | 74.0% | 64.5% |
+| paddle | 81.1% | 71.5% | 77.9% | 75.5% | 66.5% |
+
+Accuracy on the gtea dataset (4-fold cross-validation):
+
+| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| paper | 79.2% | 81.4% | 87.5% | 85.4% | 74.6% |
+| paddle | 76.9% | 81.8% | 86.4% | 84.7% | 74.8% |
+
+Model weights for gtea:
+
+| Test_Data | F1@0.5 | checkpoints |
+| :----: | :----: | :----: |
+| gtea_split1 | 70.2509 | [MSTCN_gtea_split_1.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_1.pdparams) |
+| gtea_split2 | 70.7224 | [MSTCN_gtea_split_2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_2.pdparams) |
+| gtea_split3 | 80.0 | [MSTCN_gtea_split_3.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_3.pdparams) |
+| gtea_split4 | 78.1609 | [MSTCN_gtea_split_4.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_4.pdparams) |
+
+## Infer
+
+### Export inference model
+
+```bash
+python3.7 tools/export_model.py -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml \
+ -p data/MSTCN_gtea_split_1.pdparams \
+ -o inference/MSTCN
+```
+
+The above command will generate the model architecture file `MSTCN.pdmodel` and the parameters file `MSTCN.pdiparams`.
+
+- For argument usage, please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).
+
+### Infer
+
+The input file is a file list for inference, for example:
+```
+S1_Cheese_C1.npy
+S1_CofHoney_C1.npy
+S1_Coffee_C1.npy
+S1_Hotdog_C1.npy
+...
+```
+
+```bash
+python3.7 tools/predict.py --input_file data/gtea/splits/test.split1.bundle \
+ --config configs/segmentation/ms_tcn/ms_tcn_gtea.yaml \
+ --model_file inference/MSTCN/MSTCN.pdmodel \
+ --params_file inference/MSTCN/MSTCN.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+Example of logs:
+
+```bash
+result write in : ./inference/infer_results/S1_Cheese_C1.txt
+result write in : ./inference/infer_results/S1_CofHoney_C1.txt
+result write in : ./inference/infer_results/S1_Coffee_C1.txt
+result write in : ./inference/infer_results/S1_Hotdog_C1.txt
+result write in : ./inference/infer_results/S1_Pealate_C1.txt
+result write in : ./inference/infer_results/S1_Peanut_C1.txt
+result write in : ./inference/infer_results/S1_Tea_C1.txt
+```
+
+## Reference
+
+- [MS-TCN: Multi-Stage Temporal Convolutional Network for Action Segmentation](https://arxiv.org/pdf/1903.01945.pdf), Y. Abu Farha and J. Gall.
diff --git a/docs/en/tools.md b/docs/en/tools.md
new file mode 100644
index 0000000000000000000000000000000000000000..56138e5f298100c81b016a27acc3e8b7aadec3db
--- /dev/null
+++ b/docs/en/tools.md
@@ -0,0 +1,22 @@
+[简体中文](../zh-CN/tools.md) | English
+
+# Tools
+
+This page includes the usage of some useful tools in PaddleVideo.
+
+## Params
+
+To get the number of parameters of a model:
+
+```shell
+python3.7 tools/summary.py -c configs/recognition/tsm/tsm.yaml
+```
+
+## FLOPs
+
+To print the FLOPs of a model:
+
+```shell
+python3.7 tools/summary.py -c configs/recognition/tsm/tsm.yaml --FLOPs
+```
+
+## Test the exported model (coming soon)
diff --git a/docs/en/tutorials/Action Recognition Datasets b/docs/en/tutorials/Action Recognition Datasets
new file mode 100644
index 0000000000000000000000000000000000000000..9bd259157bb69fb27539744650596f4588ea03b7
--- /dev/null
+++ b/docs/en/tutorials/Action Recognition Datasets
@@ -0,0 +1,12 @@
+Useful Action Recognition Datasets.
+
+ AVA, https://arxiv.org/abs/1705.08421
+ Kinetics, https://arxiv.org/abs/1705.06950
+ YouTube-8M, https://arxiv.org/abs/1609.08675
+ ActivityNet, http://www.cv-foundation.org/openaccess/content_cvpr_2015/html/Heilbron_ActivityNet_A_Large-Scale_2015_CVPR_paper.html
+ Moments in Time, https://arxiv.org/pdf/1801.03150.pdf
+ Charades, https://arxiv.org/abs/1604.01753
+ EPIC-Kitchens, https://arxiv.org/abs/1804.02748
+ THUMOS, https://arxiv.org/abs/1604.06182
+ UCF-101, http://crcv.ucf.edu/papers/UCF101_CRCV-TR-12-01.pdf
+ HMDB51, http://serre-lab.clps.brown.edu/wp-content/uploads/2012/08/Kuehne_etal_iccv11.pdf
diff --git a/docs/en/tutorials/Action Recognition Papers b/docs/en/tutorials/Action Recognition Papers
new file mode 100644
index 0000000000000000000000000000000000000000..7282bef96ecf42e964da4b5e82356a02b4b7992a
--- /dev/null
+++ b/docs/en/tutorials/Action Recognition Papers
@@ -0,0 +1,31 @@
+Useful Papers on Action Recognition and Video Classification.
+
+TSN: Temporal Segment Networks: Towards Good Practices for Deep Action Recognition, ECCV 2016
+TSM: Temporal Shift Module for Efficient Video Understanding, ICCV 2019
+SlowFast Networks for Video Recognition, ICCV 2019
+Non-local Neural Networks, CVPR 2018
+A Multigrid Method for Efficiently Training Video Models, CVPR2020
+X3D: Progressive Network Expansion for Efficient Video Recognition, CVPR2020
+ECO: Efficient Convolutional Network for Online Video Understanding, ECCV 2018
+3D Resnet: Would Mega-scale Datasets Further Enhance Spatiotemporal 3D CNNs, CVPR 2018
+TPN: Temporal Pyramid Network for Action Recognition, CVPR 2020
+EvaNet: Evolving Space-Time Neural Architectures for Videos, ICCV 2019
+RepFlow: Representation Flow for Action Recognition, CVPR 2019
+MARS: Motion-Augmented RGB Stream for Action Recognition, CVPR 2019
+StNet: Local and Global Spatial-Temporal Modeling for Human Action Recognition, AAAI 2019
+Attention Cluster: Purely Attention Based Local Feature Integration for Video Classification
+NeXtVLAD: An Efficient Neural Network to Aggregate Frame-level Features for Large-scale Video Classification
+C-TCN: Action localization Model by Baidu, the Champion model of ActivityNet 2018
+Neural Graph Matching Networks for Fewshot 3D Action Recognition - M. Guo et al., ECCV2018.
+Temporal 3D ConvNets using Temporal Transition Layer - A. Diba et al., CVPRW2018.
+Temporal 3D ConvNets: New Architecture and Transfer Learning for Video Classification - A. Diba et al., arXiv2017.
+Attentional Pooling for Action Recognition - R. Girdhar and D. Ramanan, NIPS2017.
+Fully Context-Aware Video Prediction - Byeon et al, arXiv2017.
+Hidden Two-Stream Convolutional Networks for Action Recognition - Y. Zhu et al, arXiv2017.
+Dynamic Image Networks for Action Recognition - H. Bilen et al, CVPR2016.
+Long-term Recurrent Convolutional Networks for Visual Recognition and Description - J. Donahue et al, CVPR2015.
+Describing Videos by Exploiting Temporal Structure - L. Yao et al, ICCV2015.
+Real-time Action Recognition with Enhanced Motion Vector CNNs - B. Zhang et al, CVPR2016.
+Action Recognition with Trajectory-Pooled Deep-Convolutional Descriptors - L. Wang et al, CVPR2015.
+
+
diff --git a/docs/en/tutorials/Spatio-Temporal Action Detection Papers b/docs/en/tutorials/Spatio-Temporal Action Detection Papers
new file mode 100644
index 0000000000000000000000000000000000000000..f466849f6aa96454b7ad9c7bdf42102c817ecc4a
--- /dev/null
+++ b/docs/en/tutorials/Spatio-Temporal Action Detection Papers
@@ -0,0 +1,30 @@
+Useful Spatio-Temporal Action Detection Papers.
+
+
+
+ A Better Baseline for AVA - R. Girdhar et al., ActivityNet Workshop, CVPR2018.
+ Real-Time End-to-End Action Detection with Two-Stream Networks - A. El-Nouby and G. Taylor, arXiv2018.
+ Human Action Localization with Sparse Spatial Supervision - P. Weinzaepfel et al., arXiv2017.
+ Unsupervised Action Discovery and Localization in Videos - K. Soomro and M. Shah, ICCV2017.
+ Spatial-Aware Object Embeddings for Zero-Shot Localization and Classification of Actions - P. Mettes and C. G. M. Snoek, ICCV2017.
+ Action Tubelet Detector for Spatio-Temporal Action Localization - V. Kalogeiton et al, ICCV2017.
+ Tube Convolutional Neural Network (T-CNN) for Action Detection in Videos - R. Hou et al, ICCV2017.
+ Chained Multi-stream Networks Exploiting Pose, Motion, and Appearance for Action Classification and Detection - M. Zolfaghari et al, ICCV2017.
+ TORNADO: A Spatio-Temporal Convolutional Regression Network for Video Action Proposal - H. Zhu et al., ICCV2017.
+ Online Real time Multiple Spatiotemporal Action Localisation and Prediction - G. Singh et al, ICCV2017.
+ AMTnet: Action-Micro-Tube regression by end-to-end trainable deep architecture - S. Saha et al, ICCV2017.
+ Am I Done? Predicting Action Progress in Videos - F. Becattini et al, BMVC2017.
+ Generic Tubelet Proposals for Action Localization - J. He et al, arXiv2017.
+ Incremental Tube Construction for Human Action Detection - H. S. Behl et al, arXiv2017.
+ Multi-region two-stream R-CNN for action detection - X. Peng and C. Schmid. ECCV2016.
+ Spot On: Action Localization from Pointly-Supervised Proposals - P. Mettes et al, ECCV2016.
+ Deep Learning for Detecting Multiple Space-Time Action Tubes in Videos - S. Saha et al, BMVC2016.
+ Learning to track for spatio-temporal action localization - P. Weinzaepfel et al. ICCV2015.
+ Action detection by implicit intentional motion clustering - W. Chen and J. Corso, ICCV2015.
+ Finding Action Tubes - G. Gkioxari and J. Malik CVPR2015.
+ APT: Action localization proposals from dense trajectories - J. Gemert et al, BMVC2015.
+ Spatio-Temporal Object Detection Proposals - D. Oneata et al, ECCV2014.
+ Action localization with tubelets from motion - M. Jain et al, CVPR2014.
+ Spatiotemporal deformable part models for action detection - Y. Tian et al, CVPR2013.
+ Action localization in videos through context walk - K. Soomro et al, ICCV2015.
+ Fast Action Proposals for Human Action Detection and Search - G. Yu and J. Yuan, CVPR2015.
diff --git a/docs/en/tutorials/TSM.md b/docs/en/tutorials/TSM.md
new file mode 100644
index 0000000000000000000000000000000000000000..a0db9aa9bcecee669f739776174b9798127994ab
--- /dev/null
+++ b/docs/en/tutorials/TSM.md
@@ -0,0 +1,73 @@
+# 1. Background & Motivation
+At present, video data on the Internet is growing rapidly, and the time users spend watching short and small videos is growing just as fast. How to analyze, process and classify massive video resources quickly and accurately is an urgent problem. Video understanding technology can analyze video content in multiple dimensions, understand video semantics, and automatically classify and label videos, which greatly reduces the cost of manual review and at the same time enables accurate recommendation to improve the user experience.
+In this article, we introduce **TSM (Temporal Shift Module)**, a classic model in the field of video understanding proposed by `Ji Lin, Chuang Gan and Song Han` from **MIT** and the **IBM Watson AI Lab**, which strikes a balance between efficiency and performance and improves video understanding ability.
+
+The video understanding model most relevant to TSM is the **Temporal Segment Network (TSN)** published by Limin Wang et al., as well as the series of works represented by I3D, S3D and P3D, which carry out end-to-end joint spatial-temporal modeling through 3D convolution.
+Although this series of works can capture spatial-temporal features, the transition from 2D convolution to 3D convolution inevitably introduces extra computation compared with TSN. TSM cleverly uses the idea of shifting feature maps along the temporal dimension, theoretically achieving feature fusion and joint modeling across frames with zero extra computational overhead compared with TSN.
+
+**Paper Address:** [Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383v2.pdf)
+
+Let's look at the following example: if the video is played from left to right and from right to left respectively, the subjects will give different but both reasonable interpretations of the video, indicating that understanding a video strongly depends on its temporal information. This is exactly the motivation behind TSM.
+
+
+
+
+It looks interesting. Next, let's dive into the core modules of TSM.
+
+# 2. Key techniques used in TSM
+
+On top of traditional image analysis, video analysis requires researchers to add structures that model temporal information. At present, 2D CNNs and 3D CNNs are the two most commonly used approaches in video understanding: a 2D CNN requires less computation but loses part of the temporal information, while a 3D CNN works well but is computationally expensive. Faced with this situation, Ji Lin, Chuang Gan and Song Han et al. from MIT and the IBM Watson AI Lab proposed the Temporal Shift Module (TSM). By embedding the temporal shift module into a 2D CNN, they achieve video understanding ability comparable to a 3D CNN without adding any extra computation or parameters.
+
+
+
+
+The rows and columns of the matrix in the figure above represent the temporal and channel dimensions of the feature map, respectively. In the TSM module, some channels are shifted forward one step along the temporal dimension, some channels are shifted backward one step, and the gaps left by the shift are zero-padded. In this way, temporal context interaction is introduced into the feature map: the channel shift lets the current frame carry channel information from its two adjacent frames, so a 2D convolution can directly extract spatial-temporal information just like a 3D convolution.
+This improves the model's ability to model the temporal dimension. On this basis, the researchers further split the module into a TSM variant suitable for online video and one suitable for offline video.
+
+
+
+
+The bi-directional TSM module can use both past and future spatio-temporal information, which suits high-throughput offline video. The uni-directional TSM module only mixes present and past information, so it is suitable for low-latency online video recognition.
+In addition, the authors also studied where to insert TSM modules and compared two insertion methods: **Residual TSM** and **in-place TSM**. They found that **Residual TSM** achieves better performance than **in-place TSM**, and explained that **in-place TSM** may harm the extraction of spatial information.
+
+
+
+
+The TSM module looks **so easy!** The next question is: how do we implement it?
+
+# 3. The core codes of TSM
+
+Now that the principle is clear, let's look at the code. First, let's look at the PyTorch version of TSM. Unfortunately, PyTorch does not provide a built-in API for TSM, so we have to implement it ourselves. The code is shown below:
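+
+Since the original code screenshot is not reproduced here, below is a minimal PyTorch-style sketch of the shift operation described above (the function name `temporal_shift` and the `fold_div` split ratio are illustrative assumptions, not the authors' exact code):
+
+```python
+import torch
+
+
+def temporal_shift(x, n_segment, fold_div=8):
+    """Shift part of the channels along the temporal dimension.
+
+    x: tensor of shape [N*T, C, H, W], frames of each clip stacked on the batch dim.
+    """
+    nt, c, h, w = x.size()
+    x = x.view(nt // n_segment, n_segment, c, h, w)
+    fold = c // fold_div  # number of channels shifted in each direction
+    out = torch.zeros_like(x)
+    out[:, :-1, :fold] = x[:, 1:, :fold]                  # frame t takes these channels from frame t+1
+    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]  # frame t takes these channels from frame t-1
+    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]             # remaining channels are not shifted
+    return out.view(nt, c, h, w)
+```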
+
+
+
+
+This means that you only need to add four lines of code to TSN's codebase and you can **double the accuracy on the Something-Something dataset!** What a simple and efficient model!
+
+But...,
+
+The **PaddlePaddle** framework takes users' needs into account and already provides a TSM operator, so it can be used directly and easily.
+
+
+
+
+So you no longer have to implement it yourself: **it can be called directly!!! It can be called directly!!! It can be called directly!!!** Important things must be said three times.
+
+Do you think that is the end of the topic? **Too young, too simple!**
+
+We have also optimized it to increase speed by 5 times while reducing memory consumption. See the acceleration documentation [accelerate.md](./accelerate.md) for more information.
+
+Let's have a look at how TSM is implemented using **paddlepaddle**:
+
+`import paddle.nn.functional as F`
+
+
+`shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)`
+
+**Only two lines of code!** Isn't it easy?
+
+# Reference
+[1] [Lin Ji , Gan Chuang , Han Song . TSM: Temporal Shift Module for Efficient Video Understanding. arXiv:1811.08383,2018](https://arxiv.org/pdf/1811.08383v2.pdf).
+
+
+[2] [Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, and Luc Van Gool. Temporal Segment Networks: Towards Good Practices for Deep Action Recognition. In Proceedings of the European Conference on Computer Vision, pages 20–36. Springer, 2016](https://arxiv.org/abs/1608.00859).
diff --git a/docs/en/tutorials/Temporal Action Detection Papers b/docs/en/tutorials/Temporal Action Detection Papers
new file mode 100644
index 0000000000000000000000000000000000000000..dc475d3de6a69a614df061e6321061fcf900f506
--- /dev/null
+++ b/docs/en/tutorials/Temporal Action Detection Papers
@@ -0,0 +1,24 @@
+Useful Temporal Action Detection Papers.
+
+ Rethinking the Faster R-CNN Architecture for Temporal Action Localization - Yu-Wei Chao et al., CVPR2018
+ Weakly Supervised Action Localization by Sparse Temporal Pooling Network - Phuc Nguyen et al., CVPR 2018
+ Temporal Deformable Residual Networks for Action Segmentation in Videos - P. Lei and S. Todrovic., CVPR2018.
+ End-to-End, Single-Stream Temporal Action Detection in Untrimmed Videos - Shayamal Buch et al., BMVC 2017
+ Cascaded Boundary Regression for Temporal Action Detection - Jiyang Gao et al., BMVC 2017
+ Temporal Tessellation: A Unified Approach for Video Analysis - Kaufman et al., ICCV2017.
+ Temporal Action Detection with Structured Segment Networks - Y. Zhao et al., ICCV2017.
+ Temporal Context Network for Activity Localization in Videos - X. Dai et al., ICCV2017.
+ Detecting the Moment of Completion: Temporal Models for Localising Action Completion - F. Heidarivincheh et al., arXiv2017.
+ CDC: Convolutional-De-Convolutional Networks for Precise Temporal Action Localization in Untrimmed Videos - Z. Shou et al, CVPR2017.
+ SST: Single-Stream Temporal Action Proposals - S. Buch et al, CVPR2017.
+ R-C3D: Region Convolutional 3D Network for Temporal Activity Detection - H. Xu et al, arXiv2017. [code] [project web] [PyTorch]
+ DAPs: Deep Action Proposals for Action Understanding - V. Escorcia et al, ECCV2016.
+ Online Action Detection using Joint Classification-Regression Recurrent Neural Networks - Y. Li et al, ECCV2016.
+ Temporal Action Localization in Untrimmed Videos via Multi-stage CNNs - Z. Shou et al, CVPR2016.
+ Fast Temporal Activity Proposals for Efficient Detection of Human Actions in Untrimmed Videos - F. Heilbron et al, CVPR2016.
+ Actionness Estimation Using Hybrid Fully Convolutional Networks - L. Wang et al, CVPR2016.
+ Learning Activity Progression in LSTMs for Activity Detection and Early Detection - S. Ma et al, CVPR2016.
+ End-to-end Learning of Action Detection from Frame Glimpses in Videos - S. Yeung et al, CVPR2016.
+ Fast Action Proposals for Human Action Detection and Search - G. Yu and J. Yuan, CVPR2015.
+ Bag-of-fragments: Selecting and encoding video fragments for event detection and recounting - P. Mettes et al, ICMR2015.
+ Action localization in videos through context walk - K. Soomro et al, ICCV2015.
diff --git a/docs/en/tutorials/accelerate.md b/docs/en/tutorials/accelerate.md
new file mode 100644
index 0000000000000000000000000000000000000000..da2032d192de4770b0064e226e75f79814e489ec
--- /dev/null
+++ b/docs/en/tutorials/accelerate.md
@@ -0,0 +1 @@
+[简体中文](../../zh-CN/tutorials/accelerate.md) | English
diff --git a/docs/en/tutorials/config.md b/docs/en/tutorials/config.md
new file mode 100644
index 0000000000000000000000000000000000000000..20b3e48803c08e2d1e7c9ace0016a7d7942c5a81
--- /dev/null
+++ b/docs/en/tutorials/config.md
@@ -0,0 +1,131 @@
+# Configs design
+
+---
+This page shows how PaddleVideo uses basic IoC/DI techniques to decouple and control the whole framework. This increases the modularity of the system and makes it extensible. Finally, we explain the details of the config yaml and script args.
+
+
+## Design
+
+First, when we define a new class, it is common to create an instance like:
+
+```python
+class TSM():
+ pass
+
+model = TSM(init_attributes)
+```
+
+As more classes are created, the coupling between callers and callees increases sharply. An obvious way to handle this is a factory, like:
+
+```python
+if model_name == "TSM":
+ model = TSM()
+elif model_name == "TSN":
+ model = TSN()
+elif ...:
+    ...
+```
+and
+
+```python
+optimizer_cfg = dict(name="MOMENTUM", params=XXX)
+if optimizer_cfg["name"] == "MOMENTUM":
+    optimizer_cfg.pop("name")
+    optimizer = MOMENTUM(**optimizer_cfg)
+elif ...:
+    ...
+```
+
+However, more and more conditional branches have to be written this way. Like the pattern widely used in Java and other platforms, we apply ```inversion of control``` and ```Dependency Inversion``` to decouple the components.
+
+Second, to implement DI, we build two components:
+
+- Register, to register a class
+- Builder, to create an instance
+
+1. Register
+
+We implement a getter and a setter function to map a string name to an object.
+[source code](../../paddlevideo/utils/registry.py)
+
+```python
+#excerpt from source code.
+class Registry():
+ def __init__(self, name):
+ self._name = name
+ self._obj_map = {}
+
+ #mapping name -> object
+ def register(self, obj, name):
+ self._obj_map[name] = obj
+
+ #get object
+ def get(self, name):
+ ret = self._obj_map.get(name)
+ return ret
+```
+
+It provides a name -> object mapping. For example, to register an object:
+```python
+
+ BACKBONES = Registry('backbone')
+ class ResNet:
+ pass
+ BACKBONES.register(ResNet)
+```
+
+Or, use a decorator
+```python
+ BACKBONES = Registry('backbone') #new a Register
+ @BACKBONES.register() #regist resnet as a backbone.
+ class ResNet:
+ pass
+```
+
+2. Builder
+
+To obtain a registered module:
+```python
+ # Usage: To build a module.
+
+ backbone_name = "ResNet"
+ b = BACKBONES.get(backbone_name)()
+```
+
+In this way, a component is registered **where it is declared** and instantiated later, not **where it is called**; a basic DI sub-system has now been created.
+
+We apply this design on many places, such as: PIPELINE, BACKBONE, HEAD, LOSS, METRIC and so on.
+
+Finally, we build all of the framework components from the config yaml, which matches the source code one to one. **It means the attributes in a configuration field are the same as the init attributes of the matched class**, and to indicate a specific class we always mark it with ```name```, like:
+
+```yaml
+head:
+ name: "TSMHead" # class name
+ num_classes: 400 # TSMHead class init attributes
+ ...
+```
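+
+Putting the two components together, a builder can turn such a config field into an instance. The sketch below is only illustrative (the helper name `build_module` and the `HEADS` registry are assumptions, not necessarily PaddleVideo's exact API):
+
+```python
+def build_module(cfg, registry):
+    # work on a copy so the original config dict stays untouched
+    cfg = dict(cfg)
+    # 'name' selects the registered class, e.g. "TSMHead"
+    cls = registry.get(cfg.pop("name"))
+    # the remaining fields become the init attributes of that class
+    return cls(**cfg)
+
+
+# head = build_module({"name": "TSMHead", "num_classes": 400}, HEADS)
+```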
+
+---
+
+## config yaml details
+
+We separate the config to several parts, in high level:
+
+- **MODEL:** Architecture configuration, such as HEAD module, BACKBONE module.
+- **DATASET:** DATASET and dataloader configuration.
+- **PIPELINE:** Data processing pipeline configuration.
+- **OPTIMIZER:** Optimizer configuration.
+
+and some unique global configurations, like
+- model_name
+- log_interval
+- epochs
+- resume_epoch
+- log_level
+...
+
+Training script args
+
+- **--validate**: whether to run validation during training
+- **--test**: whether to run in test mode
+- **--weights**: weights path
+- **-c**: config yaml path
+- **-o**: override args, one can use it like: -o DATASET.batch_size=16
diff --git a/docs/en/tutorials/customized_usage.md b/docs/en/tutorials/customized_usage.md
new file mode 100644
index 0000000000000000000000000000000000000000..ca348179c9a11cd2142ddfb2ab734eda85b237b1
--- /dev/null
+++ b/docs/en/tutorials/customized_usage.md
@@ -0,0 +1,44 @@
+[简体中文](../../zh-CN/tutorials/customized_usage.md) | English
+
+# Customized Usage
+
+## Customized Dataset
+
+1. finetune
+
+Please refer to [finetune](../start.md#model_finetune) if you only change to a "regular" dataset.
+
+2. customized pipeline
+
+ - add new augments
+ - add new batch augments
+  **Note**: Be careful to check the differences between the different modes.
+
+## Customized Network
+
+1. module function
+
+Please refer to [modular design](modular_design.md) for more information.
+
+2. customized framework
+
+ - change framework
+ - change initialized function
+ - customized loss
+
+## Customized Solvers
+
+1. step decay and epoch decay
+
+2. customized solvers
+
+## Customized metrics
+
+ - add new data processing
+ - add new record
+ - add new metrics
+
+## Debug tools
+
+1. Debug level
+2. FAQ
diff --git a/docs/en/tutorials/demos b/docs/en/tutorials/demos
new file mode 100644
index 0000000000000000000000000000000000000000..2228d3048b64378830f8e7a15f72870d1d09dcb2
--- /dev/null
+++ b/docs/en/tutorials/demos
@@ -0,0 +1,8 @@
+Some useful demos (todo).
+
+1. single-class action recognition: tsn/tsm/slowfast
+2. multi-class action recognition: lstm
+3. action localization: bmn
+4. spatio-temporal action detection: todo
+5. 3000-class tagging application (videotag): tsn+lstm
+6. highlights detection application: bmn+tsn+lstm
diff --git a/docs/en/tutorials/deployment.md b/docs/en/tutorials/deployment.md
new file mode 100644
index 0000000000000000000000000000000000000000..c88329f4572d6d61e10aad3f4741dd9bc3dff90f
--- /dev/null
+++ b/docs/en/tutorials/deployment.md
@@ -0,0 +1,48 @@
+[简体中文](../../zh-CN/tutorials/deployment.md) | English
+
+# Inference
+
+## How to convert dygraph model to static model?
+To infer and deploy a model, we first need to export an inference model, which is also called to_static: `convert dygraph model to static model`.
+
+```bash
+python3.7 tools/export_model.py -c config_file -o output_path -p params_file
+```
+
+Note: `export_model.py` will build the model again and then load the params, but some init params used in the infer phase differ from those in the train phase.
+We add `num_seg` for TSM in advance; please add more params or modify them if necessary.
+Please refer to the [official documents](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/04_dygraph_to_static/index_cn.html) for more information.
+
+## How to test the export model?
+
+PaddleVideo supports a test script to test the exported model.
+
+```bash
+python3.7 tools/test_export_model.py -p params_file -i inference_folder -c config_file
+```
+
+We just print the output shape; please feel free to extend it. Actually, only testing a video file with PaddleInference can confirm that the exported model is correct.
+
+## How to use PaddleInference?
+PaddleVideo supports ```tools/predict.py``` for inference:
+
+```bash
+python3.7 tools/predict.py -v example.avi --model_file "./inference/example.pdmodel" --params_file "./inference/example.pdiparams" --enable_benchmark=False --model="example" --num_seg=8
+```
+
+## How to test inference speed?
+PaddleVideo supports a script to test inference speed:
+
+```bash
+python3.7 tools/predict.py --enable_benchmark=True --model_file=your_model_file --params_file=your_params_file
+```
+
+## How to use C++ infer?
+ coming soon
+
+# Deployment
+
+## How to use PaddleHub Serving deploy?
+ coming soon
+
+## How to use PaddleLite deploy?
+ coming soon
diff --git a/docs/en/tutorials/modular_design.md b/docs/en/tutorials/modular_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..a426ef5c736645719a3aa6b7bf2ecec4e5d6c046
--- /dev/null
+++ b/docs/en/tutorials/modular_design.md
@@ -0,0 +1 @@
+[简体中文](../../zh-CN/tutorials/modular_design.md) | English
diff --git a/docs/en/tutorials/pp-tsm.md b/docs/en/tutorials/pp-tsm.md
new file mode 100644
index 0000000000000000000000000000000000000000..d3ed2dbb7a6eb60f528f31ec68f2e85f02a0c8a9
--- /dev/null
+++ b/docs/en/tutorials/pp-tsm.md
@@ -0,0 +1,32 @@
+# High-Performance 2D Recognition Architecture PP-TSM
+
+PP-TSM: an effective and efficient video recognition model
+
+PP-TSM is an optimized model based on TSM in PaddleVideo.
+Its performance (top-1 on UCF101 and Kinetics-400) and inference speed
+are better than the TSM paper (https://arxiv.org/abs/1811.08383 ) and
+other open-source TSM implementations. PaddlePaddle 2.0 (available on pip now) or the
+Daily Version ( https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-dev )
+is required to run PP-TSM.
+
+Using only ImageNet pretraining and only 8x1 sampling,
+PP-TSM's top-1 reaches 89.5% on UCF101 and 73.5% on Kinetics-400,
+and the FP32 inference speed on a single V100 is 147 VPS on the Kinetics-400 dataset.
+The FP16 inference speed with TensorRT on a single V100 is TODO.
+
+As far as we know, under the same conditions,
+top-1 = 73.5% on Kinetics-400 is the best performance of a 2D video model so far.
+
+
+PP-TSM improves the performance and speed of TSM with the following methods:
+1. Model tweaks: ResNet50vd, +2.5%
+2. ImageNet pretrain weights based on knowledge distillation, +1.3%
+3. Better batch size, +0.2%
+4. Better L2 regularization, +0.3%
+5. Label smoothing, +0.2%
+6. Better lr decay, +0.15%
+7. Data augmentation, +0.3%
+8. Better epoch num, +0.15%
+9. BN strategy, +0.4%
+10. Integrated PaddleInference
+11. More strategies (todo): knowledge distillation, optimizer and so on.
diff --git a/docs/en/tutorials/summarize.md b/docs/en/tutorials/summarize.md
new file mode 100644
index 0000000000000000000000000000000000000000..6bd49bf7dcc79adc6135935004f8f922bd1a0a66
--- /dev/null
+++ b/docs/en/tutorials/summarize.md
@@ -0,0 +1,208 @@
+[简体中文](../../zh-CN/tutorials/summarize.md) | English
+
+# Introduction for video classification(action recognition)
+
+## Wide range of application scenarios
+Video classification has a wide range of applications in many fields: online, such as short-video platforms; offline, such as security, transportation, quality inspection and other fields.
+
+
+## Multiple subtasks
+Similar to image tasks, video tasks can also be divided into two categories: **classification (recognition) and detection**, and these two types of tasks can be specifically subdivided by combining different scenes:
+
++ Task 1: Trimmed Action Recognition. Users input a trimmed video, which contains only a single action, and the model outputs a video tag, as depicted in the figure below:
+
+
+ Action Classification
+
+
+ In terms of the data modality used, classification tasks can be further subdivided into classification based on single-modality data, multi-modality data, RGB images, human skeletons, etc., as shown in the figure below:
+
+
+
+ multi-modality
+
+In terms of the perspective of video, it can also be divided into first-person action recognition,
+third-person action recognition, single perspective action recognition and multi-perspective fusion action recognition.
+Users who are interested in these fields can refer to the relevant literature.
+
++ Task 2: Untrimmed Video Classification.
+Unlike trimmed videos, untrimmed videos often contain multiple actions and have a long time span.
+There are many movements that we may not need to pay attention to. The model analyzes the input long video globally and then softly assigns it to multiple categories.
+
++ Task 3: Temporal Action Proposal. It is similar to ROI extraction in image detection tasks.
+The task is to find the video clips that may contain actions in a long video that contains many actions.
+
++ Task 4: Temporal Action Localization. Compared with the temporal action proposal task mentioned above,
+the temporal action localization task is more analogous to the detection task in the image field:
+it requires not only finding the video segments that may contain actions but also classifying them,
+as shown in the figure below.
+
+
+ Action Detection
+
+
++ Task 5: Dense-Captioning Events. It is called dense-captioning events mainly
+because this task requires describing video actions on top of temporal action localization
+(detection). That is to say, the task needs to locate the actions of an **untrimmed** video in the **temporal
+dimension** and, after obtaining the many video segments that contain actions, describe the behavior of the **whole video**.
+
+## Introduction of datasets
+
+### Classification datasets
+
+The training and validation of the model cannot be done without comprehensive,
+large and well annotated datasets. With the deepening of research on video action recognition,
+more and more datasets are applied to the research in this field.
+Typical datasets are as follows:
+
++ KTH[1](#1)
+
+The KTH dataset is an early, small action recognition dataset,
+including 599 videos of 6 types of actions (walking, jogging, running, boxing, hand waving and hand clapping).
+The background is relatively still, except for the zoom in and out of the camera,
+the camera movement is relatively slight. Since this data set is relatively small,
+it is easy to overfit when training heavy 3D networks,
+so most current research is not based on it.
+
++ UCF101[2](#2)
+
+UCF101 is a medium-size dataset in which most videos are from YouTube.
+It contains 13,320 videos with 101 types of actions.
+Each type of action is performed by 25 people, each of whom performs 4-7 sets of actions.
+The UCF101 and HMDB51 datasets used to be the benchmarks to evaluate the effectiveness of action
+recognition model for a long time before the Kinetics dataset was released.
+
++ HMDB51[3](#3)
+
+The HMDB51 dataset, proposed by Brown University, was released in 2011.
+Most of the videos come from movies,
+but some come from public databases and online video libraries such as YouTube.
+The dataset contains 6849 samples divided into 51 classes,
+each of which contains at least 101 samples.
+
++ Kinetics[4](#4)
+
+Kinetics is the most important large-scale action recognition dataset, which was proposed by Google's DeepMind team in 2017. The video data also comes from YouTube, with 400 categories (now expanded to 700 categories) and more than 300,000 videos (now expanded to 600,000 videos), each lasting about 10 seconds.
+The action categories are mainly divided into three groups: "human", "human and animal", "human and human interaction". Kinetics can train a 3D-ResNet of up to 152 layers without overfitting,
+which solves the problem that the previous training dataset is too small to train deep 3D network.
+Kinetics has replaced UCF101 and HMDB51 as the benchmark in the field of action recognition.
+At present, most studies use this dataset for evaluation and pre-training.
+
++ Something-Something[5](#5)
+
+Something-Something V1 contains 108,499 annotated videos (V2 has expanded to 220,847), each of which lasts two to six seconds. These videos cover 174 kinds of actions. Different from the previous datasets,
+recognition on this dataset requires stronger temporal information,
+so it is an important reference for testing the temporal modeling ability of a model.
+
+In addition to the above datasets, there are the Charades[6](#6) dataset for complex action recognition, Breakfast Action[7](#7), and Sports-1M[8](#8).
+
+
+### Detection datasets
+
++ THUMOS 2014
+
+This dataset is from the THUMOS Challenge 2014. Its training set is UCF101, and the validation and test sets include 1010 and 1574 untrimmed video clips respectively. For the action detection task, only 20 classes of untrimmed videos are annotated with temporal action segments,
+including 200 validation videos (3007 action segments) and 213 test videos (3358 action segments).
+
++ MEXaction2
+
+The Mexaction2 dataset contains two types of action: horse riding and bullfighting.
+The dataset consists of three parts: YouTube videos, horseback riding videos in UCF101, and INA videos.
+YouTube clips and horseback riding videos in UCF101 are short segmented video clips that are used as training sets.
+The INA video is a long unsegmented video with a total length of 77 hours,
+and it is divided into three parts: training, validation and test.
+There are 1336 action segments in the training set, 310 in the validation set and 329 in the test set.
+Moreover, the Mexaction2 dataset is characterized by very long unsegmented video lengths,
+and marked action segments only account for a very low proportion of the total video length.
+
++ ActivityNet
+
+Currently the largest dataset of this kind, it covers both classification and detection tasks.
+This dataset only provides YouTube links to the videos, not direct downloads,
+so you also need to use a YouTube download tool in Python to automatically download the videos.
+The dataset contains 200 action categories, 20,000 (training + validation + test) videos,
+and a total of about 700 hours of video.
+
+## Introduction of classic models
+As shown in the figure,
+the action recognition framework mainly includes three steps:
+feature extraction, motion representation and classification.
+How to extract spatiotemporal features of video is the core problem of action recognition and video classification.
+
+
+Framework of action recognition
+
+Action recognition (video classification) methods can be roughly divided into two stages:
+hand-crafted feature-based methods and deep learning-based methods.
+Typical motion descriptors of the hand-crafted stage include DT and iDT,
+which were the motion descriptors most widely accepted by researchers before deep learning was applied to this field.
+Interested readers may refer to the relevant references at the end of this page.
+Since 2014, deep learning methods have gradually been applied to video classification.
+At present, deep learning-based methods have become a research hotspot in both academia and industry, and their performance far exceeds hand-designed motion features.
+Since 2014, researchers have proposed many classic network structures for representing motion characteristics,
+as shown in the figure below:
+
+
+Classic Models
+
+
+At present, PaddleVideo contains several classic models such as TSN[9](#9), TSM[10](#10), SlowFast[11](#11), etc. In the future,
+we will analyze the classic models and papers in these fields. Please stay tuned.
+
+
+## Introduction of competitions
++ [ActivityNet](http://activity-net.org/challenges/2020/challenge.html)
+
+ActivityNet is a large-scale action recognition competition. Since 2016,
+it has been held simultaneously with CVPR every year. Up to this year,
+it has been held for 4 consecutive sessions. It focuses on identifying everyday, high-level, goal-oriented activities from
+user-generated videos taken from the Internet video portal YouTube.
+At present, the ActivityNet challenge has become the most influential competition in the field of action recognition.
+
+## Reference
+
+
+[1] Schuldt C, Laptev I, Caputo B.Recognizing Human Actions: A Local SVM Approach Proceedings of International Conference on Pattern Recognition. Piscataway, NJ: IEEE, 2004:23-26
+
+
+
+[2] Soomro K, Zamir A R, Shah M. UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild. arXiv:1212.0402,2012.
+
+
+
+[3] Kuehne H, Jhuang H, Garrote E, et al. HMDB: a large video database for human motion recognition Proceedings of IEEE International Conference on Computer Vision. Piscataway, NJ: IEEE, 2011:2556-2563.
+
+
+
+[4] Carreira J , Zisserman A . Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2017:6299-6308.
+
+
+
+[5] Goyal R, Kahou S E, Michalski V. The “something something” video database for learning and evaluating visual common sense. arXiv:1706.04261,2017.
+
+
+
+[6] Sigurdsson G A, Varol Gül, Wang Xiaolong, et al. Hollywood in Homes: Crowdsourcing Data Collection for Activity Understanding. arXiv:1604.01753, 2016.
+
+
+
+[7] Kuehne H, Arslan A, Serre T. The Language of Actions Recovering the Syntax and Semantics of Goal-Directed Human Activities Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2014.
+
+
+
+[8] Karpathy A , Toderici G , Shetty S , et al. Large-Scale Video Classification with Convolutional Neural Networks Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2014:1725-1732.
+
+
+
+[9] Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, and Luc Van Gool. Temporal Segment Networks: Towards Good Practices for Deep Action Recognition. In Proceedings of the European Conference on Computer Vision, pages 20–36. Springer, 2016.
+
+
+
+[10] Lin Ji , Gan Chuang , Han Song . TSM: Temporal Shift Module for Efficient Video Understanding. arXiv:1811.08383,2018.
+
+
+
+[11] Feichtenhofer C , Fan Haoqi , Malik J , et al. SlowFast Networks for Video Recognition. arXiv:1812.03982,2018.
+
+
+
diff --git a/docs/en/usage.md b/docs/en/usage.md
new file mode 100644
index 0000000000000000000000000000000000000000..612fd576799cc33f5ca2c73912965f03d423b16e
--- /dev/null
+++ b/docs/en/usage.md
@@ -0,0 +1,177 @@
+[简体中文](../zh-CN/usage.md) | English
+
+# Usage
+---
+
+Please refer to the [installation documents](./install.md) to prepare the environment, and follow the steps in the [data preparation documents](./dataset/) to construct the dataset. This page walks you through the basic functions supported by PaddleVideo, taking the ucf101 dataset in frame format as an example.
+
+PaddleVideo currently only supports the Linux operating system and GPU runtime environments.
+
+The default destination folders of PaddleVideo files, taking the [example config](../../configs/example.yaml) as an example:
+
+```
+PaddleVideo
+ ├── paddlevideo
+ ├── ... #other source codes
+ ├── output #ouput destination
+ | ├── example
+ | | ├── example_best.pdparams #path_to_weights
+ | | └── ...
+ | └── ...
+ ├── log #log file destination.
+ | ├── worker.0
+ | ├── worker.1
+ | └── ...
+ └── inference #inference files destination.
+        ├── .pdiparams file
+        ├── .pdmodel file
+        └── .pdiparams.info file
+```
+
+
+## 1. Train and Test
+
+Start multi-card training or test scripts with `paddle.distributed.launch`, or run `run.sh` directly.
+
+```bash
+sh run.sh
+```
+
+We have put all the start commands in ```run.sh``` in advance; please uncomment the selected one to run.
+
+
+
+### 1.1 Train
+
+Switch `--validate` on to validate while training.
+
+```bash
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python3 -m paddle.distributed.launch \
+ --gpus="0,1,2,3" \
+ main.py \
+ --validate \
+ -c ./configs/example.yaml
+```
+
+Use `-c` to specify the configuration file, and flexibly add `-o` in the script to override it.
+
+```bash
+python -m paddle.distributed.launch \
+ --gpus="0,1,2,3" \
+ main.py \
+ -c ./configs/example.yaml \
+ --validate \
+ -o DATASET.batch_size=16
+```
+Specifying `-o DATASET.batch_size=16` updates the batch size to 16; please refer to [configuration](tutorials/config.md#config-yaml-details) for more information.
+
+After training starts, log files will be generated in the format shown below, and the logs are written both to the screen and to files. The default log destination is the `./log/` folder, with files named like `worker.0`, `worker.1`, ...
+
+[train phase] current time, current epoch/ total epoch, batch id, metrics, elapse time, ips, etc.:
+
+ [12/28 17:31:26] epoch:[ 1/80 ] train step:0 loss: 0.04656 lr: 0.000100 top1: 1.00000 top5: 1.00000 elapse: 0.326 reader: 0.001s ips: 98.22489 instance/sec.
+
+[eval phase] current time, current epoch/ total epoch, batch id, metrics, elapse time, ips, etc.:
+
+
+ [12/28 17:31:32] epoch:[ 80/80 ] val step:0 loss: 0.20538 top1: 0.88281 top5: 0.99219 elapse: 1.589 reader: 0.000s ips: 20.14003 instance/sec.
+
+
+[epoch end] current time, metrics, elapse time, ips, etc.
+
+ [12/28 17:31:38] END epoch:80 val loss_avg: 0.52208 top1_avg: 0.84398 top5_avg: 0.97393 elapse_avg: 0.234 reader_avg: 0.000 elapse_sum: 7.021s ips: 136.73686 instance/sec.
+
+[the best Acc]
+
+ [12/28 17:28:42] Already save the best model (top1 acc)0.8494
+
+
+### 1.2 Resume
+
+Specify `-o resume_epoch` to resume training from the given epoch. PaddleVideo automatically loads the optimizer parameters and checkpoints from the `./output` folder, as it is the default output destination.
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python3 -m paddle.distributed.launch \
+ --gpus="0,1,2,3" \
+ main.py \
+ -c ./configs/example.yaml \
+ --validate \
+ -o resume_epoch=5
+
+```
+
+
+### 1.3 Finetune
+
+Specify `--weights` to load pretrained parameters; PaddleVideo will automatically treat it as a finetuning task.
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python3 -m paddle.distributed.launch \
+ --gpus="0,1,2,3" \
+ main.py \
+ -c ./configs/example.yaml \
+ --validate \
+ --weights=./outputs/example/path_to_weights
+```
+
+Note: PaddleVideo will NOT load parameters whose shapes do not match.
+
+
+### 1.4 Test
+
+Switch on `--test` to start test mode, and specify `--weights` to load the trained model.
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python3 -m paddle.distributed.launch \
+ --gpus="0,1,2,3" \
+ main.py \
+ -c ./configs/example.yaml \
+ --test \
+ --weights=./output/example/path_to_weights
+```
+
+
+
+
+## 2. Infer
+
+First, export the inference model.
+Use `-c` to set the configuration file, `-p` to load the trained weights, and `-o` to set the destination of the inference files.
+
+```bash
+python tools/export_model.py \
+ -c ./configs/example.yaml \
+ -p ./output/example/path_to_weights \
+ -o ./inference
+```
+
+
+It will generate `model_name.pdmodel`, `model_name.pdiparams` and `model_name.pdiparams.info`.
+Second, start the Paddle Inference engine to run inference on a video.
+
+```bash
+python tools/predict.py \
+ --input_file "data/example.avi" \
+ --model_file "./inference/example.pdmodel" \
+ --params_file "./inference/example.pdiparams" \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+Attributes:
++ `input_file`: input file path, or an input directory containing the input file(s).
++ `model_file`: pdmodel file path.
++ `params_file`: pdiparams file path.
++ `use_tensorrt`: whether to use TensorRT to accelerate inference, default: False.
++ `use_gpu`: whether to use GPU for inference, default: True.
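+
+For example, a CPU-only run over a directory of input videos might look like the following. This is only a sketch that reuses the exported example model from above; the directory path is illustrative.
+
+```bash
+# Hypothetical example: run inference on every video inside data/videos_dir/
+# with the exported "example" model, on CPU and without TensorRT.
+python tools/predict.py \
+    --input_file "data/videos_dir/" \
+    --model_file "./inference/example.pdmodel" \
+    --params_file "./inference/example.pdiparams" \
+    --use_gpu=False \
+    --use_tensorrt=False
+```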
+
+Benchmark results are shown in the [benchmark](./benchmark.md) document.
diff --git a/docs/en/whl_en.md b/docs/en/whl_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..dd989cc79dfc7de597b2966d579213290e43edee
--- /dev/null
+++ b/docs/en/whl_en.md
@@ -0,0 +1,185 @@
+[简体中文](../zh-CN/whl_zh.md) | English
+# paddlevideo package
+
+## Get started quickly
+
+### install package
+
+install by pypi
+```bash
+python3.7 -m pip install paddlevideo==0.0.1
+```
+**Note:** if you have difficulty installing opencv-python, you can try:
+
+```
+python3.7 -m pip install opencv-python==4.2.0.32 -i https://pypi.doubanio.com/simple
+```
+
+Or build the wheel package yourself and install it:
+```bash
+python3.7 setup.py bdist_wheel
+python3.7 -m pip install dist/paddlevideo-0.0.1-py3-none-any.whl
+```
+
+### 1. Quick Start
+
+* Assign `video_file='data/example.avi'` and use the inference model provided by Paddle, `model_name='ppTSM'`:
+
+
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_name='ppTSM',use_gpu=False,use_tensorrt=False)
+video_file='data/example.avi'
+result = clas.predict(video_file)
+print(result)
+```
+
+```
+ >>> result
+ [{'videoname': 'data/example.avi', 'class_ids': [5], 'scores': [0.9621570706367493], 'label_names': ['archery']}]
+```
+
+* Using the command line:
+```bash
+ppvideo --model_name='ppTSM' --video_file='data/example.avi'
+```
+
+```
+ >>> result
+ **********data/example.avi**********
+ [{'videoname': 'data/example.avi', 'class_ids': [5], 'scores': [0.9621570706367493], 'label_names': ['archery']}]
+```
+
+
+### 2. Definition of Parameters
+* model_name(str): model name. If `model_file` and `params_file` are not assigned, you can assign this parameter instead. When using an inference model provided by Paddle (trained on Kinetics-400), the default is 'ppTSM'.
+* video_file(str): video path. Supports a single local video, an internet video URL, or a folder containing a series of videos. Also supports numpy.ndarray input.
+* use_gpu(bool): whether to use GPU, default=False.
+* num_seg(int): number of segments used by the sampling strategy proposed in TSN.
+* seg_len(int): number of frames in each segment.
+* short_size(int): resize the shorter side (the smaller of height and width) to this value, default=256.
+* target_size(int): the final spatial size of each frame, default=224.
+* normalize(bool): whether to normalize the frames, default=True.
+* model_file(str): path of inference.pdmodel. If this parameter is not assigned, you need to assign `model_name` so the model can be downloaded.
+* params_file(str): path of inference.pdiparams. If this parameter is not assigned, you need to assign `model_name` so the model can be downloaded.
+* batch_size(int): batch size, default=1.
+* use_fp16(bool): whether to use float16 in memory, default=False.
+* use_tensorrt(bool): whether to enable TensorRT. Enabling it can greatly improve prediction performance, default=False.
+* gpu_mem(int): GPU memory usage, default=8000.
+* top_k(int): return the top_k predictions, default=1.
+* enable_mkldnn(bool): whether to enable MKLDNN, default=False.
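+
+Putting several of the parameters above together, a hypothetical call could look like this (the values are illustrative, not recommended settings):
+
+```python
+from ppvideo import PaddleVideo
+
+# Illustrative combination of parameters; adjust them to your own model and data.
+clas = PaddleVideo(model_name='ppTSM',
+                   num_seg=8,
+                   short_size=256,
+                   target_size=224,
+                   batch_size=1,
+                   top_k=5,
+                   use_gpu=False)
+result = clas.predict('data/example.avi')
+print(result)
+```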
+
+
+### 3. Different Usages of Codes
+
+**We provide two ways to use the package: 1. Python interactive programming; 2. Bash command line.**
+
+* check `help` information
+```bash
+ppvideo -h
+```
+
+* To use a user-specified model, you need to assign the model path `model_file` and the parameters path `params_file`:
+
+###### python
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_file='user-specified model path',
+                   params_file='params path', use_gpu=False, use_tensorrt=False)
+video_file = ''
+result = clas.predict(video_file)
+print(result)
+```
+
+###### bash
+```bash
+ppvideo --model_file='user-specified model path' --params_file='params path' --video_file='video path'
+```
+
+
+* To use an inference model provided by PaddlePaddle for prediction, choose a model when initializing ppvideo by assigning `model_name`. In this case you do not need to assign `model_file`; the chosen model will be downloaded to `BASE_INFERENCE_MODEL_DIR` and saved in a folder named after `model_name`, so different inference models do not overwrite each other.
+
+###### python
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_name='ppTSM',use_gpu=False, use_tensorrt=False)
+video_file = ''
+result = clas.predict(video_file)
+print(result)
+```
+
+###### bash
+```bash
+ppvideo --model_name='ppTSM' --video_file='video path'
+```
+
+* You can pass a preprocessed `np.ndarray` as the input: `--video_file=np.ndarray`.
+
+
+###### python
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_name='ppTSM',use_gpu=False, use_tensorrt=False)
+video_file = np.ndarray
+result = clas.predict(video_file)
+```
+
+###### bash
+```bash
+ppvideo --model_name='ppTSM' --video_file=np.ndarray
+```
+
+* You can assign `video_file` to a folder containing a series of videos, and optionally assign `top_k`.
+
+###### python
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_name='ppTSM',use_gpu=False, use_tensorrt=False,top_k=5)
+video_file = '' # it can be video_file folder path which contains all of videos you want to predict.
+result = clas.predict(video_file)
+print(result)
+```
+
+###### bash
+```bash
+ppvideo --model_name='ppTSM' --video_file='video path' --top_k=5
+```
+
+* You can assign `--label_name_path` to your own label dict file, whose format should be `class_id class_name<\n>`, for example:
+
+```
+0 abseiling
+1 air_drumming
+2 answering_questions
+3 applauding
+4 applying_cream
+5 archery
+......
+```
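+
+As a small aside, such a label file can be loaded into a `{class_id: class_name}` dict with plain Python (a sketch, assuming the default Kinetics-400 label file path):
+
+```python
+# Build a mapping from class id (as a string) to class name.
+with open('./data/k400/Kinetics-400_label_list.txt') as f:
+    id2name = dict(line.strip().split(' ', 1) for line in f if line.strip())
+print(id2name['5'])  # 'archery'
+```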
+
+* If you use an inference model provided by Paddle, you do not need to assign `label_name_path`; the program takes `data/k400/Kinetics-400_label_list.txt` as the default. If you use your own trained model, you can provide `label_name_path` so that 'label_name' is output together with the scores; otherwise, no 'label_name' appears in the output.
+
+###### python
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_file='./inference.pdmodel', params_file='./inference.pdiparams', label_name_path='./data/k400/Kinetics-400_label_list.txt', use_gpu=False)
+video_file = '' # it can be video_file folder path which contains all of videos you want to predict.
+result = clas.predict(video_file)
+print(result)
+```
+###### bash
+```bash
+ppvideo --model_file='./inference.pdmodel' --params_file='./inference.pdiparams' --video_file='video path' --label_name_path='./data/k400/Kinetics-400_label_list.txt'
+```
+* If you use the inference model provided by Paddle, simply assign `model_name` and keep the default label file:
+
+###### python
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_name='ppTSM',use_gpu=False)
+video_file = '' # it can be video_file folder path which contains all of videos you want to predict.
+result = clas.predict(video_file)
+print(result)
+```
+###### bash
+```bash
+ppvideo --model_name='ppTSM' --video_file='video path'
+```
diff --git a/docs/images/BMN.png b/docs/images/BMN.png
new file mode 100644
index 0000000000000000000000000000000000000000..ea0519812260ac1ba180d8e5de5b78f8f63c3edd
Binary files /dev/null and b/docs/images/BMN.png differ
diff --git a/docs/images/FootballAction.gif b/docs/images/FootballAction.gif
new file mode 100644
index 0000000000000000000000000000000000000000..2447251768429225baf8ab0b377dcedbb1d10bf2
Binary files /dev/null and b/docs/images/FootballAction.gif differ
diff --git a/docs/images/SlowFast.png b/docs/images/SlowFast.png
new file mode 100644
index 0000000000000000000000000000000000000000..9b7db836fe38a94f55c92ba93601e0f07dc9b2ac
Binary files /dev/null and b/docs/images/SlowFast.png differ
diff --git a/docs/images/VideoTag.gif b/docs/images/VideoTag.gif
new file mode 100644
index 0000000000000000000000000000000000000000..e60ddfbe4150db86afedf4e6bbd4170e6d8effb2
Binary files /dev/null and b/docs/images/VideoTag.gif differ
diff --git a/docs/images/acc_vps.jpeg b/docs/images/acc_vps.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..3b42cdd2dca6b3d9f7e9ee1ae5fd1d4378c4c194
Binary files /dev/null and b/docs/images/acc_vps.jpeg differ
diff --git a/docs/images/actbert.png b/docs/images/actbert.png
new file mode 100644
index 0000000000000000000000000000000000000000..40b21e2c57d6b4a11757108c3bee3259a652c3da
Binary files /dev/null and b/docs/images/actbert.png differ
diff --git a/docs/images/action_classification.png b/docs/images/action_classification.png
new file mode 100644
index 0000000000000000000000000000000000000000..13e7f698c1c0cdfdf1776f49487f8458fd179aac
Binary files /dev/null and b/docs/images/action_classification.png differ
diff --git a/docs/images/action_detection.png b/docs/images/action_detection.png
new file mode 100644
index 0000000000000000000000000000000000000000..9ddbd6234efaa06462a981c1923219a002b937e4
Binary files /dev/null and b/docs/images/action_detection.png differ
diff --git a/docs/images/action_framework.png b/docs/images/action_framework.png
new file mode 100644
index 0000000000000000000000000000000000000000..7cc33271ae010d808a94a1eb220d851d48d74dc1
Binary files /dev/null and b/docs/images/action_framework.png differ
diff --git a/docs/images/application.png b/docs/images/application.png
new file mode 100644
index 0000000000000000000000000000000000000000..7772408987d79573768b1ab66538fa8f2f5c586b
Binary files /dev/null and b/docs/images/application.png differ
diff --git a/docs/images/asrf.png b/docs/images/asrf.png
new file mode 100644
index 0000000000000000000000000000000000000000..3f49edde2b53af9b8010f4e8ccac2ec6a18724d5
Binary files /dev/null and b/docs/images/asrf.png differ
diff --git a/docs/images/cfbi.png b/docs/images/cfbi.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc34629ffe75e089eebb5e7aaf48ba3fa2045446
Binary files /dev/null and b/docs/images/cfbi.png differ
diff --git a/docs/images/classic_model.png b/docs/images/classic_model.png
new file mode 100644
index 0000000000000000000000000000000000000000..21e849e1215037e92459a6aa6e678c3195b5fdc0
Binary files /dev/null and b/docs/images/classic_model.png differ
diff --git a/docs/images/contribute/001_fork.png b/docs/images/contribute/001_fork.png
new file mode 100644
index 0000000000000000000000000000000000000000..50a920dc0e5b4f8b5eb93219471bde699105d2ff
Binary files /dev/null and b/docs/images/contribute/001_fork.png differ
diff --git a/docs/images/contribute/002_clone.png b/docs/images/contribute/002_clone.png
new file mode 100644
index 0000000000000000000000000000000000000000..484e24f4319601ca0a15d2f8ca8e238c44d78e38
Binary files /dev/null and b/docs/images/contribute/002_clone.png differ
diff --git a/docs/images/contribute/003_precommit.png b/docs/images/contribute/003_precommit.png
new file mode 100644
index 0000000000000000000000000000000000000000..067fb75ddb222ab0b9c71a46619c3fe7b239bc26
Binary files /dev/null and b/docs/images/contribute/003_precommit.png differ
diff --git a/docs/images/contribute/004_pr.png b/docs/images/contribute/004_pr.png
new file mode 100644
index 0000000000000000000000000000000000000000..489141610c32f45066deb0dd2f8364ee578d68c5
Binary files /dev/null and b/docs/images/contribute/004_pr.png differ
diff --git a/docs/images/ctrgcn.jpg b/docs/images/ctrgcn.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..899da363939508c36ed8f6b7f28e4a4692935776
Binary files /dev/null and b/docs/images/ctrgcn.jpg differ
diff --git a/docs/images/home.gif b/docs/images/home.gif
new file mode 100644
index 0000000000000000000000000000000000000000..1335bb00d7fe19f6ad3df0923210867c8935107c
Binary files /dev/null and b/docs/images/home.gif differ
diff --git a/docs/images/i3d_compare.jpg b/docs/images/i3d_compare.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..548e9daabf561c4290d671f495590de8b5482038
Binary files /dev/null and b/docs/images/i3d_compare.jpg differ
diff --git a/docs/images/i3d_expand.jpg b/docs/images/i3d_expand.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c183e1cb256f9f57d69b7fe69fc61d9fc72e552a
Binary files /dev/null and b/docs/images/i3d_expand.jpg differ
diff --git a/docs/images/i3d_expriment1.jpg b/docs/images/i3d_expriment1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..daee822d8a21b76aa08b5b06874d7d295bc3c87c
Binary files /dev/null and b/docs/images/i3d_expriment1.jpg differ
diff --git a/docs/images/i3d_expriment2.jpg b/docs/images/i3d_expriment2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..499188e000463d33385c0d6254fb50f1f2376aaa
Binary files /dev/null and b/docs/images/i3d_expriment2.jpg differ
diff --git a/docs/images/joinus.PNG b/docs/images/joinus.PNG
new file mode 100644
index 0000000000000000000000000000000000000000..00da92edadca77f4f256f299dc2d0ceb17bcbe7d
Binary files /dev/null and b/docs/images/joinus.PNG differ
diff --git a/docs/images/mstcn.PNG b/docs/images/mstcn.PNG
new file mode 100644
index 0000000000000000000000000000000000000000..354b5308817dcaf29e1a04972d42c7577eabc54b
Binary files /dev/null and b/docs/images/mstcn.PNG differ
diff --git a/docs/images/multimodality.png b/docs/images/multimodality.png
new file mode 100644
index 0000000000000000000000000000000000000000..22c4f3b2e94fb0a1290bc0c9e89ece051e0aed6f
Binary files /dev/null and b/docs/images/multimodality.png differ
diff --git a/docs/images/oxford_image.png b/docs/images/oxford_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..5c1f090947e9784c0e7685cfeb9a54ea66c38fb9
Binary files /dev/null and b/docs/images/oxford_image.png differ
diff --git a/docs/images/oxford_image_depth.png b/docs/images/oxford_image_depth.png
new file mode 100644
index 0000000000000000000000000000000000000000..ad74126b50120cb47a2b13f905d400f793b948c1
Binary files /dev/null and b/docs/images/oxford_image_depth.png differ
diff --git a/docs/images/residual_tsm.png b/docs/images/residual_tsm.png
new file mode 100644
index 0000000000000000000000000000000000000000..c7e1dcb458b0dcca943d9012dc1f701e077fe064
Binary files /dev/null and b/docs/images/residual_tsm.png differ
diff --git a/docs/images/skeleton_example.png b/docs/images/skeleton_example.png
new file mode 100644
index 0000000000000000000000000000000000000000..701603b25e2cb60128f1bef2ea92e0bf13b9216c
Binary files /dev/null and b/docs/images/skeleton_example.png differ
diff --git a/docs/images/slowfast_network.jpg b/docs/images/slowfast_network.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8ce3b9e9579cc047d252935a12775c30f4a24523
Binary files /dev/null and b/docs/images/slowfast_network.jpg differ
diff --git a/docs/images/slowfast_structure.jpg b/docs/images/slowfast_structure.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..17b955efeb96f3ba84d84c11ffebbde86d0a1731
Binary files /dev/null and b/docs/images/slowfast_structure.jpg differ
diff --git a/docs/images/st-gcn.png b/docs/images/st-gcn.png
new file mode 100644
index 0000000000000000000000000000000000000000..a52c4277d74fb9dbf270b19824f641a31fe41e47
Binary files /dev/null and b/docs/images/st-gcn.png differ
diff --git a/docs/images/temporal.png b/docs/images/temporal.png
new file mode 100644
index 0000000000000000000000000000000000000000..20cde2e9598590ee49d5fd5aef5c4a7d96bbff13
Binary files /dev/null and b/docs/images/temporal.png differ
diff --git a/docs/images/timesformer_attention_arch.png b/docs/images/timesformer_attention_arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..4d331f12cb343a37cbcefd11ac1e9a7220660791
Binary files /dev/null and b/docs/images/timesformer_attention_arch.png differ
diff --git a/docs/images/timesformer_attention_visualize.png b/docs/images/timesformer_attention_visualize.png
new file mode 100644
index 0000000000000000000000000000000000000000..d7546ede2ddf98d6ba1ae5293e960f6b06abbfe1
Binary files /dev/null and b/docs/images/timesformer_attention_visualize.png differ
diff --git a/docs/images/torch_tsm.png b/docs/images/torch_tsm.png
new file mode 100644
index 0000000000000000000000000000000000000000..d4fde0cca78f6a708e071e2aa5cc11cdc861ccd9
Binary files /dev/null and b/docs/images/torch_tsm.png differ
diff --git a/docs/images/transnetv2.png b/docs/images/transnetv2.png
new file mode 100644
index 0000000000000000000000000000000000000000..8b48e8c6bfa9112c4c17121d88d6a7158e5aefab
Binary files /dev/null and b/docs/images/transnetv2.png differ
diff --git a/docs/images/tsm_architecture.png b/docs/images/tsm_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..286792350b8d1498780f6c3dcd987031ec87321a
Binary files /dev/null and b/docs/images/tsm_architecture.png differ
diff --git a/docs/images/tsm_intr.png b/docs/images/tsm_intr.png
new file mode 100644
index 0000000000000000000000000000000000000000..c8e32e7328b0970d28fe209a81029a4cf2b35e11
Binary files /dev/null and b/docs/images/tsm_intr.png differ
diff --git a/docs/images/tsm_op.png b/docs/images/tsm_op.png
new file mode 100644
index 0000000000000000000000000000000000000000..dc853257558b7ed5c80b24c6d56a39c3e12e81ff
Binary files /dev/null and b/docs/images/tsm_op.png differ
diff --git a/docs/images/tsn_architecture.png b/docs/images/tsn_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..d605f089d2a5d4b55193c283ed566c712bf12180
Binary files /dev/null and b/docs/images/tsn_architecture.png differ
diff --git a/docs/images/tsn_input.jpg b/docs/images/tsn_input.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..391179c55ba8c457cee7de7b38a881e8095dcadc
Binary files /dev/null and b/docs/images/tsn_input.jpg differ
diff --git a/docs/images/tsn_structure.jpg b/docs/images/tsn_structure.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f7d1ddb9e03b00eb137c3e479a7bbd43a7ecff27
Binary files /dev/null and b/docs/images/tsn_structure.jpg differ
diff --git a/docs/images/user_group.png b/docs/images/user_group.png
new file mode 100644
index 0000000000000000000000000000000000000000..e3dbfb33b31b8dd3e9fb2bbad211911621f9b20d
Binary files /dev/null and b/docs/images/user_group.png differ
diff --git a/docs/images/videodata.png b/docs/images/videodata.png
new file mode 100644
index 0000000000000000000000000000000000000000..f1400fec4f46a1041e1d8b7380cd7d42ce2cec7e
Binary files /dev/null and b/docs/images/videodata.png differ
diff --git a/docs/images/videoswin.jpg b/docs/images/videoswin.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1d16dbb054bc9055ff8d30ddb55078eac83cfd9c
Binary files /dev/null and b/docs/images/videoswin.jpg differ
diff --git a/docs/zh-CN/benchmark.md b/docs/zh-CN/benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..6624206a92a885c0f9888f8454c914718c63d0b6
--- /dev/null
+++ b/docs/zh-CN/benchmark.md
@@ -0,0 +1,68 @@
+简体中文 | [English](../en/benchmark.md)
+
+# Benchmark
+
+此文档主要对比PaddleVideo模型库与主流模型库的训练速度。
+
+
+## 环境配置
+### 硬件环境
+
+- 8 NVIDIA Tesla V100 (16G) GPUs
+- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+
+### 软件环境
+
+- Python 3.7
+- PaddlePaddle2.0
+- CUDA 10.1
+- CUDNN 7.6.3
+- NCCL 2.1.15
+- GCC 8.2.0
+
+
+### 实验与评价指标
+
+实验中我们测量了平均训练时间,包括数据处理和模型训练两个部分,训练速度均采用每秒钟训练的样本数量(ips)来计量,
+数值越大说明训练速度越快,并且考虑到机器预热的问题,前50次迭代的时间没有被计算在内。
+
+在相同的数据和模型配置下对比了PaddleVideo和其他的视频理解工具箱,为了保证比较的公平性,对比实验都是在相同的硬件条件下进行,实验所用数据请参考[数据准备](./dataset/k400.md)
+观察下表可以发现PaddleVideo相比其他的视频理解框架在训练速度方面取得了巨大的提升,尤其是[Slowfast](../../configs/recognition/slowfast/slowfast.yaml)模型取得了将近一倍的训练速度的提升。
+对于每一种模型配置,我们采用了相同的数据预处理方法并且保证输入是相同的。
+
+## 实验结果
+### 分类模型
+
+| Model | batch size x gpus | PaddleVideo(ips) | Reference(ips) | MMAction2 (ips) | PySlowFast (ips)|
+| :------: | :-------------------:|:---------------:|:---------------: | :---------------: |:---------------: |
+| [TSM](../../configs/recognition/tsm/tsm.yaml) | 16x8 | 58.1 | 46.04(temporal-shift-module) | To do | X |
+| [PPTSM](../../configs/recognition/tsm/pptsm.yaml) | 16x8 | 57.6 | X | X | X |
+| [TSN](../../configs/recognition/tsn/tsn.yaml) | 16x8 | To do | To do (tsn-pytorch) | To do | X |
+| [Slowfast](../../configs/recognition/slowfast/slowfast.yaml)| 16x8 | 99.5 | X | To do | 43.2 |
+| [Attention_LSTM](../../configs/recognition/attention_lstm/attention_lstm.yaml) | 128x8 | 112.6 | X | X | X |
+
+
+### 定位模型
+
+| Model | PaddleVideo(ips) |MMAction2 (ips) |BMN(boundary matching network) (ips)|
+| :--- | :---------------: | :-------------------------------------: | :-------------------------------------: |
+| [BMN](../../configs/localization/bmn.yaml) | 43.84 | x | x |
+
+### 分割模型
+
+本仓库提供经典和热门时序动作分割模型的性能和精度对比
+
+| Model | Metrics | Value | Flops(M) |Params(M) | test time(ms) bs=1 | test time(ms) bs=2 | inference time(ms) bs=1 | inference time(ms) bs=2 |
+| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+| MS-TCN | F1@0.5 | 38.8% | 791.360 | 0.8 | 170 | - | 10.68 | - |
+| ASRF | F1@0.5 | 55.7% | 1,283.328 | 1.3 | 190 | - | 16.34 | - |
+
+* 模型名称:填写模型的具体名字,比如PP-TSM
+* Metrics:填写模型测试时所用的指标,使用的数据集为**breakfast**
+* Value:填写Metrics指标对应的数值,一般保留小数点后两位
+* Flops(M):模型一次前向运算所需的浮点运算量,可以调用PaddleVideo/tools/summary.py脚本计算(不同模型可能需要稍作修改),保留小数点后一位,使用数据**输入形状为(1, 2048, 1000)的张量**测得
+* Params(M):模型参数量,和Flops一起会被脚本计算出来,保留小数点后一位
+* test time(ms) bs=1:python脚本开batchsize=1测试时,一个样本所需的耗时,保留小数点后两位。测试使用的数据集为**breakfast**。
+* test time(ms) bs=2:python脚本开batchsize=2测试时,一个样本所需的耗时,保留小数点后两位。时序动作分割模型一般是全卷积网络,所以训练、测试和推理的batch_size都是1。测试使用的数据集为**breakfast**。
+* inference time(ms) bs=1:推理模型用GPU(默认V100)开batchsize=1测试时,一个样本所需的耗时,保留小数点后两位。推理使用的数据集为**breakfast**。
+* inference time(ms) bs=2:推理模型用GPU(默认V100)开batchsize=2测试时,一个样本所需的耗时,保留小数点后两位。时序动作分割模型一般是全卷积网络,所以训练、测试和推理的batch_size都是1。推理使用的数据集为**breakfast**。
diff --git a/docs/zh-CN/contribute/README.md b/docs/zh-CN/contribute/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c22ca32e4bfb7d434d62fd5f821da80493792cb1
--- /dev/null
+++ b/docs/zh-CN/contribute/README.md
@@ -0,0 +1,5 @@
+# 代码贡献指南
+
+- [1. 如何添加新算法](./add_new_algorithm.md)
+- [2. 配置系统设计解析](./config.md)
+- [3. 如何提pr](./how_to_contribute.md)
diff --git a/docs/zh-CN/contribute/add_new_algorithm.md b/docs/zh-CN/contribute/add_new_algorithm.md
new file mode 100644
index 0000000000000000000000000000000000000000..25218b342089c48452c1362fc02a74677299938f
--- /dev/null
+++ b/docs/zh-CN/contribute/add_new_algorithm.md
@@ -0,0 +1,414 @@
+# 添加新算法
+
+PaddleVideo将一个算法分解为以下几个部分,并对各部分进行模块化处理,方便快速组合出新的算法。
+
+* [1. 数据加载和处理](#1)
+* [2. 网络](#2)
+* [3. 优化器](#3)
+* [4. 训练策略](#4)
+* [5. 指标评估](#5)
+
+示例代码如下:
+```python
+import numpy as np
+import paddle
+from paddle.io import Dataset, DataLoader
+import paddle.nn as nn
+
+# 1. 数据加载和处理
+## 1.2 数据预处理Pipeline
+class ExamplePipeline(object):
+ """ Example Pipeline"""
+ def __init__(self, mean=0, std=1.0):
+ self.mean = mean
+ self.std = std
+
+ def __call__(self, results):
+ data = results['data']
+ norm_data = (data - self.mean) / self.std
+ results['data'] = norm_data
+ return results
+
+## 1.1 数据集类
+class ExampleDataset(Dataset):
+ """ExampleDataset"""
+ def __init__(self):
+ super(ExampleDataset, self).__init__()
+ self.x = np.random.rand(100, 20, 20)
+ self.y = np.random.randint(10, size = (100, 1))
+
+ def __getitem__(self, idx):
+ x_item = self.x[idx]
+ results = {}
+ results['data'] = x_item
+ pipeline = ExamplePipeline()
+ results = pipeline(results)
+ x_item = results['data'].astype('float32')
+ y_item = self.y[idx].astype('int64')
+ return x_item, y_item
+
+ def __len__(self):
+ return self.x.shape[0]
+
+train_dataset = ExampleDataset()
+## 1.3 封装为Dataloader对象
+train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
+
+# 2. 网络
+class ExampleModel(nn.Layer):
+ """Example Model"""
+ def __init__(self):
+ super(ExampleModel, self).__init__()
+        ## 2.1 网络Backbone
+ self.layer1 = paddle.nn.Flatten(1, -1)
+ self.layer2 = paddle.nn.Linear(400, 512)
+ self.layer3 = paddle.nn.ReLU()
+ self.layer4 = paddle.nn.Dropout(0.2)
+ ## 2.2 网络Head
+ self.layer5 = paddle.nn.Linear(512, 10)
+
+ def forward(self, x):
+ """ model forward"""
+ y = self.layer1(x)
+ y = self.layer2(y)
+ y = self.layer3(y)
+ y = self.layer4(y)
+ y = self.layer5(y)
+ return y
+
+model = ExampleModel()
+model.train()
+
+# 3. 优化器
+optim = paddle.optimizer.Adam(parameters=model.parameters())
+
+epochs = 5
+for epoch in range(epochs):
+ for batch_id, data in enumerate(train_loader()):
+ x_data = data[0]
+ y_data = data[1]
+ predicts = model(x_data)
+
+ ## 2.3 网络Loss
+ loss = paddle.nn.functional.cross_entropy(predicts, y_data)
+
+ acc = paddle.metric.accuracy(predicts, y_data)
+
+ loss.backward()
+ print("epoch: {}, batch_id: {}, loss is: {}, acc is: {}".format(epoch, batch_id, loss.numpy(), acc.numpy()))
+
+ optim.step()
+ optim.clear_grad()
+```
+上述代码的运行输出日志如下:
+```txt
+epoch: 0, batch_id: 0, loss is: [2.5613842], acc is: [0.]
+epoch: 0, batch_id: 1, loss is: [2.5776138], acc is: [0.1]
+epoch: 0, batch_id: 2, loss is: [2.551022], acc is: [0.1]
+epoch: 0, batch_id: 3, loss is: [2.782001], acc is: [0.]
+epoch: 0, batch_id: 4, loss is: [2.787499], acc is: [0.1]
+```
+将以上代码集成进PaddleVideo的示例pr参考 [#257](https://github.com/PaddlePaddle/PaddleVideo/pull/257)
+
+下面将分别对每个部分进行介绍,并介绍如何在该部分里添加新算法所需模块。
+
+
+
+## 1. 数据加载和处理
+
+数据加载和处理部分由`Dataset类`、`预处理Pipeline`和`Dataloader对象`组成。`Dataset类`是数据集类,其中的`__getitem__`方法定义了每一个视频样本数据的处理方式。`预处理Pipeline`定义了数据预处理步骤,包括视频的读取,解码以及数据增强等操作。`预处理定义的Pipeline`通常在`Dataset类`的`__getitem__`方法中被调用,以完成对视频预处理操作。这一部分在[paddlevideo/loader](../../../paddlevideo/loader)下。 各个文件及文件夹作用说明如下:
+
+```txt
+paddlevideo/loader/
+├── dataset
+│ ├── base.py # Dataset基类
+│ ├── frame.py # 处理Frame格式输入的Dataset类
+│ └── video.py # 处理Video格式输入的Dataset类
+├── pipelines
+│ ├── decode.py # 解码Pipeline,对视频进行解码
+│ ├── sample.py # 抽帧Pipeline,对视频抽帧的方式
+│ ├── augmentations.py # 数据增强Pipeline,包括缩放、裁剪、反转、正则化等
+...
+```
+
+PaddleVideo内置了针对不同数据集的Dataset相关模块,对于没有内置的模块可通过如下步骤添加:
+
+1. 在 [paddlevideo/loader/dataset](../../../paddlevideo/loader/dataset) 文件夹下新建文件,如my_dataset.py。
+2. 在 my_dataset.py 文件内添加相关代码,示例代码如下:
+
+```python
+@DATASETS.register() # 通过装饰器,自动进行注册
+class MyDataset:
+ def __init__(self, *args, **kwargs):
+ # your init code
+ pass
+
+ def load_file(self):
+ info = []
+ # load file list
+ return info
+
+ def prepare_train(self, idx):
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results) # train pipeline
+ return results['image'], results['labels'] #return your data item
+
+ def prepare_test(self, idx):
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results) # test pipeline
+ return results['image'], results['labels'] #return your data item
+```
+
+3. 在 [paddlevideo/loader/dataset/\_\_init\_\_.py](../../../paddlevideo/loader/dataset/__init__.py) 文件内导入添加的模块。
+
+最后在config文件中指定Dataset类名即可使用。如:
+
+```yaml
+# Define your Dataset name and args
+DATASET:
+ batch_size: 16 # single-card bacth size
+ num_workers: 4 # the number of subprocess on each GPU.
+ train:
+ format: "FrameDataset" # Dataset class
+ data_prefix: "data/k400/rawframes" # train data root path
+ file_path: "data/k400/train_frames.list" # train data list file path
+ suffix: 'img_{:05}.jpg'
+ valid:
+ format: "FrameDataset" # Dataset class
+ data_prefix: "data/k400/rawframes" # valid data root path
+ file_path: "data/k400/train_frames.list" # valid data list file path
+ suffix: 'img_{:05}.jpg'
+ test:
+ format: "FrameDataset" # Dataset class
+ data_prefix: "data/k400/rawframes" # test data root path
+ file_path: "data/k400/train_frames.list" # test data list file path
+ suffix: 'img_{:05}.jpg'
+```
+
+- 关于模块注册机制的详细说明,可以参考[配置系统设计](./config.md)
+
+PaddleVideo内置了大量视频编解码及图像变换相关模块,对于没有内置的模块可通过如下步骤添加:
+
+1. 在 [paddlevideo/loader/pipelines](../../../paddlevideo/loader/pipelines) 文件夹下新建文件,如my_pipeline.py。
+2. 在 my_pipeline.py 文件内添加相关代码,示例代码如下:
+
+```python
+@PIPELINES.register() # 通过装饰器,自动进行注册
+class MyPipeline:
+ def __init__(self, *args, **kwargs):
+ # your init code
+ pass
+
+ def __call__(self, results):
+ img = results['image']
+ label = results['label']
+ # your process code
+
+ results['image'] = img
+ results['label'] = label
+ return results
+```
+
+3. 在 [paddlevideo/loader/pipelines/\_\_init\_\_.py](../../../paddlevideo/loader/pipelines/__init__.py) 文件内导入添加的模块。
+
+数据处理的所有处理步骤由不同的模块顺序执行而成,在config文件中按照列表的形式组合并执行。如:
+
+```yaml
+# Define your pipeline name and args
+PIPELINE:
+ train:
+ decode:
+ name: "FrameDecoder" # Pipeline Class name
+ sample:
+ name: "Sampler" # Pipeline Class name
+ num_seg: 8 # init args
+ seg_len: 1 # init args
+ valid_mode: False # init args
+ transform:
+ - Scale: # Pipeline Class name
+ short_size: 256 # init args
+```
+
+
+
+## 2. 网络
+
+网络部分完成了网络的组网操作,PaddleVideo将网络划分为四个部分,这一部分在[paddlevideo/modeling](../../../paddlevideo/modeling)下。 进入网络的数据将按照顺序(backbones->heads->loss)依次通过其中三个部分。backbone用于特征提取,loss通过heads的[loss方法](https://github.com/PaddlePaddle/PaddleVideo/blob/5f7e22f406d11912eef511bafae28c594ccaa07e/paddlevideo/modeling/heads/base.py#L67)被调用。除了损失值,训练过程中如果想观察其它的精度指标(如top1, top5),也可以在head中定义相应的计算方法,参考[get_acc方法](https://github.com/PaddlePaddle/PaddleVideo/blob/5f7e22f406d11912eef511bafae28c594ccaa07e/paddlevideo/modeling/heads/base.py#L122),loss模块最终返回一个[loss字典](https://github.com/PaddlePaddle/PaddleVideo/blob/5f7e22f406d11912eef511bafae28c594ccaa07e/paddlevideo/modeling/heads/base.py#L81),存储loss值以及其它需要的精度指标。
+
+```bash
+├── framework # 组合backbones->heads->loss,定义从输入数据到输出loss的过程
+├── backbones # 网络的特征提取模块
+├── heads # 网络的输出模块
+└── losses # 网络的损失函数模块
+```
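+
+作为上面描述的补充,下面给出一个Head模块的极简示意(假设已存在`HEADS`注册器,接口细节以仓库中`heads/base.py`为准),展示`loss`方法返回loss字典、并顺带计算top1指标的写法:
+
+```python
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+@HEADS.register()  # 与backbone一样通过装饰器注册
+class MyHead(nn.Layer):
+    """极简Head示意:一层全连接 + 交叉熵loss,loss方法返回字典"""
+    def __init__(self, num_classes, in_channels):
+        super(MyHead, self).__init__()
+        self.fc = nn.Linear(in_channels, num_classes)
+
+    def forward(self, x):
+        return self.fc(x)
+
+    def loss(self, scores, labels):
+        losses = dict()
+        losses['loss'] = F.cross_entropy(scores, labels)
+        # 训练中若想观察精度指标,也可以一并放入loss字典
+        losses['top1'] = paddle.metric.accuracy(
+            scores, paddle.reshape(labels, [-1, 1]), k=1)
+        return losses
+```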
+
+PaddleVideo内置了TSN、TSM、SlowFast、ST-GCN、BMN等算法相关的常用模块,对于没有内置的模块可通过如下步骤添加,四个部分添加步骤一致,以backbones为例:
+
+1. 在 [paddlevideo/modeling/backbones](../../../paddlevideo/modeling/backbones) 文件夹下新建文件,如my_backbone.py。
+2. 在 my_backbone.py 文件内添加相关代码,示例代码如下:
+
+```python
+@BACKBONES.register() # 通过装饰器,自动进行注册
+class MyBackbone(nn.Layer):
+ def __init__(self, *args, **kwargs):
+ super(MyBackbone, self).__init__()
+ # your init code
+ self.conv = nn.xxxx
+
+ def forward(self, inputs):
+ # your network forward
+ y = self.conv(inputs)
+ return y
+```
+
+3. 在 [paddlevideo/modeling/backbones/\_\_init\_\_.py](../../../paddlevideo/modeling/backbones/__init__.py)文件内导入添加的模块。
+
+在完成网络的四部分模块添加之后,只需要配置文件中进行配置即可使用,如:
+
+```yaml
+MODEL:
+ framework: "Recognizer2D" # Framework class name
+ backbone:
+ name: "ResNetTweaksTSM" # Backbone class name
+ depth: 50 # init args
+ head:
+ name: "ppTSMHead" # Heads class name
+ num_classes: 400 # init args
+ loss:
+ name: "MyLoss" # Losses class name
+ scale: 0.1 # init args
+```
+
+
+
+## 3. 优化器
+
+优化器用于训练网络。优化器内部还包含了网络正则化和学习率衰减模块。 这一部分在[paddlevideo/solver/](../../../paddlevideo/solver/)下。 PaddleVideo内置了飞桨框架所有的[优化器模块](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.1/api/paddle/optimizer/Overview_cn.html#api)和[学习率衰减模块](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.1/api/paddle/optimizer/Overview_cn.html#about-lr)。只需要在配置文件中指定相应模块名称及参数即可方便的调用,示例:
+
+```yaml
+OPTIMIZER:
+ name: 'Momentum' # Optimizer class name
+ momentum: 0.9 # init args
+ learning_rate:
+ name: 'PiecewiseDecay' # Learning rate scheduler class name
+ boundaries: [10, 20] # init args
+ values: [0.001, 0.0001, 0.00001] # init args
+```
+
+对于没有内置的模块可通过如下步骤添加,以`learning rate`为例:
+
+1. 在 [paddlevideo/solver/custom_lr.py](../../../paddlevideo/solver/custom_lr.py) 文件内创建自己的学习率调整策略,示例代码如下:
+
+```python
+class MyLR(LRScheduler):
+    def __init__(self, learning_rate, *args, **kwargs):
+ self.learning_rate = learning_rate
+
+ def step(self, epoch):
+ # learning rate step scheduler
+ self.last_lr = xxx
+
+```
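+
+下面再给出一个可以直接运行的极简自定义学习率示例(仅作示意,非内置策略),它基于飞桨`LRScheduler`的标准接口,通过重写`get_lr`实现简单的阶梯衰减:
+
+```python
+from paddle.optimizer.lr import LRScheduler
+
+class MyStepLR(LRScheduler):
+    """每隔step_size个epoch将学习率乘以gamma的极简示例"""
+    def __init__(self, learning_rate, step_size=10, gamma=0.1,
+                 last_epoch=-1, verbose=False):
+        # 自定义属性需在调用父类__init__之前设置,因为父类会立即调用get_lr()
+        self.step_size = step_size
+        self.gamma = gamma
+        super(MyStepLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        # base_lr由父类保存,按当前epoch计算衰减后的学习率
+        return self.base_lr * (self.gamma ** (self.last_epoch // self.step_size))
+```
+
+添加后同样只需在配置文件的`learning_rate`字段中通过`name: 'MyStepLR'`及对应参数调用即可。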
+
+在学习率模块添加之后,只需要配置文件中进行配置即可使用,如:
+
+```yaml
+OPTIMIZER:
+ name: 'Momentum'
+ momentum: 0.9
+ learning_rate:
+ iter_step: True
+ name: 'CustomWarmupCosineDecay' # LR class name
+ max_epoch: 80 # init args
+ warmup_epochs: 10 # init args
+```
+
+
+
+## 4. 训练策略
+
+PaddleVideo内置了很多模型训练相关trick,包括标签平滑、数据增强Mix-up、PreciseBN等,只需要在配置文件中指定相应模块名称及参数即可方便的调用,示例:
+
+```yaml
+
+MODEL:
+ framework: "Recognizer2D"
+ backbone:
+ name: "ResNetTweaksTSM"
+ head:
+ name: "ppTSMHead"
+ ls_eps: 0.1 # ls_eps字段添加label smooth,并指定平滑系数
+
+MIX:
+ name: "Mixup" # 添加数据增强 Mix-up策略
+ alpha: 0.2 # 指定mix系数
+
+PRECISEBN: # 添加preciseBN策略
+ preciseBN_interval: 5 # 指定prciseBN间隔
+ num_iters_preciseBN: 200 # 指定preciseBN运行的batchs数量
+
+```
+
+训练相关的代码通过[paddlevideo/tasks/train.py](../../../paddlevideo/tasks/train.py)被组织起来,最终被[PaddleVideo/main.py](../../../../PaddleVideo/main.py)调用启动训练,单卡训练和多卡训练的启动方式略有不同。单卡训练启动方式如下:
+
+```bash
+export CUDA_VISIBLE_DEVICES=0 #指定使用的GPU显卡id
+python3.7 main.py --validate -c configs_path/your_config.yaml
+```
+- `--validate` 参数指定训练时运行validation
+- `-c` 参数指定配置文件路径
+- `-o`: 指定重写参数,例如: `-o DATASET.batch_size=16` 用于重写train时batch size大小
+
+多卡训练通过paddle.distributed.launch启动,方式如下:
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml
+```
+- `--gpus`参数指定使用的GPU显卡id
+- `--log_dir`参数指定日志保存目录
+多卡训练详细说明可以参考[单机多卡训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.1/guides/02_paddle2.0_develop/06_device_cn.html#danjiduokaxunlian)
+
+
+
+
+## 5. 指标评估
+
+训练完成后,需要进行指标评估,paddlevideo将指标评估模块与训练模块解耦,通过在[PaddleVideo/main.py](../../../../PaddleVideo/main.py)运行时指定`--test`参数调用test模块进行指标评估,评估方法的实现主体在[paddlevideo/metrics/](../../../paddlevideo/metrics)下。 PaddleVideo内置了Uniform、Dense等相关的指标评估模块,对于没有内置的模块可通过如下步骤添加:
+
+1. 在 [paddlevideo/metrics/](../../../paddlevideo/metrics/) 文件夹下新建文件,如my_metric.py。
+2. 在 my_metric.py 文件内添加相关代码,示例代码如下:
+
+```python
+@METRIC.register # 通过装饰器,自动进行注册
+class MyMetric(BaseMetric):
+ def __init__(self, *args, **kwargs):
+ self.top1 = []
+
+ def update(self, batch_id, data, outputs):
+ # update metrics during each iter
+ self.top1.append(xx)
+
+ def accumulate(self):
+ # accumulate metrics when finished all iters.
+ xxx
+ print(np.mean(np.array(self.top1)))
+
+```
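+
+在上面示意的基础上,下面给出一个更具体的top-1准确率Metric草图(假设性示例:假定`data[1]`为标签、`outputs`为分类logits,构造参数直接透传给`BaseMetric`):
+
+```python
+import numpy as np
+import paddle
+
+@METRIC.register  # 注册方式与上面的示意一致
+class MyTop1Metric(BaseMetric):
+    """统计整个测试集top-1准确率的极简示意"""
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.top1 = []
+
+    def update(self, batch_id, data, outputs):
+        labels = paddle.reshape(data[1], [-1, 1])
+        acc = paddle.metric.accuracy(input=outputs, label=labels, k=1)
+        self.top1.append(float(acc))
+
+    def accumulate(self):
+        # 所有batch跑完后汇总输出
+        print('[TEST] top1 = {:.4f}'.format(np.mean(np.array(self.top1))))
+```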
+
+3. 在 [paddlevideo/metrics/\_\_init\_\_.py](../../../paddlevideo/metrics/__init__.py)文件内导入添加的模块。
+
+在指标评估模块添加之后,只需要配置文件中进行配置即可使用,如:
+
+```yaml
+METRIC:
+ name: 'CenterCropMetric' # Metric class name
+```
+
+模型测试运行方法如下:
+```bash
+python3.7 main.py --test -c config_path/your_config.yaml -w weight_path/your_weight.pdparams
+```
+- `--test`参数指定运行测试模式
+- `-c`参数指定配置文件
+- `-w`参数指定训练好的权重保存路径
+
diff --git a/docs/zh-CN/contribute/config.md b/docs/zh-CN/contribute/config.md
new file mode 100644
index 0000000000000000000000000000000000000000..a38964c456402ce380b24b9fbddce355f8957153
--- /dev/null
+++ b/docs/zh-CN/contribute/config.md
@@ -0,0 +1,242 @@
+简体中文 | [English](../../en/tutorials/config.md)
+
+# 配置系统设计
+
+---
+
+本文档将介绍PaddleVideo利用依赖注入技术实现控制反转,来对整个系统进行解耦,通过可自定义调整的配置文件来控制整个系统从而实现模块化。最后,介绍了配置文件和PaddleVideo运行时参数的含义。
+
+
+## 设计原则
+
+首先,模型库中会有很多对一个类实例化的操作,例如:
+
+```python
+class TSM():
+ pass
+
+model = TSM(init_attributes)
+```
+当越来越多的实例被创建,这种调用方法和被调用方法间的联系陡然上升,增加了整个系统的耦合性,对启用新功能建设,或是对已用功能扩展产生不便。
+当然我们可以建立一个工厂模式来解决这个问题,根据配置文件的指定输入,来统一的做条件判断:
+
+```python
+if model_name == "TSM":
+ model = TSM()
+elif model_name == "TSN":
+ model = TSN()
+elif ...
+```
+或是像如下代码片段
+
+```python
+optimizer_cfg = dict(name="MOMENTUM", params=XXX)
+if optimizer_cfg["name"] == "MOMENTUM":
+    optimizer = MOMENTUM(optimizer_cfg["params"])
+elif ...:
+    ...
+```
+
+可是,越来越多的条件判断被创建出来,还是没有统一彻底的解决这个问题。
+而在其他系统中被广泛利用的 控制反转/依赖注入 技术,PaddleVideo将其利用起来进行系统解耦,并应用到诸如 LOSS METRICS BACKBONE HEAD等场景中。
+PaddleVideo实现了两个组件用于完成控制反转/依赖注入:
+
+- Register, 注册器,用于注册一个模块组件
+- Builder, 用于建立(实例化)一个已注册的组件
+
+1. Register 注册器
+
+PaddleVideo实现了类似setter和getter方法
+
+[source code](../../../paddlevideo/utils/registry.py)
+
+```python
+#excerpt from source code.
+class Registry():
+ def __init__(self, name):
+ self._name = name
+ self._obj_map = {}
+
+ #mapping name -> object
+ def register(self, obj, name):
+ self._obj_map[name] = obj
+
+ #get object
+ def get(self, name):
+ ret = self._obj_map.get(name)
+ return ret
+```
+
+用于建立字符串和对象的map,如下的代码将ResNet类注册到BACKBONE map中
+
+```python
+
+ BACKBONES = Registry('backbone')
+ class ResNet:
+ pass
+ BACKBONES.register(ResNet)
+```
+
+或是通过python3语法糖来装饰一个类
+
+```python
+ BACKBONES = Registry('backbone') #new a Register
+ @BACKBONES.register() #regist resnet as a backbone.
+ class ResNet:
+ pass
+```
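+
+上面两种用法(直接调用与装饰器调用)可以用如下极简实现同时支持。注意这只是一个示意性写法,并非仓库中registry.py的原文,实际实现请以源码为准:
+
+```python
+# 示意:register()不带参数时返回一个装饰器,带参数时直接注册
+class Registry(object):
+    def __init__(self, name):
+        self._name = name
+        self._obj_map = {}
+
+    def register(self, obj=None, name=None):
+        if obj is None:
+            # 装饰器用法:@BACKBONES.register()
+            def deco(cls):
+                self._obj_map[name or cls.__name__] = cls
+                return cls
+            return deco
+        # 直接调用用法:BACKBONES.register(ResNet)
+        self._obj_map[name or obj.__name__] = obj
+        return obj
+
+    def get(self, name):
+        return self._obj_map.get(name)
+```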
+
+2. Builder
+
+应用python的反射机制,调用get方法 得到一个已经注册的模块:
+```python
+ # Usage: To build a module.
+
+ backbone_name = "ResNet"
+ b = BACKBONES.get(backbone_name)()
+```
+
+至此,PaddleVideo注册了一个实例,不是在他的调用地方,而是在他的声明处,一个简单的IoC系统建立起来了。
+最后,PaddleVideo 通过这种方式建立了所有组件,并和配置文件参数一一对应。这里,一一对应的含义是:配置文件中的字段,`name` 代表着类的名字,其余字段对应着这个类的初始化参数。当然,除了`name` 我们也应用了别的名字来标记类名,例如:`framework`
+
+```yaml
+head:
+ name: "TSMHead" # class name
+ num_classes: 400 # TSMHead class init attributes
+ ...
+```
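+
+为了更直观地说明“`name`字段对应类名、其余字段对应初始化参数”,下面给出build过程的一个极简示意(假设性示例,实际的build实现以仓库源码为准):
+
+```python
+# 极简示意:根据配置字典构建(实例化)一个已注册的模块
+def build(cfg, registry):
+    cfg = dict(cfg)                 # 拷贝一份,避免修改原配置
+    name = cfg.pop('name')          # 类名,例如 "TSMHead"
+    obj_cls = registry.get(name)    # 从注册器中取出已注册的类
+    return obj_cls(**cfg)           # 其余字段作为 __init__ 参数
+
+# 用法示意
+# head = build({'name': 'TSMHead', 'num_classes': 400}, HEADS)
+```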
+
+---
+
+## 配置参数
+
+配置文件中,有多组字段,如下
+
+- **MODEL:** 代表模型结构
+- **DATASET:** 数据集和dataloader配置
+- **PIPELINE:** 数据处理流程配置字段
+- **OPTIMIZER:** 优化器字段
+
+和一些共有的参数, 如:
+
+- model_name
+- log_interval
+- epochs
+- resume_epoch
+- log_level
+...
+
+## 模块概览
+
+| Architectures | Frameworks | Components | Data Augmentation |
+| :-----------: | :--------: | :--------: | :---------------: |
+| Recognition<br>- TSN<br>- TSM<br>- SlowFast<br>- PP-TSM<br>- VideoTag<br>- AttentionLSTM | Recognizer1D<br>Recognizer2D<br>Recognizer3D<br>Localizer<br><br>Backbone<br>- resnet<br>- resnet_tsm<br>- resnet_tweaks_tsm<br>- bmn<br><br>Head<br>- pptsm_head<br>- tsm_head<br>- tsn_head<br>- bmn_head | Loss<br>- CrossEntropy<br>- BMNLoss<br><br>Metrics<br>- CenterCrop<br>- MultiCrop | Image<br>- Scale<br>- Random Flip<br>- Jitter Scale<br>- Crop<br>- MultiCrop<br>- Center Crop<br>- MultiScaleCrop<br>- Random Crop<br>- PackOutput |
+
+---
diff --git a/docs/zh-CN/contribute/how_to_contribute.md b/docs/zh-CN/contribute/how_to_contribute.md
new file mode 100644
index 0000000000000000000000000000000000000000..752d38a66498867f03045c60aa412bd8c9e22a79
--- /dev/null
+++ b/docs/zh-CN/contribute/how_to_contribute.md
@@ -0,0 +1,262 @@
+# PaddleVideo 社区贡献指南
+---
+
+## 目录
+
+- [如何贡献代码](#1)
+ - [1.1 PaddleVideo 分支说明](#1.1)
+ - [1.2 PaddleVideo 代码提交流程与规范](#1.2)
+ - [1.2.1 fork 和 clone 代码](#1.2.1)
+ - [1.2.2 和远程仓库建立连接](#1.2.2)
+ - [1.2.3 创建本地分支](#1.2.3)
+  - [1.2.4 使用 pre-commit 钩子](#1.2.4)
+ - [1.2.5 修改与提交代码](#1.2.5)
+ - [1.2.6 保持本地仓库最新](#1.2.6)
+ - [1.2.7 push到远程仓库](#1.2.7)
+ - [1.2.8 提交Pull Request](#1.2.8)
+ - [1.2.9 签署 CLA 协议和通过单元测试](#1.2.9)
+ - [1.2.10 删除分支](#1.2.10)
+ - [1.2.11 提交代码的一些约定](#1.2.11)
+- [总结](#2)
+- [参考文献](#3)
+
+
+## 一、如何贡献代码
+
+
+### 1.1 PaddleVideo 分支说明
+
+PaddleVideo 未来将维护 2 种分支,分别为:
+
+* release/x.x.x 系列分支:为稳定的发行版本分支,会适时打 tag 发布版本,适配 Paddle 的 release 版本。当前最新的分支为 release/2.2.0 分支。随着版本迭代, release/x.x.x 系列分支会越来越多,默认维护最新版本的 release 分支,其他的分支不再维护。
+* develop 分支:为开发分支,也是默认分支,适配 Paddle 的 develop 版本,主要用于开发新功能。如果有同学需要进行二次开发,请选择 develop 分支。为了保证 develop 分支能在需要的时候拉出 release/x.x.x 分支, develop 分支的代码只能使用 Paddle 最新 release 分支中有效的 api 。也就是说,如果 Paddle develop 分支中开发了新的 api,但尚未出现在 release 分支代码中,那么请不要在 PaddleVideo 中使用。除此之外,对于不涉及 api 的性能优化、参数调整、策略更新等,都可以正常进行开发。
+
+PaddleVideo 的历史分支,未来将不再维护。考虑到一些同学可能仍在使用,这些分支还会继续保留:
+
+* application 分支:这个分支主要存放应用案例相关代码,目前包括VideoTag和FootballAction,后续会将此分支代码迁移至develop分支,并随 release/x.x.x 发版。
+
+
+PaddleVideo 欢迎大家向 repo 中积极贡献代码,下面给出一些贡献代码的基本流程。
+
+
+### 1.2 PaddleVideo 代码提交流程与规范
+
+
+#### 1.2.1 fork 和 clone 代码
+
+* 跳转到 [PaddleVideo GitHub首页](https://github.com/PaddlePaddle/PaddleVideo) ,然后单击 Fork 按钮,生成自己目录下的仓库,比如 `https://github.com/USERNAME/PaddleVideo` 。
+
+
+
+

+
+
+
+* 将远程仓库 clone 到本地
+
+```shell
+# 拉取develop分支的代码
+git clone https://github.com/USERNAME/PaddleVideo.git
+cd PaddleVideo
+```
+
+clone 的地址可以从下面获取
+
+
+

+
+
+
+#### 1.2.2 和远程仓库建立连接
+
+首先通过 `git remote -v` 查看当前远程仓库的信息。
+
+```
+origin https://github.com/USERNAME/PaddleVideo.git (fetch)
+origin https://github.com/USERNAME/PaddleVideo.git (push)
+```
+
+上面的信息只包含了 clone 的远程仓库的信息,也就是自己用户名下的 PaddleVideo ,接下来我们创建一个原始 PaddleVideo 仓库的远程主机,命名为 upstream 。
+
+```shell
+git remote add upstream https://github.com/PaddlePaddle/PaddleVideo.git
+```
+
+使用 `git remote -v` 查看当前远程仓库的信息,输出如下,发现包括了 origin 和 upstream 2 个远程仓库。
+
+```
+origin https://github.com/USERNAME/PaddleVideo.git (fetch)
+origin https://github.com/USERNAME/PaddleVideo.git (push)
+upstream https://github.com/PaddlePaddle/PaddleVideo.git (fetch)
+upstream https://github.com/PaddlePaddle/PaddleVideo.git (push)
+```
+
+这主要是为了后续在提交 pull request (PR) 时,始终保持本地仓库最新。
+
+
+#### 1.2.3 创建本地分支
+
+可以基于当前分支创建新的本地分支,命令如下。
+
+```shell
+git checkout -b new_branch
+```
+
+也可以基于远程或者上游的分支创建新的分支,命令如下。
+
+```shell
+# 基于用户远程仓库(origin)的develop创建new_branch分支
+git checkout -b new_branch origin/develop
+# 基于上游远程仓库(upstream)的develop创建new_branch分支
+# 如果需要从upstream创建新的分支,需要首先使用git fetch upstream获取上游代码
+git checkout -b new_branch upstream/develop
+```
+
+最终会显示切换到新的分支,输出信息如下
+
+```
+Branch new_branch set up to track remote branch develop from upstream.
+Switched to a new branch 'new_branch'
+```
+
+
+#### 1.2.4 使用 pre-commit 钩子
+
+Paddle 开发人员使用 pre-commit 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码(C++,Python),在提交(commit)前自动检查一些基本事宜(如每个文件只有一个 EOL,Git 中不要添加大文件等)。
+
+pre-commit 测试是 Travis-CI 中单元测试的一部分,不满足钩子的 PR 不能被提交到 PaddleVideo ,首先安装并在当前目录运行它:
+
+```shell
+pip install pre-commit
+pre-commit install
+```
+
+* **注意**
+
+1. Paddle 使用 clang-format 来调整 C/C++ 源代码格式,请确保 `clang-format` 版本在 3.8 以上。
+2. 通过 `pip install pre-commit` 和 `conda install -c conda-forge pre-commit` 安装的 `yapf` 稍有不同的,PaddleVideo 开发人员使用的是 `pip install pre-commit` 。
+
+
+#### 1.2.5 修改与提交代码
+
+可以通过 `git status` 查看改动的文件。
+对 PaddleVideo 的 `README.md` 做了一些修改,希望提交上去。则可以通过以下步骤
+
+```shell
+git add README.md
+pre-commit
+```
+
+重复上述步骤,直到 pre-commit 格式检查不报错,如下所示。
+
+
+

+
+
+
+使用下面的命令完成提交。
+
+```shell
+git commit -m "your commit info"
+```
+
+
+#### 1.2.6 保持本地仓库最新
+
+获取 upstream 的最新代码并更新当前分支。这里的 upstream 来自于 1.2 节的`和远程仓库建立连接`部分。
+
+```shell
+git fetch upstream
+# 如果是希望提交到其他分支,则需要从upstream的其他分支pull代码,这里是develop
+git pull upstream develop
+```
+
+
+#### 1.2.7 push到远程仓库
+
+```shell
+git push origin new_branch
+```
+
+
+#### 1.2.8 提交Pull Request
+
+点击 new pull request,选择本地分支和目标分支,如下图所示。在 PR 的描述说明中,填写该 PR 所完成的功能。接下来等待 review ,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。
+
+
+

+
+
+
+#### 1.2.9 签署 CLA 协议和通过单元测试
+
+* 签署 CLA
+在首次向 PaddlePaddle 提交 Pull Request 时,您需要签署一次 CLA (Contributor License Agreement) 协议,以保证您的代码可以被合入,具体签署方式如下:
+
+1. 请您查看 PR 中的 Check 部分,找到 license/cla ,并点击右侧 detail ,进入 CLA 网站
+2. 点击 CLA 网站中的 `Sign in with GitHub to agree` , 点击完成后将会跳转回您的 Pull Request 页面
+
+
+#### 1.2.10 删除分支
+
+* 删除远程分支
+
+在 PR 被 merge 进主仓库后,我们可以在 PR 的页面删除远程仓库的分支。
+
+也可以使用 `git push origin :分支名` 删除远程分支,如:
+
+
+```shell
+git push origin :new_branch
+```
+
+* 删除本地分支
+
+```shell
+# 切换到develop分支,否则无法删除当前分支
+git checkout develop
+
+# 删除new_branch分支
+git branch -D new_branch
+```
+
+
+#### 1.2.11 提交代码的一些约定
+
+为了使官方维护人员在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定:
+
+1)请保证 Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,官方维护人员一般不做评审。
+
+2)提交 Pull Request前:
+
+请注意 commit 的数量。
+
+原因:如果仅仅修改一个文件但提交了十几个 commit ,每个 commit 只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个 commit 才能知道做了哪些修改,且不排除 commit 之间的修改存在相互覆盖的情况。
+
+建议:每次提交时,保持尽量少的 commit ,可以通过 `git commit --amend` 补充上次的 commit 。对已经 Push 到远程仓库的多个 commit ,可以参考 [squash commits after push](https://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed) 。
+
+请注意每个 commit 的名称:应能反映当前 commit 的内容,不能太随意。
+
+3)如果解决了某个 Issue 的问题,请在该 Pull Request 的第一个评论框中加上: `fix #issue_number` ,这样当该 Pull Request 被合并后,会自动关闭对应的 Issue 。关键词包括: close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved ,请选择合适的词汇。详细可参考 [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages) 。
+
+此外,在回复评审人意见时,请您遵守以下约定:
+
+1)官方维护人员的每一个 review 意见都希望得到回复,这样会更好地提升开源社区的贡献。
+
+- 对评审意见同意且按其修改完的,给个简单的 Done 即可;
+- 对评审意见不同意的,请给出您自己的反驳理由。
+
+2)如果评审意见比较多,
+
+- 请给出总体的修改情况。
+- 请采用 `start a review` 进行回复,而非直接回复的方式。原因是每个回复都会发送一封邮件,会造成邮件灾难。
+
+
+## 二、总结
+
+* 开源社区依赖于众多开发者与用户的贡献和反馈,在这里感谢与期待大家向 PaddleVideo 提出宝贵的意见与 Pull Request ,希望我们可以一起打造一个领先实用全面的视频理解代码仓库!
+
+
+## 三、参考文献
+1. [PaddlePaddle本地开发指南](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/10_contribution/local_dev_guide_cn.html)
+2. [向开源框架提交pr的过程](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh_CN/advanced_tutorials/how_to_contribute.md)
diff --git a/docs/zh-CN/dataset/AVA.md b/docs/zh-CN/dataset/AVA.md
new file mode 100644
index 0000000000000000000000000000000000000000..a30f5d8c3bbb30dc9865306edfa93d108535d919
--- /dev/null
+++ b/docs/zh-CN/dataset/AVA.md
@@ -0,0 +1,105 @@
+[English](../../en/dataset/AVA.md) | 简体中文
+# AVA数据准备
+此文档主要介绍AVA数据集的相关准备流程,包括视频文件下载、标注文件准备、视频文件切分、
+视频帧数据提取,以及拉取提名文件等。在开始之前,请把当前工作目录设定在 `$PaddleVideo/data/ava/shell`。
+
+---
+
+## 1. 视频数据下载
+想要获取更多有关AVA数据集的信息,您可以访问其官方网站[AVA](https://research.google.com/ava/index.html).
+至于数据集下载,您可以参考[AVA Download](https://github.com/cvdfoundation/ava-dataset),该Repo详细介绍了AVA视频数据的下载方法。
+我们也提供了视频文件的下载脚本:
+
+```shell
+bash download_videos.sh
+```
+
+为了方便用户,我们将视频文件以zip包的形式上传到百度网盘,您可以直接进行下载 [Link]() coming soon.
+
+
+**注意: 您自己下载的视频文件应当被放置在`data/ava/videos`文件夹下**
+
+---
+## 2.准备标注文件
+
+接下来,您可以使用下面的脚本来准备标注文件
+
+```shell
+bash download_annotations.sh
+```
+
+该脚本会默认下载`ava_v2.1.zip`,如果您想下载`v2.2`,您可以使用:
+
+```shell
+VERSION=2.2 bash download_annotations.sh
+```
+
+**注意:事实上,我们也同样在百度网盘中提供了该标注文件,所以您无需自己下载**
+
+---
+## 3. 切分视频文件
+
+以帧率30fps,切分视频文件从第15分钟到第30分钟
+
+```shell
+bash cut_videos.sh
+```
+---
+
+## 4. 提取RGB帧
+
+您可以通过以下的脚本使用`ffmpeg`来提取RGB帧.
+
+```shell
+bash extract_rgb_frames.sh
+```
+
+---
+
+## 5.拉取提名文件
+
+这个脚本来自于Facebook研究院的[Long-Term Feature Banks](https://github.com/facebookresearch/video-long-term-feature-banks)。
+您可以使用如下的脚本来获取预计算的提名文件列表。
+
+```shell
+bash fetch_ava_proposals.sh
+```
+
+---
+## 6.目录结构
+
+经过整个AVA数据处理流程后,您可以获得AVA的帧文件,视频文件和标注文件
+
+整个项目(AVA)的目录结构如下所示:
+
+```
+PaddleVideo
+├── configs
+├── paddlevideo
+├── docs
+├── tools
+├── data
+│ ├── ava
+│ │ ├── annotations
+│ │ | ├── ava_dense_proposals_train.FAIR.recall_93.9.pkl
+│ │ | ├── ava_dense_proposals_val.FAIR.recall_93.9.pkl
+│ │ | ├── ava_dense_proposals_test.FAIR.recall_93.9.pkl
+│ │ | ├── ava_train_v2.1.csv
+│ │ | ├── ava_val_v2.1.csv
+│ │ | ├── ava_train_excluded_timestamps_v2.1.csv
+│ │ | ├── ava_val_excluded_timestamps_v2.1.csv
+│ │ | ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt
+│ │ ├── videos
+│ │ │ ├── 053oq2xB3oU.mkv
+│ │ │ ├── 0f39OWEqJ24.mp4
+│ │ │ ├── ...
+│ │ ├── videos_15min
+│ │ │ ├── 053oq2xB3oU.mkv
+│ │ │ ├── 0f39OWEqJ24.mp4
+│ │ │ ├── ...
+│ │ ├── rawframes
+│ │ │ ├── 053oq2xB3oU
+| │ │ │ ├── img_00001.jpg
+| │ │ │ ├── img_00002.jpg
+| │ │ │ ├── ...
+```
\ No newline at end of file
diff --git a/docs/zh-CN/dataset/ActivityNet.md b/docs/zh-CN/dataset/ActivityNet.md
new file mode 100644
index 0000000000000000000000000000000000000000..68a7fd90bd439dfe975b06803b7edf7215669ba0
--- /dev/null
+++ b/docs/zh-CN/dataset/ActivityNet.md
@@ -0,0 +1,80 @@
+[English](../../en/dataset/ActivityNet.md) | 简体中文
+
+# ActivityNet数据准备
+
+- [数据集介绍](#数据集介绍)
+- [数据下载与处理](#数据下载与处理)
+
+## 数据集介绍
+
+ActivityNet是一个用于大规模视频理解任务的数据集,可用于动作定位、动作识别等任务。
+
+
+## 数据下载与处理
+1. BMN模型使用的是处理过后的ActivityNet 1.3数据集,有如下两种使用方法:
+ - 使用我们处理好的ActivityNet 1.3数据集(压缩包约5.5G),每一个视频有对应的动作标签、持续区间、持续帧数、持续秒数等信息
+ 使用以下命令下载:
+ ```bash
+ wget https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz # 下载处理好的视频特征数据
+ wget https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json # 下载处理好的标签数据
+ ```
+ 或者点击以下超链接下载:
+
+ [视频特征数据](https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz)
+      [标签数据](https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json)
+
+    然后解压下载好的视频特征压缩包
+ ```bash
+ tar -xf bmn_feat.tar.gz
+ ```
+
+ - 自行提取特征
+
+ 首先参考[下载说明](https://github.com/activitynet/ActivityNet/tree/master/Crawler)下载原始数据集。在训练此模型时,需要先使用TSN对源文件抽取特征。可以[自行抽取](https://github.com/yjxiong/temporal-segment-networks)视频帧及光流信息,预训练好的TSN模型可从[此处](https://github.com/yjxiong/anet2016-cuhk)下载。
+
+
+ `activitynet_1.3_annotations.json`标签文件内的信息如下所示:
+ ```json
+ {
+ "v_QOlSCBRmfWY": {
+ "duration_second": 82.73,
+ "subset": "training",
+ "duration_frame": 2067,
+ "annotations": [{
+ "segment": [6.195294851794072, 77.73085420904837],
+ "label": "Ballet"
+ }],
+ "feature_frame": 2064
+ },
+ "v_ehGHCYKzyZ8": {
+ "duration_second": 61.718999999999994,
+ "subset": "training",
+ "duration_frame": 1822,
+ "annotations": [{
+ "segment": [43.95990729267573, 45.401932082395355],
+ "label": "Doing crunches"
+ }],
+ "feature_frame": 1808
+ },
+ ...,
+ ...
+ }
+ ```
+ 最终应该能得到`19228`个视频特征npy文件,对应`activitynet_1.3_annotations.json`文件中的`19228`个标签信息。
+
+2. 新建`data/bmn_data`文件夹,再将下载完毕后将视频特征数据解压出来放入该文件夹下,最终应该组织成以下形式:
+ ```
+ PaddleVideo
+ ├── data
+ │ ├── bmn_data
+ │ │ ├── fix_feat_100
+ │ │ │ ├── v___c8enCfzqw.npy
+ │ │ │ ├── v___dXUJsj3yo.npy
+ │ │ │ ├── ...
+ │ │ │
+ │ │ └── activitynet_1.3_annotations.json
+ ```
+
+3. 最后修改配置文件configs/localization/bmn.yaml中的`feat_path`字段指定特征文件夹路径,通过`file_path`字段指定标签文件路径。
+
+
diff --git a/docs/zh-CN/dataset/Oxford_RobotCar.md b/docs/zh-CN/dataset/Oxford_RobotCar.md
new file mode 100644
index 0000000000000000000000000000000000000000..ee12ff8856e552b5c52d4c79b8bec51aa3b8cb16
--- /dev/null
+++ b/docs/zh-CN/dataset/Oxford_RobotCar.md
@@ -0,0 +1,152 @@
+[English](../../en/dataset/Oxford_RobotCar.md) | 简体中文
+
+# Oxford-RobotCar-for-ADDS数据准备
+
+- [数据集简介](#数据集简介)
+- [数据集下载](#数据集下载)
+- [数据预处理](#数据预处理)
+- [1. 图像去畸变](#1-图像去畸变)
+- [2. 动态帧筛选](#2-动态帧筛选)
+- [3. 图像重命名](#3-图像重命名)
+- [4. 白天-伪夜晚图像对准备](#4-白天-伪夜晚图像对准备)
+
+
+## 数据集简介
+
+[Oxford RobotCar Dataset](https://robotcar-dataset.robots.ox.ac.uk/) 是一个大规模自动驾驶数据集, 包含了大量不同自动驾驶场景下的数据.
+
+这里用到的是从原始的Oxford RobotCar数据集中筛选出一部分用于白天-夜晚深度估计的数据, 即Oxford-RobotCar-for-ADDS.
+
+如果您要使用Oxford-RobotCar-for-ADDS, 请引用以下论文:
+```latex
+@article{maddern20171,
+ title={1 year, 1000 km: The oxford robotcar dataset},
+ author={Maddern, Will and Pascoe, Geoffrey and Linegar, Chris and Newman, Paul},
+ journal={The International Journal of Robotics Research},
+ volume={36},
+ number={1},
+ pages={3--15},
+ year={2017},
+ publisher={SAGE Publications Sage UK: London, England}
+}
+```
+```latex
+@inproceedings{liu2021self,
+ title={Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation},
+ author={Liu, Lina and Song, Xibin and Wang, Mengmeng and Liu, Yong and Zhang, Liangjun},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={12737--12746},
+ year={2021}
+}
+```
+
+## 数据集下载
+
+1. 下载序列[2014-12-09](https://robotcar-dataset.robots.ox.ac.uk/datasets/2014-12-09-13-21-02/) 中Bumblebee XB3的左目图像作为白天场景的训练集, 下载好的图像解压在同一文件夹下.
+2. 下载序列[2014-12-16](https://robotcar-dataset.robots.ox.ac.uk/datasets/2014-12-16-18-44-24/) 中Bumblebee XB3的左目图像作为夜晚场景的训练集, 下载好的图像解压在同一文件夹下.
+3. 验证集的图像和深度真值从原始数据集中筛选, 下载地址如下:
+ ```shell
+ https://videotag.bj.bcebos.com/Data/ADDS/1209_all_files.txt
+ https://videotag.bj.bcebos.com/Data/ADDS/1216_all_files.txt
+ https://videotag.bj.bcebos.com/Data/ADDS/day_train_all.7z.001
+ https://videotag.bj.bcebos.com/Data/ADDS/day_train_all.7z.002
+ https://videotag.bj.bcebos.com/Data/ADDS/day_train_all_fake_night.7z.001
+ https://videotag.bj.bcebos.com/Data/ADDS/day_train_all_fake_night.7z.002
+ https://videotag.bj.bcebos.com/Data/ADDS/day_val_451.7z
+ https://videotag.bj.bcebos.com/Data/ADDS/day_val_451_gt.7z
+ https://videotag.bj.bcebos.com/Data/ADDS/night_val_411.7z
+ https://videotag.bj.bcebos.com/Data/ADDS/night_val_411_gt.7z
+ ```
+ 附原始未处理数据下载地址:
+ ```shell
+ # 白天数据
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.001
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.002
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.003
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.004
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.005
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.006
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.007
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.008
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.009
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.010
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.011
+ https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.012
+
+ # 夜晚数据
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.001
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.002
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.003
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.004
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.005
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.006
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.007
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.008
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.009
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.010
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.011
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.012
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.013
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.014
+ https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.015
+ ```
+## 数据预处理
+
+#### 1. 图像去畸变
+
+使用官方提供的工具箱[robotcar-dataset-sdk](https://github.com/ori-mrg/robotcar-dataset-sdk/tree/master/python) 对序列2014-12-09和2014-12-16的图像完成去畸变.
+
+
+#### 2. 动态帧筛选
+
+由于我们使用自监督的方法, 需要筛选出动态帧用于训练. 筛选原则为帧间位姿变化大于0.1m则认为是动态帧. 经过筛选后获得训练集的序列.
+
+
+#### 3. 图像重命名
+
+将原始图像时间戳重命名为连续数字序列. 白天场景对应关系见[1209_all_files.txt](https://videotag.bj.bcebos.com/Data/ADDS/1209_all_files.txt), 夜晚场景对应关系见[1216_all_files.txt](https://videotag.bj.bcebos.com/Data/ADDS/1216_all_files.txt). 重命名后的数据格式如下:
+```
+├── oxford_processing
+ ├── day_train_all #白天训练图像文件夹 (day_train_all.7z.001 ~ day_train_all.7z.012)
+    ├── night_train_all    #夜晚训练图像文件夹 (night_train_all.7z.001 ~ night_train_all.7z.015)
+ ├── day_val_451 #白天验证图像文件夹 (day_val_451.7z)
+ ├── day_val_451_gt #白天验证深度真值文件夹 (day_val_451_gt.7z)
+ ├── night_val_411 #夜晚验证图像文件夹 (night_val_411.7z)
+ └── night_val_411_gt #夜晚验证深度真值文件夹 (night_val_411_gt.7z)
+```
+
+其中用于训练和验证的序列如下:
+
+```
+splits/oxford_day/train_files.txt # 白天训练序列
+splits/oxford_night/train_files.txt # 夜晚训练序列
+splits/oxford_day_451/val_files.txt # 白天验证序列
+splits/oxford_night_411/val_files.txt # 夜晚验证序列
+```
+训练所用路径文本的下载地址:
+```shell
+https://videotag.bj.bcebos.com/Data/ADDS/train_files.txt
+https://videotag.bj.bcebos.com/Data/ADDS/val_day_files.txt
+https://videotag.bj.bcebos.com/Data/ADDS/val_night_files.txt
+```
+
+#### 4. 白天-伪夜晚图像对准备
+
+为了用我们的框架提取出白天和夜晚图像的共有信息,我们用[CycleGAN](https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix)生成白天-伪夜晚图像对,其中伪夜晚图像是CycleGAN生成的、与白天图像对应的夜晚风格图像。所有图像都缩放为192x640,夜晚图像用直方图均衡化增强,CycleGAN训练75个epoch,最终得到Oxford-RobotCar-for-ADDS。生成的白天-伪夜晚图像对数据格式如下,可直接用于ADDS-DepthNet的训练和验证:
+```
+data
+└── oxford
+    ├── splits
+    │   ├── train_files.txt
+    │   ├── val_day_files.txt
+    │   └── val_night_files.txt
+    └── oxford_processing_forADDS
+        ├── day_train_all/ #白天训练图像文件夹 (解压自day_train_all.7z.001 ~ day_train_all.7z.002)
+        ├── night_train_all/ #夜晚训练图像文件夹 (解压自night_train_all.7z.001 ~ night_train_all.7z.002)
+        ├── day_val_451/ #白天验证图像文件夹 (解压自day_val_451.7z)
+        ├── day_val_451_gt/ #白天验证深度真值文件夹 (解压自day_val_451_gt.7z)
+        ├── night_val_411/ #夜晚验证图像文件夹 (解压自night_val_411.7z)
+        └── night_val_411_gt/ #夜晚验证深度真值文件夹 (解压自night_val_411_gt.7z)
+```
+
+其中用于训练和验证的序列与前述保持一致。
diff --git a/docs/zh-CN/dataset/SegmentationDataset.md b/docs/zh-CN/dataset/SegmentationDataset.md
new file mode 100644
index 0000000000000000000000000000000000000000..c43b5910f302bedf0e7d8e772feafa8bee8d5906
--- /dev/null
+++ b/docs/zh-CN/dataset/SegmentationDataset.md
@@ -0,0 +1,35 @@
+简体中文 | [English](../../en/dataset/SegmentationDataset.md)
+
+# 视频动作分割模型数据使用说明
+
+视频动作分割模型使用breakfast、50salads和gtea数据集,训练和测试时以预训练模型提取好的特征作为输入,特征文件可以从MS-TCN官方代码库提供的链接下载:[feat](https://zenodo.org/record/3625992#.Xiv9jGhKhPY)
+
+- 数据集文件树形式
+```txt
+─── gtea
+ ├── features
+ │ ├── S1_Cheese_C1.npy
+ │ ├── S1_Coffee_C1.npy
+ │ ├── S1_CofHoney_C1.npy
+ │ └── ...
+ ├── groundTruth
+ │ ├── S1_Cheese_C1.txt
+ │ ├── S1_Coffee_C1.txt
+ │ ├── S1_CofHoney_C1.txt
+ │ └── ...
+ ├── splits
+ │ ├── test.split1.bundle
+ │ ├── test.split2.bundle
+ │ ├── test.split3.bundle
+ │ └── ...
+ └── mapping.txt
+```
+
+- 数据集存放文件树形式
+```txt
+─── data
+ ├── 50salads
+ ├── breakfast
+ ├── gtea
+ └── ...
+```
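+
+数据准备完成后,可用如下脚本粗略检查某个样本的特征与逐帧标注是否对应(仅为示意,路径与特征的具体形状以实际下载的数据为准):
+
+```python
+import numpy as np
+
+feat = np.load("data/gtea/features/S1_Cheese_C1.npy")
+with open("data/gtea/groundTruth/S1_Cheese_C1.txt") as f:
+    frame_labels = [line.strip() for line in f if line.strip()]
+
+print(feat.shape)         # 预训练模型提取的特征,一般为(特征维度, 帧数)
+print(len(frame_labels))  # 逐帧动作标签数,应与特征的帧数对应
+```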
diff --git a/docs/zh-CN/dataset/fsd.md b/docs/zh-CN/dataset/fsd.md
new file mode 100644
index 0000000000000000000000000000000000000000..670cb694f3dafefe0dc4b16705cd1f817e95cfd9
--- /dev/null
+++ b/docs/zh-CN/dataset/fsd.md
@@ -0,0 +1,56 @@
+[English](../../en/dataset/fsd.md) | 简体中文
+
+# 基于飞桨实现花样滑冰选手骨骼点动作识别大赛数据准备
+
+- [数据集介绍](#数据集介绍)
+- [数据下载](#数据下载)
+
+---
+
+
+## 数据集介绍
+
+基于飞桨实现花样滑冰选手骨骼点动作识别大赛数据集旨在通过花样滑冰研究人体的运动。在花样滑冰运动中,人体姿态和运动轨迹相较于其他运动呈现复杂性强、类别多的特点,有助于细粒度图深度学习新模型、新任务的研究。
+
+
+在FSD-10中,所有的视频素材采集自2017到2018年的花样滑冰锦标赛。源视频的帧率被统一标准化至每秒30帧,图像大小统一为1080*720,以保证数据集的相对一致性。之后我们通过2D姿态估计算法OpenPose对视频进行逐帧骨骼点提取,最后以.npy格式保存数据集。
+
+训练数据集与测试数据集的目录结构如下所示:
+
+```txt
+train_data.npy # 2922
+train_label.npy # 2922
+test_A_data.npy # 628
+test_B_data.npy # 634
+```
+
+其中train_label.npy通过np.load()读取后会得到一个一维张量,每一个元素为一个取值在0~29之间的整型数值,代表动作的标签;data.npy文件通过np.load()读取后,会得到一个形状为N×C×T×V×M的五维张量,每个维度的具体含义如下:
+
+| 维度符号 | 维度值大小 | 维度含义 | 补充说明 |
+| :---- | :----: | :----: | :---- |
+| N | 样本数 | 代表N个样本 | 无 |
+| C | 3 | 分别代表每个关节点的x, y坐标和置信度 | 每个x,y均被放缩至-1到1之间 |
+| T | 1500 | 代表动作的持续时间长度,共有1500帧 | 有的动作的实际长度可能不足1500,例如可能只有500的有效帧数,我们在其后重复补充0直到1500帧,来保证T维度的统一性 |
+| V | 25 | 代表25个关节点 | 具体关节点的含义可看下方的骨架示例图 |
+| M | 1 | 代表1个运动员个数 | 无 |
+
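+下面给出一个简单的读取示例(仅为示意,假设两个npy文件已下载到当前目录),用于检查数据维度是否与上表一致:
+
+```python
+import numpy as np
+
+data = np.load("train_data.npy")    # 形状为(N, C, T, V, M)的五维张量
+label = np.load("train_label.npy")  # 形状为(N,)的一维张量,取值范围0~29
+
+print(data.shape)                   # 预期类似 (2922, 3, 1500, 25, 1)
+print(label.shape, label.min(), label.max())
+```
+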
+骨架示例图:
+
+
+
+

+
+
+
+
+## 数据下载
+
+在[2021 CCF BDCI 基于飞桨实现花样滑冰选手骨骼点动作识别比赛](https://aistudio.baidu.com/aistudio/competition/detail/115/0/introduction)主页报名后即可获取下载链接
+
+| 数据集 | Data | Label |
+| :---- | :----: | :----: |
+| 训练集 | [train_data.npy](https://videotag.bj.bcebos.com/Data/FSD_train_data.npy) | [train_label.npy](https://videotag.bj.bcebos.com/Data/FSD_train_label.npy) |
+| 测试集A | coming soon | coming soon |
+
+
+> 由于版权原因,RGB数据暂不开放。
diff --git a/docs/zh-CN/dataset/howto100m.md b/docs/zh-CN/dataset/howto100m.md
new file mode 100644
index 0000000000000000000000000000000000000000..63711a4c8b61d7482a28b8324dd7319abe887f64
--- /dev/null
+++ b/docs/zh-CN/dataset/howto100m.md
@@ -0,0 +1,31 @@
+# HowTo100M 数据准备
+
+HowTo100M 数据相关准备,包括HowTo100M数据下载和数据下载后文件组织结构。
+
+## 数据下载
+
+HowTo100M 从1.2M Youtube 教学视频中切分出136M包含字幕的视频片段,涵盖23k活动类型,包括做饭、手工制作、日常护理、园艺、健身等等,数据集约10T大小。
+
+因为完整数据集体积过大,这里我们只提供少量数据,供大家跑通训练前向。如需下载全量数据,请参考:[HowTo100M](https://www.di.ens.fr/willow/research/howto100m/)
+
+为了方便使用,我们提供的数据版本已预先提取好HowTo100M数据集中的物体特征和动作特征。
+
+首先,请确保在 `data/howto100m` 目录下,输入如下命令,下载数据集。
+
+```bash
+bash download_features.sh
+```
+
+下载完成后,data目录下文件组织形式如下:
+
+```
+├── data
+| ├── howto100m
+| │ ├── actbert_train_data.npy
+| │ ├── caption_train.json
+| | ├── caption_val.json
+
+```
+
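+下载完成后,可用如下脚本粗略检查数据是否可以正常读取(仅为示意,具体字段组织以实际文件为准):
+
+```python
+import json
+
+import numpy as np
+
+features = np.load("data/howto100m/actbert_train_data.npy", allow_pickle=True)
+with open("data/howto100m/caption_train.json") as f:
+    captions = json.load(f)
+
+print(type(features), getattr(features, "shape", None))
+print(type(captions), len(captions))
+```
+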
+## 参考论文
+- Antoine Miech, Dimitri Zhukov, Jean-Baptiste Alayrac, Makarand Tapaswi, Ivan Laptev, and Josef Sivic. Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. In ICCV, 2019.
diff --git a/docs/zh-CN/dataset/k400.md b/docs/zh-CN/dataset/k400.md
new file mode 100644
index 0000000000000000000000000000000000000000..7eeceacb2dc590a68520d18008d025d057900ab4
--- /dev/null
+++ b/docs/zh-CN/dataset/k400.md
@@ -0,0 +1,77 @@
+[English](../../en/dataset/k400.md) | 简体中文
+
+# Kinetics-400 数据准备
+
+- [数据集介绍](#数据集介绍)
+- [下载video数据](#下载video数据)
+- [提取frames数据](#提取frames数据)
+
+---
+
+
+## 数据集介绍
+
+Kinetics-400是视频领域benchmark常用数据集,详细介绍可以参考其官方网站[Kinetics](https://deepmind.com/research/open-source/kinetics)。下载方式可参考官方地址[ActivityNet](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics),使用其提供的下载脚本下载数据集。
+
+## 下载video数据
+
+考虑到K400数据集下载困难的问题,我们提供了两种下载方式: (1) 百度网盘下载 (2) 脚本下载
+
+### 百度网盘下载
+
+网盘链接:https://pan.baidu.com/s/1S_CGBjWOUAuxL_cCX5kMPg
+提取码:ppvi
+
+### 脚本下载
+
+- 下载训练集链接列表文件[train_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list)和验证集链接列表文件[val_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list)。
+
+编写下载脚本`download.sh`如下:
+```bash
+file=$1
+
+while read line
+do
+ wget "$line"
+done <$file
+```
+
+下载训练集命令:
+```bash
+bash download.sh train_link.list
+```
+
+下载验证集命令:
+```bash
+bash download.sh val_link.list
+```
+
+---
+
+|类别 | 数据条数 | list文件 |
+| :------: | :----------: | :----: |
+|训练集 | 234619 | [train.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list)|
+|验证集 | 19761 | [val.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list)|
+
+- 下载后自行解压,并将数据路径添加到相应的list文件中。
+
+- 由于部分视频原始链接失效,数据有部分缺失,全部文件大约需要135G的存储空间,PaddleVideo使用的也是这份数据。可使用下方的脚本粗略核对本地已下载的视频数量。
+
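+下面是一个核对本地视频数量的示意脚本(假设视频已解压到`./videos`目录,且`train.list`位于当前目录):
+
+```python
+import os
+
+video_dir = "./videos"
+# 递归统计本地已下载的mp4文件数量
+num_local = sum(
+    1 for _, _, files in os.walk(video_dir) for name in files if name.endswith(".mp4")
+)
+
+with open("train.list") as f:
+    num_expected = sum(1 for _ in f)
+
+# 由于部分原始链接失效,本地数量可能略少于list中的条目数
+print(f"local: {num_local}, expected: {num_expected}")
+```
+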
+> 此份数据仅限于学术研究,若对您有帮助,欢迎给[项目](https://github.com/PaddlePaddle/PaddleVideo)star~
+
+
+## 提取frames数据
+为了加速网络的训练过程,我们首先对视频文件(K400视频文件为mp4格式)提取帧 (frames)。相对于直接通过视频文件进行网络训练的方式,frames的方式能够极大加快网络训练的速度。
+
+输入如下命令,即可提取K400视频文件的frames
+
+```bash
+python extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4
+```
+
+视频文件frames提取完成后,会存储在指定的`./rawframes`路径下,大小约为2T。
+
+|类别 | 数据条数 | list文件 |
+| :------: | :----------: | :----: |
+|训练集 | 234619 | [train_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list)|
+|验证集 | 19761 | [val_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list)|
diff --git a/docs/zh-CN/dataset/msrvtt.md b/docs/zh-CN/dataset/msrvtt.md
new file mode 100644
index 0000000000000000000000000000000000000000..b2cfdedc305cf2fe4006ad54ca9b13e7d548b123
--- /dev/null
+++ b/docs/zh-CN/dataset/msrvtt.md
@@ -0,0 +1,72 @@
+[English](../../en/dataset/msrvtt.md) | 简体中文
+
+# MSR-VTT 数据准备
+
+- [数据集介绍](#数据集介绍)
+- [T2VLAD模型数据准备](#T2VLAD模型数据准备)
+- [ActBERT模型数据准备](#ActBERT模型数据准备)
+- [参考文献](#参考文献)
+
+## 数据集介绍
+
+MSR-VTT(Microsoft Research Video to Text) 是一个包含视频及字幕的大规模数据集,由来自20个类别的10,000个视频片段组成,每个视频片段由20个英文句子注释。我们使用9000个视频片段用于训练,1000个用于测试。更多详细信息可以参考网站:[MSRVTT](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/)
+
+## T2VLAD模型数据准备
+[T2VLAD模型文档](../../../applications/T2VLAD/README.md)
+
+为了方便使用,我们提供的数据版本已对MSR-VTT数据集中的视频进行了特征提取。
+
+首先,请确保在 `applications/T2VLAD/data` 目录下,输入如下命令,下载数据集。
+
+```bash
+bash download_features.sh
+```
+
+下载完成后,data目录下文件组织形式如下:
+
+```
+├── data
+| ├── MSRVTT
+| │ ├── raw-captions.pkl
+| │ ├── train_list_jsfusion.txt
+| │ ├── val_list_jsfusion.txt
+| │ ├── aggregated_text_feats
+| | | ├── w2v_MSRVTT_openAIGPT.pickle
+| | ├── mmt_feats
+| │ │ ├── features.audio.pkl
+| │ │ ├── features.face_agg.pkl
+| │ │ ├── features.flos_agg.pkl
+| │ │ ├── features.ocr.pkl
+| │ │ ├── features.rgb_agg.pkl
+| │ │ ├── features.s3d.pkl
+| │ │ ├── features.scene.pkl
+| │ │ ├── features.speech.pkl
+
+```
+
+## ActBERT模型数据准备
+[ActBERT模型文档](../model_zoo/multimodal/actbert.md)
+
+下载数据特征:
+```
+wget https://videotag.bj.bcebos.com/Data/ActBERT/msrvtt_test.lmdb.tar
+wget https://videotag.bj.bcebos.com/Data/ActBERT/MSRVTT_JSFUSION_test.csv
+```
+
+将下载得到的`msrvtt_test.lmdb.tar`解压:
+```
+tar -zxvf msrvtt_test.lmdb.tar
+```
+
+最终得到的文件组织形式如下:
+```
+├── data
+| ├── MSR-VTT
+| │ ├── MSRVTT_JSFUSION_test.csv
+| │ ├── msrvtt_test.lmdb
+| │ ├── data.mdb
+| │ ├── lock.mdb
+```
+
+## 参考论文
+- Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. Multi-modal transformer for video retrieval. In ECCV, 2020.
diff --git a/docs/zh-CN/dataset/ntu-rgbd.md b/docs/zh-CN/dataset/ntu-rgbd.md
new file mode 100644
index 0000000000000000000000000000000000000000..c0910fa48ca07f7b4cd5f05e92932b1276e60356
--- /dev/null
+++ b/docs/zh-CN/dataset/ntu-rgbd.md
@@ -0,0 +1,158 @@
+[English](../../en/dataset/ntu-rgbd.md) | 简体中文
+
+# NTU-RGB+D 数据准备
+
+- [数据集介绍](#数据集介绍)
+- [ST-GCN数据集准备](#ST-GCN数据集准备)
+- [CTR-GCN数据集准备](#CTR-GCN数据集准备)
+
+---
+
+
+## 数据集介绍
+
+NTU-RGB+D是基于骨骼的行为识别数据集,包含60个种类的动作,56880个样本,详细介绍可以参考其官方网站[NTU-RGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/)。该数据集在划分训练集和测试集时采用了两种不同的划分标准。Cross-Subject按照人物ID划分,训练集40320个样本,测试集16560个样本。Cross-View按照相机划分,相机2和3采集的样本为训练集,包含37930个样本,相机1采集的样本为测试集,包含18960个样本。
+
+
+## ST-GCN数据集准备
+
+以下是ST-GCN模型的数据集准备流程介绍。
+
+### 数据集下载
+
+我们提供处理好的数据集下载地址[NTU-RGB-D.tar](https://videotag.bj.bcebos.com/Data/NTU-RGB-D.tar)(~3.1G),下载后通过命令```tar -zxvf NTU-RGB-D.tar ```进行解压,得到的数据目录如下:
+
+```txt
+─── NTU-RGB-D
+ ├── xsub
+ │ ├── train_data.npy
+ │ ├── train_label.pkl
+ │ ├── val_data.npy
+ │ └── val_label.pkl
+ └── xview
+ ├── train_data.npy
+ ├── train_label.pkl
+ ├── val_data.npy
+ └── val_label.pkl
+```
+
+> 数据来源于[st-gcn](https://github.com/open-mmlab/mmskeleton/blob/master/doc/SKELETON_DATA.md)。
+
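+解压完成后,可以用如下Python代码粗略检查数据规模(仅为示意,标签文件的具体组织方式以st-gcn官方处理脚本为准):
+
+```python
+import pickle
+
+import numpy as np
+
+# 以xsub划分为例,检查数据规模(路径以实际解压位置为准)
+data = np.load("NTU-RGB-D/xsub/train_data.npy", mmap_mode="r")
+with open("NTU-RGB-D/xsub/train_label.pkl", "rb") as f:
+    label = pickle.load(f)  # 通常为(样本名列表, 标签列表)形式的二元组
+
+print(data.shape)      # 预期为(样本数, 3, 帧数, 25, 2)形式的五维张量
+print(len(label[1]))   # 标签数量,应与样本数一致
+```
+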
+## CTR-GCN数据集准备
+
+以下是CTR-GCN模型的数据集准备流程介绍。
+
+### 数据集下载
+
+在`data\ntu-rgb-d`目录有下载其官方网站[NTU-RGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/)提供的数据集的脚本`download_dataset.sh`
+
+```bash
+sh data/ntu-rgb-d/download_dataset.sh
+```
+
+运行脚本后会得到如下的数据目录:
+```txt
+─── ntu-rgb-d
+ ├── download_dataset.sh
+ ├── nturgb+d_skeletons
+ │ ├── S001C001P001R001A001.skeleton
+ │ ├── S001C001P001R001A002.skeleton
+ │ ├── S001C001P001R001A003.skeleton
+ │ ├── S001C001P001R001A004.skeleton
+ │ ├── S001C001P001R001A005.skeleton
+ │ ├── S001C001P001R001A006.skeleton
+ │ ├── S001C001P001R001A007.skeleton
+ │ ├── ....
+ │ └── S017C003P020R002A060.skeleton
+ ├── get_raw_denoised_data.py
+ ├── get_raw_skes_data.py
+ ├── seq_transformation.py
+ └── statistics
+ ├── camera.txt
+ ├── label.txt
+ ├── performer.txt
+ ├── replication.txt
+ ├── setup.txt
+ └── skes_available_name.txt
+
+```
+
+### 数据集处理
+
+运行如下脚本,将数据处理成CTR-GCN所需的格式。
+
+> 注:若自定义数据集,提前准备好`data/ntu-rgb-d/statistics/skes_available_name.txt`文件,该文件是待处理的骨骼点数据文件名清单。
+
+```bash
+cd ./data/ntu-rgb-d
+# Get skeleton of each performer
+python get_raw_skes_data.py
+# Remove the bad skeleton
+python get_raw_denoised_data.py
+# Transform the skeleton to the center of the first frame
+python seq_transformation.py
+```
+
+最终数据集处理后得到如下文件树形式
+
+```txt
+─── ntu-rgb-d
+ ├── download_dataset.sh
+ ├── nturgb+d_skeletons
+ │ ├── S001C001P001R001A001.skeleton
+ │ ├── S001C001P001R001A002.skeleton
+ │ ├── S001C001P001R001A003.skeleton
+ │ ├── S001C001P001R001A004.skeleton
+ │ ├── S001C001P001R001A005.skeleton
+ │ ├── S001C001P001R001A006.skeleton
+ │ ├── S001C001P001R001A007.skeleton
+ │ ├── ....
+ │ └── S017C003P020R002A060.skeleton
+ ├── denoised_data
+ │ ├── actors_info
+ │ │ ├── S001C001P001R001A024.txt
+ │ │ ├── S001C001P001R001A025.txt
+ │ │ ├── S001C001P001R001A026.txt
+ │ │ ├── ....
+ │ │ ├── S017C003P020R002A059.txt
+ │ │ └── S017C003P020R002A060.txt
+ │ ├── denoised_failed_1.log
+ │ ├── denoised_failed_2.log
+ │ ├── frames_cnt.txt
+ │ ├── missing_skes_1.log
+ │ ├── missing_skes_2.log
+ │ ├── missing_skes.log
+ │ ├── noise_length.log
+ │ ├── noise_motion.log
+ │ ├── noise_spread.log
+ │ ├── raw_denoised_colors.pkl
+ │ ├── raw_denoised_joints.pkl
+ │ └── rgb+ske
+ ├── raw_data
+ │ ├── frames_cnt.txt
+ │ ├── frames_drop.log
+ │ ├── frames_drop_skes.pkl
+ │ └── raw_skes_data.pkl
+ ├── get_raw_denoised_data.py
+ ├── get_raw_skes_data.py
+ ├── seq_transformation.py
+ ├── statistics
+ │ ├── camera.txt
+ │ ├── label.txt
+ │ ├── performer.txt
+ │ ├── replication.txt
+ │ ├── setup.txt
+ │ └── skes_available_name.txt
+ ├── xview
+ │ ├── train_data.npy
+ │ ├── train_label.pkl
+ │ ├── val_data.npy
+ │ └── val_label.pkl
+ └── xsub
+ ├── train_data.npy
+ ├── train_label.pkl
+ ├── val_data.npy
+ └── val_label.pkl
+```
+
+> 注:文件夹`denoised_data`、`raw_data`和`nturgb+d_skeletons`都为数据处理过程中产生的临时文件,可在提取出`xview`和`xsub`后删除。
diff --git a/docs/zh-CN/dataset/ucf101.md b/docs/zh-CN/dataset/ucf101.md
new file mode 100644
index 0000000000000000000000000000000000000000..83b422d80d091ea4913c01e6ab3baab5b31ba932
--- /dev/null
+++ b/docs/zh-CN/dataset/ucf101.md
@@ -0,0 +1,93 @@
+# UCF101数据准备
+UCF101数据的相关准备。主要包括UCF101的video文件下载,video文件提取frames,以及生成文件的路径list。
+
+---
+## 1. 数据下载
+UCF101数据的详细信息可以参考网站[UCF101](https://www.crcv.ucf.edu/data/UCF101.php)。 为了方便使用,PaddleVideo提供了UCF101数据的annotations文件和videos文件的下载脚本。
+
+### 下载annotations文件
+首先,请确保在[data/ucf101/ 目录](../../../data/ucf101)下,输入如下UCF101数据集的标注文件的命令。
+```shell
+bash download_annotations.sh
+```
+
+### 下载UCF101的视频文件
+同样需要确保在[data/ucf101/ 目录](../../../data/ucf101)下,输入下述命令下载视频文件
+
+```shell
+bash download_videos.sh
+```
+- 运行该命令需要安装unrar解压工具,可使用pip方式安装。
+
+- 下载完成后视频文件会存储在[data/ucf101/videos/ 文件夹](../../../data/ucf101/videos)下,视频文件大小为6.8G。
+
+---
+## 2. 提取视频文件的frames
+为了加速网络的训练过程,我们首先对视频文件(ucf101视频文件为avi格式)提取帧 (frames)。相对于直接通过视频文件进行网络训练的方式,frames的方式能够加快网络训练的速度。
+
+直接输入如下命令,即可提取ucf101视频文件的frames
+
+```bash
+python extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext avi
+```
+
+视频文件frames提取完成后,会存储在`./rawframes`文件夹下,大小为56G。
+
+---
+## 3. 生成frames文件和视频文件的路径list
+生成视频文件的路径list,输入如下命令
+
+```bash
+python build_ucf101_file_list.py videos/ --level 2 --format videos --out_list_path ./
+```
+生成frames文件的路径list,输入如下命令:
+```bash
+python build_ucf101_file_list.py rawframes/ --level 2 --format rawframes --out_list_path ./
+```
+
+**参数说明**
+
+`videos/` 或者 `rawframes/` : 表示视频或者frames文件的存储路径
+
+`--level 2` : 表示文件的存储结构
+
+`--format`: 表示是针对视频还是frames生成路径list
+
+`--out_list_path `: 表示生成的路径list文件存储位置
+
+
+---
+## 4. 文件组织形式
+
+以上步骤完成后,文件组织形式如下所示:
+
+```
+├── data
+| ├── dataset
+| │ ├── ucf101
+| │ │ ├── ucf101_{train,val}_split_{1,2,3}_rawframes.txt
+| │ │ ├── ucf101_{train,val}_split_{1,2,3}_videos.txt
+| │ │ ├── annotations
+| │ │ ├── videos
+| │ │ │ ├── ApplyEyeMakeup
+| │ │ │ │ ├── v_ApplyEyeMakeup_g01_c01.avi
+| │ │ │ │ └── ...
+| │ │ │ ├── YoYo
+| │ │ │ │ ├── v_YoYo_g25_c05.avi
+| │ │ │ │ └── ...
+| │ │ │ └── ...
+| │ │ ├── rawframes
+| │ │ │ ├── ApplyEyeMakeup
+| │ │ │ │ ├── v_ApplyEyeMakeup_g01_c01
+| │ │ │ │ │ ├── img_00001.jpg
+| │ │ │ │ │ ├── img_00002.jpg
+| │ │ │ │ │ ├── ...
+| │ │ │ │ │ ├── flow_x_00001.jpg
+| │ │ │ │ │ ├── flow_x_00002.jpg
+| │ │ │ │ │ ├── ...
+| │ │ │ │ │ ├── flow_y_00001.jpg
+| │ │ │ │ │ ├── flow_y_00002.jpg
+| │ │ │ ├── ...
+| │ │ │ ├── YoYo
+| │ │ │ │ ├── v_YoYo_g01_c01
+| │ │ │ │ ├── ...
+| │ │ │ │ ├── v_YoYo_g25_c05
+
+```
diff --git a/docs/zh-CN/dataset/youtube8m.md b/docs/zh-CN/dataset/youtube8m.md
new file mode 100644
index 0000000000000000000000000000000000000000..e0a62680a8caaa63c69810164f8703fdff2b3523
--- /dev/null
+++ b/docs/zh-CN/dataset/youtube8m.md
@@ -0,0 +1,59 @@
+[English](../../en/dataset/youtube8m.md) | 简体中文
+
+# YouTube-8M数据准备
+
+- [数据集简介](#数据集简介)
+- [数据集下载](#数据集下载)
+- [数据格式转化](#数据格式转化)
+
+
+## 数据集简介
+
+YouTube-8M 是一个大规模视频分类数据集,包含800多万个视频url,标签体系涵盖3800多种知识图谱实体,1个视频对应多个标签(平均3-4个),使用机器进行标注。
+
+**每个视频的长度在120s到500s之间。由于视频数据量太大,因此预先使用图像分类模型提取了frame-level的特征,并使用PCA对特征进行降维,得到多帧1024维的图像特征;类似地,使用音频模型处理得到多帧128维的音频特征。**
+> 这里用到的是YouTube-8M 2018年更新之后的数据集(May 2018 version (current): 6.1M videos, 3862 classes, 3.0 labels/video, 2.6B audio-visual features)。
+
+
+## 数据集下载
+
+1. 新建存放特征的目录(以PaddleVideo目录下为例)
+ ```bash
+ cd data/yt8m
+ mkdir frame
+ cd frame
+ ```
+2. 下载训练、验证集到frame文件夹中
+ ```bash
+ curl data.yt8m.org/download.py | partition=2/frame/train mirror=asia python
+ curl data.yt8m.org/download.py | partition=2/frame/validate mirror=asia python
+ ```
+ 下载过程如图所示
+ 
+
+ 数据下载完成后,将会得到3844个训练数据文件和3844个验证数据文件(TFRecord格式)
+
+
+## 数据格式转化
+1. 安装tensorflow-gpu用于读入tfrecord数据
+ ```bash
+ python3.7 -m pip install tensorflow-gpu==1.14.0
+ ```
+2. 将下载的TFRecord文件转化为pickle文件以便PaddlePaddle使用
+ ```bash
+ cd .. # 从frame目录回到yt8m目录
+ python3.7 tf2pkl.py ./frame ./pkl_frame/ # 将frame文件夹下的train*.tfrecord和validate*.tfrecord转化为pkl格式
+ ```
+3. 生成pkl文件的路径列表,根据该列表将每个pkl拆分为多个小pkl文件,再重新生成最终训练/验证所需的pkl路径列表文件
+ ```bash
+ ls pkl_frame/train*.pkl > train.list # 将train*.pkl的路径写入train.list
+ ls pkl_frame/validate*.pkl > val.list # 将validate*.pkl的路径写入val.list
+
+ python3.7 split_yt8m.py train.list # 拆分每个train*.pkl变成多个train*_split*.pkl
+ python3.7 split_yt8m.py val.list # 拆分每个validate*.pkl变成多个validate*_split*.pkl
+
+ ls pkl_frame/train*_split*.pkl > train.list # 将train*_split*.pkl的路径重新写入train.list
+ ls pkl_frame/validate*_split*.pkl > val.list # 将validate*_split*.pkl的路径重新写入val.list
+ ```
+
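+转换与拆分完成后,可用如下脚本粗略检查pkl文件是否可以正常读取(仅为示意,具体字段名以`tf2pkl.py`的实现为准):
+
+```python
+import pickle
+
+# 以train.list中的第一个pkl文件为例
+with open("train.list") as f:
+    pkl_path = f.readline().strip()
+
+with open(pkl_path, "rb") as f:
+    record = pickle.load(f)
+
+print(type(record))
+if isinstance(record, dict):
+    print(record.keys())
+```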
diff --git a/docs/zh-CN/install.md b/docs/zh-CN/install.md
new file mode 100644
index 0000000000000000000000000000000000000000..2f5bd7e187683f78bf3a51b0f13fe9e6114e6abc
--- /dev/null
+++ b/docs/zh-CN/install.md
@@ -0,0 +1,87 @@
+简体中文 | [English](../en/install.md)
+
+# 安装说明
+
+---
+
+- [简介](#简介)
+- [安装PaddlePaddle](#安装PaddlePaddle)
+- [安装PaddleVideo](#安装PaddleVideo)
+
+## 简介
+
+使用PaddleVideo之前,请先安装PaddlePaddle及相关依赖项。
+
+
+## 安装PaddlePaddle
+
+运行PaddleVideo需要`PaddlePaddle 2.0`或更高版本。请参照[安装文档](http://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。
+PaddleVideo只支持python3.7及以上的运行环境,依赖项请安装python3.7及以上的安装包
+
+如果已经安装好了cuda、cudnn、nccl或者安装好了nvidia-docker运行环境,可以pip3安装最新GPU版本PaddlePaddle
+
+```bash
+pip3 install paddlepaddle-gpu --upgrade
+```
+
+也可以从源码编译安装PaddlePaddle,请参照[安装文档](http://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。
+
+使用以下命令可以验证PaddlePaddle是否安装成功。
+
+```python
+import paddle
+paddle.utils.run_check()
+```
+
+查看PaddlePaddle版本的命令如下:
+
+```bash
+python3 -c "import paddle; print(paddle.__version__)"
+```
+
+注意:
+- 从源码编译的PaddlePaddle版本号为0.0.0,请确保使用了PaddlePaddle 2.0及之后的源码编译。
+- PaddleVideo基于PaddlePaddle高性能的分布式训练能力,若您从源码编译,请确保打开编译选项,**WITH_DISTRIBUTE=ON**。具体编译选项参考[编译选项表](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/Tables.html#id3)。
+- 在docker中运行时,为保证docker容器有足够的共享内存用于Paddle的数据读取加速,在创建docker容器时,请设置参数`--shm_size=32g`,条件允许的话可以设置为更大的值。
+
+**运行环境需求:**
+
+- Python3.7 or later version (当前只支持Linux系统)
+- CUDA >= 10.1
+- cuDNN >= 7.6.4
+- nccl >= 2.1.2
+
+
+## 安装PaddleVideo
+
+**克隆PaddleVideo模型库:**
+
+```
+cd path_to_clone_PaddleVideo
+git clone https://github.com/PaddlePaddle/PaddleVideo.git
+cd PaddleVideo
+```
+
+**安装Python依赖库:**
+
+Python依赖库在[requirements.txt](https://github.com/PaddlePaddle/PaddleVideo/blob/master/requirements.txt)中给出,可通过如下命令安装:
+
+```
+python3.7 -m pip install --upgrade pip
+pip3.7 install --upgrade -r requirements.txt
+```
+
+**从python安装包安装PaddleVideo:**
+
+使用pypi安装
+
+```bash
+pip install paddlevideo==0.0.1
+```
+
+安装完成后,可以使用命令行方式启动程序
+```bash
+ppvideo --model_name='ppTSM' --video_file='data/example.avi'
+```
+
+---
diff --git a/docs/zh-CN/model_zoo/README.md b/docs/zh-CN/model_zoo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3d5788beb0e878273cf22433c985dc03497bef91
--- /dev/null
+++ b/docs/zh-CN/model_zoo/README.md
@@ -0,0 +1,40 @@
+简体中文 | [English](../../en/model_zoo/README.md)
+
+
+# 概要
+PaddleVideo包含视频分类和动作定位方向的多个主流领先模型,其中TSN, TSM和SlowFast是End-to-End的视频分类模型,Attention LSTM是比较流行的视频特征序列模型,BMN是视频动作定位模型,TransNetV2是视频切分模型。TSN是基于2D-CNN的经典解决方案,TSM是基于时序移位的简单高效视频时空建模方法,SlowFast是FAIR在ICCV2019提出的3D视频分类模型,特征序列模型Attention LSTM速度快精度高。BMN模型是百度自研模型,为2019年ActivityNet夺冠方案。基于百度飞桨产业实践,我们自研并开源了ppTSM,该模型基于TSM进行优化,在保持模型参数量和计算量不增加的前提下,精度得到大幅提升。同时,我们的通用优化策略可以广泛适用于各种视频模型,未来我们将进行更多的模型优化工作,比如TPN、SlowFast、X3D等,敬请期待。
+
+
+## 模型概览
+
+| 领域 | 模型 | 配置 | 测试集 | 精度指标 | 精度% | 下载链接 |
+| :--------------- | :--------: | :------------: | :------------: | :------------: | :------------: | :------------: |
+| 行为识别 | [**PP-TSM**](./recognition/pp-tsm.md) | [pptsm.yaml](../../../configs/recognition/pptsm/pptsm_k400_frames_dense.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 76.16 | [PPTSM.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense_distill.pdparams) |
+| 行为识别| [**PP-TSN**](./recognition/pp-tsn.md) | [pptsn.yaml](../../../configs/recognition/pptsn/pptsn_k400_frames.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 75.06 | [PPTSN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400_8.pdparams) |
+| 行为识别 | [**PP-TimeSformer**](./recognition/pp-timesformer.md) | [pptimesformer.yaml](../../../configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 79.44 | [ppTimeSformer_k400_16f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_16f_distill.pdparams) |
+| 行为识别 | [AGCN](./recognition/agcn.md) | [agcn.yaml](../../../configs/recognition/agcn/agcn_fsd.yaml) | [FSD](../dataset/fsd.md) | Top-1 | 62.29 | [AGCN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_fsd.pdparams) |
+| 行为识别 | [ST-GCN](./recognition/stgcn.md) | [stgcn.yaml](../../../configs/recognition/stgcn/stgcn_fsd.yaml) | [FSD](../dataset/fsd.md) | Top-1 | 59.07 | [STGCN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_fsd.pdparams) |
+| 行为识别 | [VideoSwin](./recognition/videoswin.md) | [videoswin.yaml](../../../configs/recognition/videoswin/videoswin_k400_videos.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 82.40 | [VideoSwin.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_k400.pdparams) |
+| 行为识别 | [TimeSformer](./recognition/timesformer.md) | [timesformer.yaml](../../../configs/recognition/timesformer/timesformer_k400_videos.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 77.29 | [TimeSformer.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TimeSformer_k400.pdparams) |
+| 行为识别 | [SlowFast](./recognition/slowfast.md) | [slowfast_multigrid.yaml](../../../configs/recognition/slowfast/slowfast_multigrid.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 75.84 | [SlowFast.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) |
+| 行为识别 | [TSM](./recognition/tsm.md) | [tsm.yaml](../../../configs/recognition/tsm/tsm_k400_frames.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 70.86 | [TSM.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams) |
+| 行为识别 | [TSN](./recognition/tsn.md) | [tsn.yaml](../../../configs/recognition/tsn/tsn_k400_frames.yaml) | [Kinetics-400](../dataset/k400.md) | Top-1 | 69.81 | [TSN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400.pdparams) |
+| 行为识别 | [AttentionLSTM](./recognition/attention_lstm.md) | [attention_lstm.yaml](../../../configs/recognition/attention_lstm/attention_lstm.yaml) | [Youtube-8M](../dataset/youtube8m.md) | Hit@1 | 89.0 | [AttentionLstm.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/AttentionLstm/AttentionLstm.pdparams) |
+| 视频动作定位| [BMN](./localization/bmn.md) | [bmn.yaml](../../../configs/localization/bmn.yaml) | [ActivityNet](../dataset/ActivityNet.md) | AUC | 67.23 | [BMN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/BMN/BMN.pdparams) |
+| 视频切分 | [TransNetV2](./partition/transnetv2.md) | [transnetv2.yaml](../../../configs/partitioners/transnetv2/transnetv2.yaml) | ClipShots | F1 scores | 76.1 | |
+| 深度估计 | [ADDS](./estimation/adds.md) | [adds.yaml](../../../configs/estimation/adds/adds.yaml) | Oxford_RobotCar | Abs Rel | 0.209 | [ADDS_car.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ADDS_car.pdparams) |
+
+
+# 参考文献
+
+- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen
+- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen.
+- [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982), Feichtenhofer C, Fan H, Malik J, et al.
+- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool
+- [Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/abs/1811.08383v1), Ji Lin, Chuang Gan, Song Han
+- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf) Gedas Bertasius, Heng Wang, Lorenzo Torresani
+- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin
+- [Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1805.07694), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu
+- [Skeleton-Based Action Recognition with Multi-Stream Adaptive Graph Convolutional Networks](https://arxiv.org/abs/1912.06971), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu
+- [TransNet V2: An effective deep network architecture for fast shot transition detection](https://arxiv.org/abs/2008.04838), Tomáš Souček, Jakub Lokoč
+- [Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628), Lina Liu, Xibin Song, Mengmeng Wang
diff --git a/docs/zh-CN/model_zoo/detection/SlowFast_FasterRCNN.md b/docs/zh-CN/model_zoo/detection/SlowFast_FasterRCNN.md
new file mode 100644
index 0000000000000000000000000000000000000000..d0a64e948060f256d63586a83aff0ffdcc722144
--- /dev/null
+++ b/docs/zh-CN/model_zoo/detection/SlowFast_FasterRCNN.md
@@ -0,0 +1,140 @@
+简体中文 | [English](../../../en/model_zoo/detection/SlowFast_FasterRCNN_en.md)
+
+# SlowFast_FasterRCNN
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+
+在开始使用之前,您需要按照以下命令安装额外的依赖包:
+```bash
+python -m pip install moviepy
+python -m pip install et_xmlfile
+python -m pip install paddledet
+```
+
+## 模型简介
+
+[SlowFast](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/slowfast.md)模型是视频领域的高精度模型之一,对于动作识别任务,还需要检测出当前画面人物,因此SlowFast_FasterRCNN模型以人的检测结果和视频数据为输入,通过SlowFast模型提取时空特征,然后利用FasterRCNN的head得到画面中每个人的动作和位置。
+
+我们提供了详尽理论及代码讲解,并可使用免费在线GPU算力资源,一键运行的AI Studio Notebook项目,使用链接:[基于SlowFast+FasterRCNN的动作识别](https://aistudio.baidu.com/aistudio/projectdetail/3267637?contributionType=1)
+
+详细内容请参考论文[SlowFast Networks for Video Recognition](https://arxiv.org/pdf/1812.03982.pdf)中AVA Action Detection相关内容。
+
+## 数据准备
+
+本项目利用[AVA数据集](https://research.google.com/ava/download.html)进行动作检测。AVA v2.2数据集包括430个视频,其中235个用于训练,64个用于验证,131个用于测试。对每个视频中15分钟的帧进行了标注,每秒标注一帧。标注文件格式为CSV。
+
+相关处理脚本在`data/ava/script`目录下。
+
+### 1 下载视频
+```
+bash download_videos.sh
+```
+
+### 2 下载标注
+```
+bash download_annotations.sh
+```
+
+### 3 下载检测结果
+
+```
+bash fetch_ava_proposals.sh
+```
+
+### 4 视频切割
+从下载视频的第15分钟处开始,切割出其后15分钟的片段:
+
+```
+bash cut_videos.sh
+```
+
+### 5 提取视频帧
+```
+bash extract_rgb_frames.sh
+```
+
+此处以AVA v2.1版本为例,进行关键文件介绍:
+* ava_videos_15min_frames文件夹中存放以FPS为帧率抽取的视频帧;
+* ava_train_v2.1.csv文件存放训练数据标注;
+* ava_train_excluded_timestamps_v2.1.csv文件中存放废弃的时间戳数据;
+* ava_dense_proposals_train.FAIR.recall_93.9.pkl文件中为每个关键帧中人的位置和置信度数据;
+* ava_action_list_v2.1_for_activitynet_2018.pbtxt为动作类别数据。
+
+## 模型训练
+
+下载预训练模型:
+```
+wget https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams
+```
+
+
+* `-c`后面的参数是配置文件的路径。
+* `-w`后面的参数是finetuning或者测试时的权重,本案例将在Kinetics 400上训练的SlowFast R50模型作为预训练权重,通过下面的表格可获取。
+* `--validate`参数表示在训练过程中进行模型评估。
+
+```
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+python -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=logdir.ava main.py --validate -w SlowFast_8*8.pdparams -c configs/detection/ava/ava.yaml
+```
+
+## 模型测试
+
+基于训练好的模型进行评估:
+```
+python main.py --test \
+ -w output/AVA_SlowFast_FastRcnn/AVA_SlowFast_FastRcnn_best.pdparams \
+ -c configs/detection/ava/ava.yaml
+```
+
+| architecture | depth | Pretrain Model | frame length x sample rate | MAP | AVA version | model |
+| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |------------- |
+| SlowFast | R50 | [Kinetics 400](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) | 8 x 8 | 23.2 | 2.1 | [`link`](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/SlowFastRCNN_AVA.pdparams) |
+
+
+## 模型推理
+
+本项目动作识别分成两个阶段,第一个阶段得到人的proposals,然后再输入到SlowFast+FasterRCNN模型中进行动作识别。
+
+对于画面中人的检测,可利用[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection)中的模型。
+
+PaddleDetection安装:
+```
+# 安装其他依赖
+cd PaddleDetection/
+pip install -r requirements.txt
+
+# 编译安装paddledet
+python setup.py install
+```
+
+下载训练好的检测模型参数:
+```
+wget https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_1x_coco.pdparams
+```
+
+导出模型:
+
+```
+python tools/export_model.py \
+ -c configs/detection/ava/ava.yaml \
+ -o inference_output \
+ -p output/AVA_SlowFast_FastRcnn/AVA_SlowFast_FastRcnn_best.pdparams
+```
+
+基于导出的模型做推理:
+
+```
+python tools/predict.py \
+ -c configs/detection/ava/ava.yaml \
+ --input_file "data/-IELREHXDEMO.mp4" \
+ --model_file "inference_output/AVA_SlowFast_FastRcnn.pdmodel" \
+ --params_file "inference_output/AVA_SlowFast_FastRcnn.pdiparams" \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
diff --git a/docs/zh-CN/model_zoo/estimation/adds.md b/docs/zh-CN/model_zoo/estimation/adds.md
new file mode 100644
index 0000000000000000000000000000000000000000..339507687e18c1a506cd565290bd7a735fd2b15c
--- /dev/null
+++ b/docs/zh-CN/model_zoo/estimation/adds.md
@@ -0,0 +1,133 @@
+[English](../../../en/model_zoo/estimation/adds.md) | 简体中文
+
+# ADDS-DepthNet模型
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+在开始使用之前,您需要按照以下命令安装额外的依赖包:
+```bash
+python -m pip install scikit-image
+python -m pip install matplotlib
+```
+
+## 模型简介
+
+本模型以百度机器人与自动驾驶实验室的**ICCV 2021论文 [Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628)** 为参考,
+复现了基于白天和夜晚图像的自监督单目深度估计模型,其利用了白天和夜晚的图像数据互补性质,减缓了昼夜图像较大的域偏移以及照明变化对深度估计的精度带来的影响,在具有挑战性的牛津RobotCar数据集上实现了全天图像的最先进的深度估计结果。
+
+
+## 数据准备
+
+Oxford RobotCar dataset数据下载及准备请参考[Oxford RobotCar dataset数据准备](../../dataset/Oxford_RobotCar.md)
+
+
+## 模型训练
+
+### Oxford RobotCar dataset数据集训练
+
+#### 下载并添加预训练模型
+
+1. 下载图像预训练模型[resnet18.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/Resnet18_Imagenet.pdparams)作为Backbone初始化参数,或通过wget命令下载
+
+ ```bash
+ wget -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/Resnet18_Imagenet.pdparams
+ ```
+
+2. 打开`PaddleVideo/configs/estimation/adds/adds.yaml`,将下载好的权重存放路径填写到下方`pretrained:`之后
+
+ ```yaml
+ MODEL: #MODEL field
+ framework: "DepthEstimator" #Mandatory, indicate the type of network, associate to the 'paddlevideo/modeling/framework/' .
+ backbone: #Mandatory, indicate the type of backbone, associate to the 'paddlevideo/modeling/backbones/' .
+ name: 'ADDS_DepthNet'
+ pretrained: 将路径填写到此处
+ ```
+
+#### 开始训练
+
+- Oxford RobotCar dataset数据集使用单卡训练,训练方式的启动命令如下:
+
+ ```bash
+ python3.7 main.py --validate -c configs/estimation/adds/adds.yaml --seed 20
+ ```
+
+
+## 模型测试
+
+- ADDS-DepthNet模型在训练时同步进行验证(只对白天或者夜晚的数据进行验证),您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下:
+
+ ```bash
+ Already save the best model (rmse)8.5531
+ ```
+
+- 由于模型暂时一次只能测试yaml文件中给定路径的一个白天或者夜晚的数据集,因此若要得到本文档开头处的完整测试分数,需要运行4次测试命令并分别记录下它们的指标(白天40m、白天60m、夜晚40m、夜晚60m)
+
+- 训练好的模型下载地址:[ADDS_car.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ADDS_car.pdparams)
+
+- 测试命令如下:
+
+ ```bash
+ # 夜晚40m
+ python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_night_files.txt" -o MODEL.head.max_gt_depth=40
+
+ # 夜晚60m
+ python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_night_files.txt" -o MODEL.head.max_gt_depth=60
+
+ # 白天40m
+ python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_day_files.txt" -o MODEL.head.max_gt_depth=40
+
+ # 白天60m
+ python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_day_files.txt" -o MODEL.head.max_gt_depth=60
+ ```
+
+ 在Oxford RobotCar dataset的validation数据集上的测试指标如下:
+
+  | version     | Max Depth | Abs Rel | Sq Rel | RMSE  | RMSE log | δ<1.25 | δ<1.25^2 | δ<1.25^3 |
+  | ----------- | --------- | ------- | ------ | ----- | -------- | ------ | -------- | -------- |
+  | ours(night) | 40        | 0.209   | 1.741  | 6.031 | 0.243    | 0.708  | 0.923    | 0.975    |
+  | ours(night) | 60        | 0.207   | 2.052  | 7.888 | 0.258    | 0.686  | 0.909    | 0.970    |
+  | ours(day)   | 40        | 0.114   | 0.574  | 3.411 | 0.157    | 0.860  | 0.977    | 0.993    |
+  | ours(day)   | 60        | 0.119   | 0.793  | 4.842 | 0.173    | 0.838  | 0.967    | 0.991    |
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/estimation/adds/adds.yaml -p data/ADDS_car.pdparams -o inference/ADDS
+```
+
+上述命令将生成预测所需的模型结构文件`ADDS.pdmodel`和模型权重文件`ADDS.pdiparams`以及`ADDS.pdiparams.info`文件,均存放在`inference/ADDS/`目录下
+
+上述bash命令中各个参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example.png \
+ --config configs/estimation/adds/adds.yaml \
+ --model_file inference/ADDS/ADDS.pdmodel \
+ --params_file inference/ADDS/ADDS.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+推理结束会默认以伪彩的方式保存下模型估计出的深度图。
+
+以下是样例图片和对应的预测深度图:
+
+
+
+
+
+
+## 参考论文
+
+- [Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628), Liu, Lina and Song, Xibin and Wang, Mengmeng and Liu, Yong and Zhang, Liangjun
diff --git a/docs/zh-CN/model_zoo/localization/bmn.md b/docs/zh-CN/model_zoo/localization/bmn.md
new file mode 100644
index 0000000000000000000000000000000000000000..f923e86a6fe5598153bd6101695d04a3115937f4
--- /dev/null
+++ b/docs/zh-CN/model_zoo/localization/bmn.md
@@ -0,0 +1,128 @@
+[English](../../../en/model_zoo/localization/bmn.md) | 简体中文
+
+# BMN 视频动作定位模型
+
+---
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+
+## 模型简介
+
+BMN模型是百度自研,2019年ActivityNet夺冠方案,为视频动作定位问题中proposal的生成提供高效的解决方案,在PaddlePaddle上首次开源。此模型引入边界匹配(Boundary-Matching, BM)机制来评估proposal的置信度,按照proposal开始边界的位置及其长度将所有可能存在的proposal组合成一个二维的BM置信度图,图中每个点的数值代表其所对应的proposal的置信度分数。网络由三个模块组成,基础模块作为主干网络处理输入的特征序列,TEM模块预测每一个时序位置属于动作开始、动作结束的概率,PEM模块生成BM置信度图。
+
+AI Studio项目使用链接:[ActivityNet Challenge 2019 冠军模型:BMN](https://aistudio.baidu.com/aistudio/projectdetail/2250674?contributionType=1)
+
+
+
+BMN Overview
+
+
+## 数据准备
+
+BMN的训练数据采用ActivityNet1.3提供的数据集,数据下载及准备请参考[ActivityNet数据说明](../../dataset/ActivityNet.md)
+
+## 模型训练
+
+数据准备完毕后,可以通过如下方式启动训练:
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_bmn main.py --validate -c configs/localization/bmn.yaml
+```
+
+- 从头开始训练,使用上述启动命令行或者脚本程序即可启动训练,不需要用到预训练模型
+
+### 单卡训练
+
+单卡训练请将配置文件中的`DATASET.batch_size`字段修改为16,如下:
+
+```yaml
+DATASET: #DATASET field
+  batch_size: 16 #single card batch size
+```
+
+单卡训练启动方式如下:
+
+```bash
+python -B main.py --validate -c configs/localization/bmn.yaml
+```
+
+
+## 模型测试
+
+可通过如下方式进行模型测试:
+
+```bash
+python main.py --test -c configs/localization/bmn.yaml -w output/BMN/BMN_epoch_00009.pdparams -o DATASET.test_batch_size=1
+```
+
+- 目前仅支持**单卡**、`batch_size`为**1**的模型测试。
+
+- 请下载[activity\_net\_1\_3\_new.json](https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json)文件,并通过`METRIC.ground_truth_filename`字段指定该ground_truth文件,相较于原始的activity\_net.v1-3.min.json文件,我们过滤了其中一些失效的视频条目。
+
+- 通过 `-w`参数指定待测试模型文件的路径,您可以下载我们训练好的模型进行测试[BMN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/BMN/BMN.pdparams)
+
+- 上述程序会将运行结果保存在配置文件`METRIC.output_path`字段指定的路径,默认为`data/bmn/BMN_Test_output`文件夹下,测试结果保存在配置文件`METRIC.result_path`字段指定的文件,默认为`data/bmn/BMN_Test_results/bmn_results_validation.json`文件。
+
+- 我们基于ActivityNet官方提供的测试脚本,计算AR@AN和AUC。具体计算过程请参考[anet_prop.py](https://github.com/PaddlePaddle/PaddleVideo/blob/main/paddlevideo/metrics/ActivityNet/anet_prop.py)文件。
+
+- 注:评估时可能会出现loss为nan的情况。这是由于评估时用的是单个样本,可能存在没有iou>0.6的样本,所以为nan,对最终的评估结果没有影响。
+
+在ActivityNet1.3数据集下评估精度如下:
+
+| AR@1 | AR@5 | AR@10 | AR@100 | AUC |
+| :---: | :---: | :---: | :---: | :---: |
+| 33.26 | 49.48 | 56.86 | 75.19 | 67.23% |
+
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/localization/bmn.yaml \
+ -p data/BMN.pdparams \
+ -o inference/BMN
+```
+
+上述命令将生成预测所需的模型结构文件`BMN.pdmodel`和模型权重文件`BMN.pdiparams`。
+
+- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example_feat.list \
+ --config configs/localization/bmn.yaml \
+ --model_file inference/BMN/BMN.pdmodel \
+ --params_file inference/BMN/BMN.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+- `input_file`为文本文件,指定待推断的文件信息,包括特征文件路径`feat_path`和视频时长(单位:s)`duration_second`。
+
+输出示例如下:
+
+```
+BMN Inference results of data/example_feat.npy :
+{'score': 0.7968077063560486, 'segment': [0.0, 122.9877]}
+{'score': 0.49097609519958496, 'segment': [12.423000000000002, 124.23]}
+{'score': 0.21395835280418396, 'segment': [39.7536, 122.9877]}
+{'score': 0.2106524258852005, 'segment': [0.0, 109.3224]}
+{'score': 0.06876271963119507, 'segment': [23.6037, 114.2916]}
+```
+
+- 默认只打印前5个得分最高的proposal,所有的预测结果可在输出文件中查看,默认输出文件路径为`data/bmn/BMN_INFERENCE_results`。输出路径可在配置文件中的`INFERENCE.result_path`自行修改。
+
+## 参考论文
+
+- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen.
diff --git a/docs/zh-CN/model_zoo/multimodal/actbert.md b/docs/zh-CN/model_zoo/multimodal/actbert.md
new file mode 100644
index 0000000000000000000000000000000000000000..3853968c25ea4e00274124e801282f244c1c9f87
--- /dev/null
+++ b/docs/zh-CN/model_zoo/multimodal/actbert.md
@@ -0,0 +1,103 @@
+[English](../../../en/model_zoo/multimodal/actbert.md) | 简体中文
+
+# ActBERT多模态预训练模型
+
+---
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [参考论文](#参考论文)
+
+在开始使用之前,您需要按照以下命令安装额外的依赖包:
+```bash
+python -m pip install paddlenlp
+python -m pip install lmdb
+```
+
+## 模型简介
+
+ActBERT是百度在CVPR2020提出的多模态预训练模型,它结合输入文本、图像和视频动作三种模态,使用一种全新的纠缠编码模块从三个来源进行多模态特征学习,以增强两个视觉输入和语言之间的互动功能。模型采用RandomMask和NSP的方式进行训练,在文本视频搜索、视频描述生成等5个下游任务中表现优异。
+
+
+
+
+
+
+## 数据准备
+
+HowTo100M数据下载及准备请参考[HowTo100M数据准备](../../dataset/howto100m.md)
+
+MSR-VTT数据下载及准备请参考[MSR-VTT数据准备](../../dataset/msrvtt.md)
+
+
+## 模型训练
+
+### HowTo100M数据集训练
+
+#### 下载并添加预训练模型
+
+下载BERT预训练模型[bert-base-uncased](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/bert-base-uncased.pdparams)作为Backbone初始化参数,或是通过命令行下载
+
+```bash
+wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/bert-base-uncased.pdparams
+```
+
+并将文件路径添加到配置文件中的`MODEL.framework.backbone.pretrained`字段,如下:
+
+```yaml
+MODEL:
+ framework: "ActBert"
+ backbone:
+ name: "BertForMultiModalPreTraining"
+ pretrained: 将路径填写到此处
+```
+
+- 由于训练数据集过大,本代码提供小数据训练功能,训练配置仅供参考~
+
+#### 开始训练
+
+- 训练启动命令如下:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_actbert main.py --validate -c configs/multimodal/actbert/actbert.yaml
+```
+
+- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下:
+
+```bash
+export FLAGS_conv_workspace_size_limit=800 #MB
+export FLAGS_cudnn_exhaustive_search=1
+export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_actbert main.py --amp --validate -c configs/multimodal/actbert/actbert.yaml
+```
+
+- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的。
+
+
+## 模型测试
+
+- 对下游任务:文本-视频检索,在MSR-VTT数据集上评估性能,评估脚本启动方式如下:
+
+
+```bash
+python3.7 main.py --test -c configs/multimodal/actbert/actbert_msrvtt.yaml -w Actbert.pdparams
+```
+
+- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。
+
+
+MSR-VTT数据集测试精度:
+
+| R@1 | R@5 | R@10 | Median R | Mean R | checkpoints |
+| :------: | :----------: | :----: | :----: | :----: | :----: |
+| 8.6 | 31.2 | 45.5 | 13.0 | 28.5 | [ActBERT.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ActBERT.pdparams) |
+
+
+## 参考论文
+
+- [ActBERT: Learning Global-Local Video-Text Representations](https://arxiv.org/abs/2011.07231), Linchao Zhu, Yi Yang
diff --git a/docs/zh-CN/model_zoo/partition/transnetv2.md b/docs/zh-CN/model_zoo/partition/transnetv2.md
new file mode 100644
index 0000000000000000000000000000000000000000..51c2510941e59624f9b68000b580396b426f3cc5
--- /dev/null
+++ b/docs/zh-CN/model_zoo/partition/transnetv2.md
@@ -0,0 +1,85 @@
+[English](../../../en/model_zoo/partition/transnetv2.md) | 简体中文
+
+# TransNetV2视频切分模型
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+在开始使用之前,您需要按照以下命令安装额外的依赖包:
+```bash
+python -m pip install ffmpeg-python==0.2.0
+```
+
+## 模型简介
+
+TransNetV2是一种基于深度学习的视频切分模型,通过DDCNN V2结构进行特征学习,并加入RGB颜色直方图、视频帧相似度进行更有效的特征提取,最终获取每一帧是否是镜头边界帧的概率,从而完成视频切分。该算法效果较好,且计算高效,十分适合工业落地。
+
+
+
+本代码当前仅支持模型推理,模型的训练和测试将在后续提供。
+
+
+## 数据准备
+
+coming soon
+
+
+## 模型训练
+
+coming soon
+
+
+## 模型测试
+
+coming soon
+
+
+## 模型推理
+
+下载在ClipShots和TRECVID IACC.3上训练好的TransNetV2模型参数 [TransNetV2_shots.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TransNetV2_shots.pdparams),也可以通过命令行下载
+
+```bash
+wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TransNetV2_shots.pdparams
+```
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/partitioners/transnetv2/transnetv2.yaml -p data/TransNetV2_shots.pdparams -o inference/TransNetV2
+```
+
+上述命令将生成预测所需的模型结构文件`TransNetV2.pdmodel`和模型权重文件`TransNetV2.pdiparams`以及`TransNetV2.pdiparams.info`文件,均存放在`inference/TransNetV2/`目录下
+
+上述bash命令中各个参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/partitioners/transnetv2/transnetv2.yaml \
+ --model_file inference/TransNetV2/TransNetV2.pdmodel \
+ --params_file inference/TransNetV2/TransNetV2.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+通过定义配置文件`transnetv2.yaml`中`output_path`参数,可以将每帧的预测概率输出到`{output_path}/example_predictions.txt`中,预测得到的镜头边界输出到`{output_path}/example_scenes.txt`中。
+通过定义配置文件`transnetv2.yaml`中`visualize`参数为True,可以将预测结果可视化,可视化结果保存至`{output_path}/example_vis.png`。
+
+输出示例如下:
+
+```bash
+Current video file: data/example.avi
+ Shot Boundarys: [[ 0 130]]
+```
+
+可以看到,使用TransNetV2模型对`data/example.avi`进行预测,输出的视频镜头边界帧为[0,130]。
+## 参考论文
+
+- [TransNet V2: An effective deep network architecture for fast shot transition detection](https://arxiv.org/abs/2008.04838), Tomáš Souček, Jakub Lokoč
diff --git a/docs/zh-CN/model_zoo/recognition/agcn.md b/docs/zh-CN/model_zoo/recognition/agcn.md
new file mode 100644
index 0000000000000000000000000000000000000000..5e66a74f30a6e8a4c90db36f3ecf344d5d002d1a
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/agcn.md
@@ -0,0 +1,134 @@
+[English](../../../en/model_zoo/recognition/agcn.md) | 简体中文
+
+# AGCN基于骨骼的行为识别模型
+
+---
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+
+## 模型简介
+
+
+我们对[ST-GCN模型](./stgcn.md)进行了优化,实现了精度更高的AGCN模型,模型优化细节参考[AGCN模型解析](https://www.bilibili.com/video/BV1w3411172G).
+
+
+## 数据准备
+
+花样滑冰比赛数据下载及准备请参考[花样滑冰数据准备](../../dataset/fsd.md)
+
+NTU-RGBD数据下载及准备请参考[NTU-RGBD数据准备](../../dataset/ntu-rgbd.md)
+
+## 模型训练
+
+### 花样滑冰比赛数据集训练
+
+- 花样滑冰比赛数据集使用单卡训练,启动命令如下:
+
+```bash
+python3.7 main.py -c configs/recognition/agcn/agcn_fsd.yaml
+```
+
+- 由于赛事未提供验证集数据,因此训练时不做valid。
+
+- 您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,参数用法请参考[config](../../tutorials/config.md)。
+
+### NTU-RGBD数据集训练
+
+- NTU-RGBD数据集使用4卡训练,启动命令如下:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_agcn main.py --validate -c configs/recognition/agcn/agcn_ntucs.yaml
+```
+
+- `agcn_ntucs.yaml`配置文件为NTU-RGB+D数据集按cross-subject划分方式对应的训练配置。
+
+
+## 模型测试
+
+### 花样滑冰比赛数据集模型测试
+
+- 模型测试的启动命令如下:
+
+```bash
+python3.7 main.py --test -c configs/recognition/agcn/agcn_fsd.yaml -w output/AGCN/AGCN_epoch_00100.pdparams
+```
+
+- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。
+
+- 评估结果保存在submission.csv文件中,可在[评测官网](https://aistudio.baidu.com/aistudio/competition/detail/115)提交查看得分。
+
+模型在花样滑冰比赛数据集上baseline实验精度如下:
+
+| Test_Data | Top-1 | checkpoints |
+| :----: | :----: | :---- |
+| Test_A | 62.29 | [AGCN_fsd.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_fsd.pdparams) |
+
+
+### NTU-RGB+D数据集模型测试
+
+- 模型测试的启动命令如下:
+
+```bash
+python3.7 main.py --test -c configs/recognition/agcn/agcn_ntucs.yaml -w output/AGCN/AGCN_best.pdparams
+```
+
+- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。
+
+模型在NTU-RGB+D数据集上实验精度如下:
+
+| split | Top-1 | checkpoints |
+| :----: | :----: | :---- |
+| cross-subject | 83.27 | [AGCN_ntucs.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_ntucs.pdparams) |
+
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/agcn/agcn_fsd.yaml \
+ -p data/AGCN_fsd.pdparams \
+ -o inference/AGCN
+```
+
+上述命令将生成预测所需的模型结构文件`AGCN.pdmodel`和模型权重文件`AGCN.pdiparams`。
+
+- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/fsd10/example_skeleton.npy \
+ --config configs/recognition/agcn/agcn_fsd.yaml \
+ --model_file inference/AGCN/AGCN.pdmodel \
+ --params_file inference/AGCN/AGCN.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+输出示例如下:
+
+```
+Current video file: data/fsd10/example_skeleton.npy
+ top-1 class: 27
+ top-1 score: 0.8965644240379333
+```
+
+可以看到,使用在FSD上训练好的AGCN模型对`data/fsd10/example_skeleton.npy`进行预测,输出的top1类别id为`27`,置信度为0.89。
+
+## 参考论文
+
+- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin
+
+- [Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1805.07694), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu
+
+- [Skeleton-Based Action Recognition with Multi-Stream Adaptive Graph Convolutional Networks](https://arxiv.org/abs/1912.06971), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu
+
+- Many thanks to [li7819559](https://github.com/li7819559) and [ZhaoJingjing713](https://github.com/ZhaoJingjing713) for contributing the code.
diff --git a/docs/zh-CN/model_zoo/recognition/attention_lstm.md b/docs/zh-CN/model_zoo/recognition/attention_lstm.md
new file mode 100644
index 0000000000000000000000000000000000000000..df04f07f2c780d3eac543d0cbbcd74852a81eb0b
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/attention_lstm.md
@@ -0,0 +1,86 @@
+简体中文 | [English](../../../en/model_zoo/recognition/attention_lstm.md)
+
+# AttentionLSTM
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+## 模型简介
+
+循环神经网络(RNN)常用于序列数据的处理,可建模视频连续多帧的时序信息,在视频分类领域为基础常用方法。
+该模型采用了双向长短时记忆网络(LSTM),将视频的所有帧特征依次编码。与传统方法直接采用LSTM最后一个时刻的输出不同,该模型增加了一个Attention层,每个时刻的隐状态输出都有一个自适应权重,然后线性加权得到最终特征向量。参考论文中实现的是两层LSTM结构,而**本模型实现的是带Attention的双向LSTM**。
+
+Attention层可参考论文[AttentionCluster](https://arxiv.org/abs/1711.09550)
+
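+下面给出Attention聚合部分的一个最小示意实现(仅为示意,与PaddleVideo中的实际实现细节可能存在差异):
+
+```python
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+class AttentionPoolingLSTM(nn.Layer):
+    """带Attention的双向LSTM特征聚合(示意)。"""
+    def __init__(self, feat_dim=1024, hidden_dim=512):
+        super().__init__()
+        self.lstm = nn.LSTM(feat_dim, hidden_dim, direction="bidirect")
+        self.att_fc = nn.Linear(2 * hidden_dim, 1)  # 每个时刻输出一个标量打分
+
+    def forward(self, x):                           # x: [N, T, feat_dim]
+        h, _ = self.lstm(x)                         # h: [N, T, 2*hidden_dim]
+        weight = F.softmax(self.att_fc(h), axis=1)  # 每个时刻的自适应权重
+        return (h * weight).sum(axis=1)             # 线性加权得到最终特征向量
+
+feat = paddle.randn([2, 100, 1024])         # 2个样本,每个样本100帧的1024维特征
+print(AttentionPoolingLSTM()(feat).shape)   # [2, 1024]
+```
+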
+## 数据准备
+
+PaddleVideo提供了在Youtube-8M数据集上训练和测试的脚本。Youtube-8M数据下载及准备请参考[YouTube-8M数据准备](../../dataset/youtube8m.md)
+
+## 模型训练
+
+### Youtube-8M数据集训练
+
+#### 开始训练
+
+- Youtube-8M数据集使用8卡训练,feature格式下会使用视频和音频特征作为输入,数据的训练启动命令如下
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_attetion_lstm main.py --validate -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml
+ ```
+
+## 模型测试
+
+命令如下:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_attetion_lstm main.py --test -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml -w "output/AttentionLSTM/AttentionLSTM_best.pdparams"
+```
+
+在Youtube-8M的validation数据集上的测试指标如下:
+
+| Hit@1 | PERR | GAP | checkpoints |
+| :-----: | :---------: | :---: | ----- |
+| 89.05 | 80.49 | 86.30 | [AttentionLSTM_yt8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AttentionLSTM_yt8.pdparams) |
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml \
+ -p data/AttentionLSTM_yt8.pdparams \
+ -o inference/AttentionLSTM
+```
+
+上述命令将生成预测所需的模型结构文件`AttentionLSTM.pdmodel`和模型权重文件`AttentionLSTM.pdiparams`。
+
+各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-模型推理)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example.pkl \
+ --config configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml \
+ --model_file inference/AttentionLSTM/AttentionLSTM.pdmodel \
+ --params_file inference/AttentionLSTM/AttentionLSTM.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+输出示例如下:
+```bash
+Current video file: data/example.pkl
+ top-1 class: 11
+ top-1 score: 0.9841002225875854
+```
+可以看到,使用在Youtube-8M上训练好的AttentionLSTM模型对data/example.pkl进行预测,输出的top1类别id为11,置信度为0.98。
+## 参考论文
+
+- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen
+- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan
+
diff --git a/docs/zh-CN/model_zoo/recognition/ctrgcn.md b/docs/zh-CN/model_zoo/recognition/ctrgcn.md
new file mode 100644
index 0000000000000000000000000000000000000000..b1d7a2721c638a0e6c01924f5415ae6464114d1d
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/ctrgcn.md
@@ -0,0 +1,132 @@
+[English](../../../en/model_zoo/recognition/ctrgcn.md) | 简体中文
+
+# CTR-GCN基于骨骼的行为识别模型
+
+---
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+
+## 模型简介
+
+[CTR-GCN](https://github.com/Uason-Chen/CTR-GCN.git)是ICCV 2021提出的基于骨骼的行为识别模型。它在具有拓扑结构的人体骨骼数据上使用通道级拓扑细化的图卷积(Channel-wise Topology Refinement Graph Convolution),并结合时序卷积提取时空特征进行行为识别,提升了基于骨骼的行为识别任务的精度。
+
+
+
+
+
+
+## 数据准备
+
+NTU-RGBD数据下载及准备请参考[NTU-RGBD数据准备](../../dataset/ntu-rgbd.md)
+
+
+## 模型训练
+
+### NTU-RGBD数据集训练
+
+- NTU-RGBD数据集单卡训练,启动命令如下:
+
+```bash
+# joint modality
+python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml --seed 1
+
+# bone modality
+python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone.yaml --seed 1
+
+# motion modality
+python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_motion.yaml --seed 1
+
+# bone motion modality
+python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone_motion.yaml --seed 1
+```
+
+- NTU-RGBD数据集使用4卡训练,启动命令如下:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_ctrgcn main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml
+```
+
+- 配置文件`ctrgcn_ntucs_joint.yaml`为NTU-RGB+D数据集按cross-subject划分方式对应的训练配置。
+
+
+## 模型测试
+
+### NTU-RGB+D数据集模型测试
+
+- 模型测试的启动命令如下:
+
+```bash
+# joint modality
+python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml -w data/CTRGCN_ntucs_joint.pdparams
+
+# bone modality
+python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone.yaml -w data/CTRGCN_ntucs_bone.pdparams
+
+# motion modality
+python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_motion.yaml -w data/CTRGCN_ntucs_motion.pdparams
+
+# bone motion modality
+python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone_motion.yaml -w data/CTRGCN_ntucs_bone_motion.pdparams
+```
+
+- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。
+
+模型在NTU-RGB+D数据集上实验精度如下:
+
+| split | modality | Top-1 | checkpoints |
+| :----: | :----: | :----: | :----: |
+| cross-subject | joint | 89.93 | [CTRGCN_ntucs_joint.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_joint.pdparams) |
+| cross-subject | bone | 85.24 | [CTRGCN_ntucs_bone.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_bone.pdparams) |
+| cross-subject | motion | 85.33 | [CTRGCN_ntucs_motion.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_motion.pdparams) |
+| cross-subject | bone motion | 84.53 | [CTRGCN_ntucs_bone_motion.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_bone_motion.pdparams) |
+
+
+
+
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml \
+ -p data/CTRGCN_ntucs_joint.pdparams \
+ -o inference/CTRGCN
+```
+
+上述命令将生成预测所需的模型结构文件`CTRGCN_joint.pdmodel`和模型权重文件`CTRGCN_joint.pdiparams`。
+
+- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example_NTU-RGB-D_sketeton.npy \
+ --config configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml \
+ --model_file inference/CTRGCN_joint/CTRGCN_joint.pdmodel \
+ --params_file inference/CTRGCN_joint/CTRGCN_joint.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+输出示例如下:
+
+```
+Current video file: data/example_NTU-RGB-D_sketeton.npy
+ top-1 class: 4
+ top-1 score: 0.999988317489624
+```
+
+可以看到,使用在NTU-RGBD数据集上训练好的CTR-GCN模型对`data/example_NTU-RGB-D_sketeton.npy`进行预测,输出的top1类别id为`4`,置信度为0.999988317489624。
+
+
+## 参考论文
+
+- [Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition](https://arxiv.org/abs/2107.12213), Chen, Yuxin and Zhang, Ziqi and Yuan, Chunfeng and Li, Bing and Deng, Ying and Hu, Weiming
diff --git a/docs/zh-CN/model_zoo/recognition/movinet.md b/docs/zh-CN/model_zoo/recognition/movinet.md
new file mode 100644
index 0000000000000000000000000000000000000000..6f5ba7dd40664ff260a2686d61abdc47e9ae9f10
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/movinet.md
@@ -0,0 +1,90 @@
+[English](../../../en/model_zoo/recognition/movinet.md) | 简体中文
+
+# MoViNet视频分类模型
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+
+## 模型简介
+
+MoViNet是Google Research研发的移动端视频网络。它使用神经结构搜索的方法搜索网络结构,使用因果卷积算子和流缓冲区(stream buffer)弥补在线推理带来的准确率损失,并通过Temporal Ensembles进一步提升准确率,是一个可用于视频流在线推理的轻量高效视频模型。
+
+## 数据准备
+
+Kinetics-400数据下载及准备请参考[kinetics-400数据准备](../../dataset/k400.md)
+
+## 模型训练
+
+数据准备完成后,可通过如下方式启动训练:
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_movinet main.py --validate -c configs/recognition/movinet/movinet_k400_frame.yaml
+```
+
+## 模型测试
+
+- MoViNet模型在训练时同步进行测试,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下:
+
+```txt
+Already save the best model (top1 acc)0.6489
+```
+
+- 若需单独运行测试代码,其启动命令如下:
+
+```bash
+python3.7 main.py --test -c configs/recognition/movinet/movinet_k400_frame.yaml -w output/MoViNet/MoViNet_best.pdparams
+```
+
+- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。
+
+当测试配置采用如下参数时,在Kinetics-400的validation数据集上的评估精度如下:
+
+| Config | Sampling method | num_seg | target_size | Top-1 | checkpoints |
+| :------: | :--------: | :-------: | :-------: | :-----: | :-----: |
+| A0 | Uniform | 50 | 172 | 66.62 | [MoViNetA0_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/MoViNetA0_k400.pdparams) |
+
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/movinet/movinet_k400_frame.yaml \
+ -p data/MoViNetA0_k400.pdparams \
+ -o inference/MoViNetA0
+```
+
+上述命令将生成预测所需的模型结构文件`MoViNet.pdmodel`和模型权重文件`MoViNet.pdiparams`,存放在`inference/MoViNetA0/`目录下。
+
+各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/movinet/movinet_k400_frame.yaml \
+ --model_file inference/MoViNetA0/MoViNet.pdmodel \
+ --params_file inference/MoViNetA0/MoViNet.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+输出示例如下:
+```txt
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.7667049765586853
+```
+
+## 参考论文
+
+- [MoViNets: Mobile Video Networks for Efficient Video Recognition](https://arxiv.org/abs/2103.11511)
diff --git a/docs/zh-CN/model_zoo/recognition/pp-timesformer.md b/docs/zh-CN/model_zoo/recognition/pp-timesformer.md
new file mode 100644
index 0000000000000000000000000000000000000000..0cb3cf25cda66526d38d22a51d29ae7cbdd756b9
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/pp-timesformer.md
@@ -0,0 +1,157 @@
+[English](../../../en/model_zoo/recognition/pp-timesformer.md) | 简体中文
+
+# PP-TimeSformer视频分类模型
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+
+## 模型简介
+
+我们对[TimeSformer模型](./timesformer.md)进行了改进和优化,得到了更高精度的2D实用视频分类模型**PP-TimeSformer**。在不增加参数量和计算量的情况下,在UCF-101、Kinetics-400等数据集上精度显著超过原版,在Kinetics-400数据集上的精度如下表所示。
+
+| Version | Top1 |
+| :------ | :----: |
+| Ours ([swa](#refer-anchor-1)+distill+16frame) | 79.44 |
+| Ours ([swa](#refer-anchor-1)+distill) | 78.87 |
+| Ours ([swa](#refer-anchor-1)) | **78.61** |
+| [mmaction2](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/timesformer#kinetics-400) | 77.92 |
+
+
+## 数据准备
+
+K400数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md)
+
+UCF101数据下载及准备请参考[UCF-101数据准备](../../dataset/ucf101.md)
+
+
+## 模型训练
+
+### Kinetics-400数据集训练
+
+#### 下载并添加预训练模型
+
+1. 下载图像预训练模型[ViT_base_patch16_224_pretrained.pdparams](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams)作为Backbone初始化参数,或通过wget命令下载
+
+ ```bash
+ wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams
+ ```
+
+2. 打开`PaddleVideo/configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml`,将下载好的权重存放路径填写到下方`pretrained:`之后
+
+ ```yaml
+ MODEL:
+ framework: "RecognizerTransformer"
+ backbone:
+ name: "VisionTransformer"
+ pretrained: 将路径填写到此处
+ ```
+
+#### 开始训练
+
+- Kinetics400数据集使用8卡训练,训练方式的启动命令如下:
+
+ ```bash
+ # videos数据格式
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --validate -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml
+ ```
+
+- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下:
+
+ ```bash
+ export FLAGS_conv_workspace_size_limit=800 # MB
+ export FLAGS_cudnn_exhaustive_search=1
+ export FLAGS_cudnn_batchnorm_spatial_persistent=1
+ # videos数据格式
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --amp --validate -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml
+ ```
+
+- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,建议配置文件的命名方式为`模型_数据集名称_文件格式_数据格式_采样方式.yaml`,参数用法请参考[config](../../tutorials/config.md)。
+
+
+## 模型测试
+
+- PP-TimeSformer模型在训练时同步进行验证,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下:
+
+ ```
+ Already save the best model (top1 acc)0.7258
+ ```
+
+- 由于PP-TimeSformer模型测试模式的采样方式是速度稍慢但精度高一些的**UniformCrop**,与训练过程中验证模式采用的**RandomCrop**不同,所以训练日志中记录的验证指标`topk Acc`不代表最终的测试分数,因此在训练完成之后可以用测试模式对最好的模型进行测试获取最终的指标,命令如下:
+
+ ```bash
+ # 8-frames 模型测试命令
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --test -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml -w "output/ppTimeSformer/ppTimeSformer_best.pdparams"
+
+ # 16-frames模型测试命令
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --test \
+ -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \
+ -o MODEL.backbone.num_seg=16 \
+ -o MODEL.runtime_cfg.test.num_seg=16 \
+ -o MODEL.runtime_cfg.test.avg_type='prob' \
+ -o PIPELINE.test.decode.num_seg=16 \
+ -o PIPELINE.test.sample.num_seg=16 \
+ -w "data/ppTimeSformer_k400_16f_distill.pdparams"
+ ```
+
+
+ 当测试配置采用如下参数时,在Kinetics-400的validation数据集上的测试指标如下:
+
+ | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints |
+ | :----------------: | :-------------: | :-----: | :---------: | :---- | :----------------------------------------------------------: |
+ | Vision Transformer | UniformCrop | 8 | 224 | 78.61 | [ppTimeSformer_k400_8f.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_8f.pdparams) |
+ | Vision Transformer | UniformCrop | 8 | 224 | 78.87 | [ppTimeSformer_k400_8f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_8f_distill.pdparams) |
+ | Vision Transformer | UniformCrop | 16 | 224 | 79.44 | [ppTimeSformer_k400_16f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_16f_distill.pdparams) |
+
+
+- 测试时,PP-TimeSformer视频采样策略为使用linspace采样:时序上,从待采样视频序列的第一帧到最后一帧区间内,均匀生成`num_seg`个稀疏采样点(包括端点);空间上,选择长边两端及中间位置(左中右 或 上中下)3个区域采样。1个视频共采样1个clip。
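+
+下面用一小段示意代码说明上述linspace采样的帧下标是如何得到的(仅为帮助理解的示意,假设视频总帧数为250、`num_seg=8`,实际实现请以PaddleVideo源码为准):
+
+```python
+import numpy as np
+
+num_frames, num_seg = 250, 8                       # 假设的总帧数与采样点数
+idx = np.linspace(0, num_frames - 1, num_seg).astype(int)
+print(idx)                                         # [  0  35  71 106 142 177 213 249],首尾帧均被包含
+```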
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \
+ -p data/ppTimeSformer_k400_8f.pdparams \
+ -o inference/ppTimeSformer
+```
+
+上述命令将生成预测所需的模型结构文件`ppTimeSformer.pdmodel`和模型权重文件`ppTimeSformer.pdiparams`。
+
+- 各参数含义可参考[模型推理方法](../../start.md#2-模型推理)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \
+ --model_file inference/ppTimeSformer/ppTimeSformer.pdmodel \
+ --params_file inference/ppTimeSformer/ppTimeSformer.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+输出示例如下:
+
+```
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9997474551200867
+```
+
+可以看到,使用在Kinetics-400上训练好的ppTimeSformer模型对`data/example.avi`进行预测,输出的top1类别id为`5`,置信度为0.99。通过查阅类别id与名称对应表`data/k400/Kinetics-400_label_list.txt`,可知预测类别名称为`archery`。
+
+## 参考论文
+
+- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani
+- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean
+
+- [ImageNet-21K Pretraining for the Masses](https://arxiv.org/pdf/2104.10972v4.pdf), Tal Ridnik, Emanuel Ben-Baruch, Asaf Noy
diff --git a/docs/zh-CN/model_zoo/recognition/pp-tsm.md b/docs/zh-CN/model_zoo/recognition/pp-tsm.md
new file mode 100644
index 0000000000000000000000000000000000000000..fa01b3a0df66d70ab4ae0604c27a5cafc3dc18f4
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/pp-tsm.md
@@ -0,0 +1,181 @@
+[English](../../../en/model_zoo/recognition/pp-tsm.md) | 简体中文
+
+# PP-TSM视频分类模型
+
+---
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+
+## 模型简介
+
+我们对[TSM模型](./tsm.md)进行了改进,提出了高精度2D实用视频分类模型**PP-TSM**。在不增加参数量和计算量的情况下,在UCF-101、Kinetics-400等数据集上精度显著超过原论文,在Kinetics-400数据集上的精度如下表所示。模型优化解析请参考[**PP-TSM模型精度优化Tricks详解**](https://zhuanlan.zhihu.com/p/382134297)。
+
+| Version | Sampling method | Top1 |
+| :------ | :----------: | :----: |
+| Ours (distill) | Dense | **76.16** |
+| Ours | Dense | 75.69 |
+| [mmaction2](https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/tsm/README.md) | Dense | 74.55 |
+| [mit-han-lab](https://github.com/mit-han-lab/temporal-shift-module) | Dense | 74.1 |
+
+| Version | Sampling method | Top1 |
+| :------ | :----------: | :----: |
+| Ours (distill) | Uniform | **75.11** |
+| Ours | Uniform | 74.54 |
+| [mmaction2](https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/tsm/README.md) | Uniform | 71.90 |
+| [mit-han-lab](https://github.com/mit-han-lab/temporal-shift-module) | Uniform | 71.16 |
+
+
+## 数据准备
+
+K400数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md)
+
+UCF101数据下载及准备请参考[UCF-101数据准备](../../dataset/ucf101.md)
+
+
+## 模型训练
+
+### Kinetics-400数据集训练
+
+#### 下载并添加预训练模型
+
+下载图像蒸馏预训练模型[ResNet50_vd_ssld_v2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams)作为Backbone初始化参数,或是通过命令行下载
+
+```bash
+wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams
+```
+
+并将文件路径添加到配置文件中的`MODEL.backbone.pretrained`字段,如下:
+
+```yaml
+MODEL:
+ framework: "Recognizer2D"
+ backbone:
+ name: "ResNetTweaksTSM"
+ pretrained: 将路径填写到此处
+```
+
+- 如果使用ResNet101作为Backbone进行训练,请下载预训练模型[ResNet101_vd_ssld_pretrained.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ResNet101_vd_ssld_pretrained.pdparams).
+
+#### 开始训练
+
+- Kinetics400数据集使用8卡训练,frames格式数据,uniform训练方式的启动命令如下:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml
+```
+
+- Kinetics400数据集使用8卡训练,videos格式数据,uniform训练方式的启动命令如下:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_videos_uniform.yaml
+```
+
+- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下:
+
+```bash
+export FLAGS_conv_workspace_size_limit=800 #MB
+export FLAGS_cudnn_exhaustive_search=1
+export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --amp --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml
+```
+
+- Kinetics400数据集frames格式数据,dense训练方式的启动命令如下:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_dense.yaml
+```
+
+- Kinetics400数据集frames格式数据,dense训练方式,ResNet101作为Backbone的启动命令如下:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_dense_r101.yaml
+```
+
+- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,配置文件命名方式为`模型_数据集_文件格式_数据格式_采样方式.yaml`,参数用法请参考[config](../../tutorials/config.md)。
+
+
+## 模型测试
+
+- 对Uniform采样方式,PP-TSM模型在训练时同步进行测试,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下:
+
+```txt
+Already save the best model (top1 acc)0.7454
+```
+
+- 对dense采样方式,需单独运行测试代码,其启动命令如下:
+
+```bash
+python3 main.py --test -c configs/recognition/pptsm/pptsm_k400_frames_dense.yaml -w output/ppTSM/ppTSM_best.pdparams
+```
+
+- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。
+
+
+Kinetics400数据集测试精度:
+
+| backbone | distill | Sampling method | num_seg | target_size | Top-1 | checkpoints |
+| :------: | :----------: | :----: | :----: | :----: | :----: | :---- |
+| ResNet50 | False | Uniform | 8 | 224 | 74.54 | [ppTSM_k400_uniform.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams) |
+| ResNet50 | False | Dense | 8 | 224 | 75.69 | [ppTSM_k400_dense.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense.pdparams) |
+| ResNet50 | True | Uniform | 8 | 224 | 75.11 | [ppTSM_k400_uniform_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform_distill.pdparams) |
+| ResNet50 | True | Dense | 8 | 224 | 76.16 | [ppTSM_k400_dense_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense_distill.pdparams) |
+| ResNet101 | True | Uniform | 8 | 224 | 76.35 | [ppTSM_k400_uniform_distill_r101.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSM_k400_uniform_distill_r101.pdparams) |
+| ResNet101 | False | Dense | 8 | 224 | 77.15 | [ppTSM_k400_dense_r101.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSM_k400_dense_r101.pdparams) |
+
+- Uniform采样: 时序上,等分成`num_seg`段,每段中间位置采样1帧;空间上,中心位置采样。1个视频共采样1个clip(两种采样方式的帧下标计算示意见下方代码)。
+
+- Dense采样:时序上,先等分成10个片段,每段从起始位置开始,以`64//num_seg`为间隔连续采样`num_seg`帧;空间上,左中,中心,右中3个位置采样。1个视频共采样`10*3=30`个clips。
+
+- distill为`True`表示使用了蒸馏所得的预训练模型。
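+
+下面给出这两种采样方式的帧下标计算示意(仅为帮助理解的简化代码,假设总帧数为300、`num_seg=8`,实际实现请以PaddleVideo源码为准):
+
+```python
+num_frames, num_seg = 300, 8                       # 假设值
+
+# Uniform采样:等分成num_seg段,取每段中间位置的1帧
+seg_len = num_frames // num_seg
+uniform_idx = [i * seg_len + seg_len // 2 for i in range(num_seg)]
+
+# Dense采样:从某个起始帧开始,以64//num_seg为间隔连续采样num_seg帧
+# (实际会在10个等分起点上分别采样,这里只演示起点为0的情况)
+start, interval = 0, 64 // num_seg
+dense_idx = [min(start + i * interval, num_frames - 1) for i in range(num_seg)]
+
+print("uniform:", uniform_idx)                     # [18, 55, 92, 129, 166, 203, 240, 277]
+print("dense  :", dense_idx)                       # [0, 8, 16, 24, 32, 40, 48, 56]
+```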
+
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \
+ -p data/ppTSM_k400_uniform.pdparams \
+ -o inference/ppTSM
+```
+
+上述命令将生成预测所需的模型结构文件`ppTSM.pdmodel`和模型权重文件`ppTSM.pdiparams`。
+
+- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \
+ --model_file inference/ppTSM/ppTSM.pdmodel \
+ --params_file inference/ppTSM/ppTSM.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+
+输出示例如下:
+
+```
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9907386302947998
+```
+
+
+可以看到,使用在Kinetics-400上训练好的PP-TSM模型对`data/example.avi`进行预测,输出的top1类别id为`5`,置信度为0.99。通过查阅类别id与名称对应表`data/k400/Kinetics-400_label_list.txt`,可知预测类别名称为`archery`。
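+
+若想把类别id批量映射为名称,可以参考下面的示意代码(这里假设标签文件每行的格式为“类别id 类别名称”,实际格式请以文件内容为准):
+
+```python
+# 示意:读取Kinetics-400标签文件,把类别id转换为类别名称
+labels = {}
+with open("data/k400/Kinetics-400_label_list.txt", "r", encoding="utf-8") as f:
+    for line in f:
+        idx, name = line.strip().split(maxsplit=1)   # 假设每行形如 "5 archery"
+        labels[int(idx)] = name
+
+print(labels.get(5))   # 预期输出: archery
+```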
+
+
+## 参考论文
+
+- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han
+- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean
diff --git a/docs/zh-CN/model_zoo/recognition/pp-tsn.md b/docs/zh-CN/model_zoo/recognition/pp-tsn.md
new file mode 100644
index 0000000000000000000000000000000000000000..3229fdb0dc462df9a222e87f445ad1f5a98c9ef5
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/pp-tsn.md
@@ -0,0 +1,148 @@
+[English](../../../en/model_zoo/recognition/pp-tsn.md) | 简体中文
+
+# PP-TSN视频分类模型
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+
+## 模型简介
+
+我们对[TSN模型](./tsn.md)进行了改进,得到了更高精度的2D实用视频分类模型**PP-TSN**。在不增加参数量和计算量的情况下,在UCF-101、Kinetics-400等数据集上精度显著超过原版,在Kinetics-400数据集上的精度如下表所示。
+
+| Version | Top1 |
+| :------ | :----: |
+| Ours (distill) | 75.06 |
+| Ours | **73.68** |
+| [mmaction2](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn#kinetics-400) | 71.80 |
+
+
+## 数据准备
+
+K400数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md)
+
+UCF101数据下载及准备请参考[UCF-101数据准备](../../dataset/ucf101.md)
+
+
+## 模型训练
+
+### Kinetics-400数据集训练
+
+#### 下载并添加预训练模型
+
+1. 下载图像蒸馏预训练模型[ResNet50_vd_ssld_v2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams)作为Backbone初始化参数,或通过wget命令下载
+
+ ```bash
+ wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams
+ ```
+
+2. 打开`PaddleVideo/configs/recognition/pptsn/pptsn_k400_frames.yaml`,将下载好的权重存放路径填写到下方`pretrained:`之后
+
+ ```yaml
+ MODEL:
+ framework: "Recognizer2D"
+ backbone:
+ name: "ResNetTweaksTSN"
+ pretrained: 将路径填写到此处
+ ```
+
+#### 开始训练
+
+- Kinetics400数据集使用8卡训练,训练方式的启动命令如下:
+
+ ```bash
+ # frames数据格式
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --validate -c configs/recognition/pptsn/pptsn_k400_frames.yaml
+
+ # videos数据格式
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --validate -c configs/recognition/pptsn/pptsn_k400_videos.yaml
+ ```
+
+- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下:
+
+ ```bash
+ export FLAGS_conv_workspace_size_limit=800 # MB
+ export FLAGS_cudnn_exhaustive_search=1
+ export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+ # frames数据格式
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --amp --validate -c configs/recognition/pptsn/pptsn_k400_frames.yaml
+
+ # videos数据格式
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --amp --validate -c configs/recognition/pptsn/pptsn_k400_videos.yaml
+ ```
+
+- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,建议配置文件的命名方式为`模型_数据集名称_文件格式_数据格式_采样方式.yaml`,参数用法请参考[config](../../tutorials/config.md)。
+
+
+## 模型测试
+
+- PP-TSN模型在训练时同步进行验证,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下:
+
+ ```
+ Already save the best model (top1 acc)0.7004
+ ```
+
+- 由于PP-TSN模型测试模式的采样方式是速度稍慢但精度高一些的**TenCrop**,与训练过程中验证模式采用的**CenterCrop**不同,所以训练日志中记录的验证指标`topk Acc`不代表最终的测试分数,因此在训练完成之后可以用测试模式对最好的模型进行测试获取最终的指标,命令如下:
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --test -c configs/recognition/pptsn/pptsn_k400_frames.yaml -w "output/ppTSN/ppTSN_best.pdparams"
+ ```
+
+
+ 当测试配置采用如下参数时,在Kinetics-400的validation数据集上的测试指标如下:
+
+
+ | backbone | Sampling method | distill | num_seg | target_size | Top-1 | checkpoints |
+ | :------: | :----------: | :----: | :----: | :----: | :---- | :---: |
+ | ResNet50 | TenCrop | False | 3 | 224 | 73.68 | [ppTSN_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams) |
+ | ResNet50 | TenCrop | True | 8 | 224 | 75.06 | [ppTSN_k400_8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400_8.pdparams) |
+
+- PP-TSN视频采样策略为TenCrop采样:时序上,将待输入视频均匀分成`num_seg`段区间,每段的中间位置采样1帧;空间上,从左上角、右上角、中心点、左下角、右下角5个子区域各采样224x224的区域,并加上水平翻转,一共得到10个采样结果。1个视频共采样1个clip(裁剪位置的计算示意见下方代码)。
+
+- distill为`True`表示使用了蒸馏所得的预训练模型,具体蒸馏方案参考[PP-TSM蒸馏方案](https://zhuanlan.zhihu.com/p/382134297)。
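+
+TenCrop中5个裁剪位置的计算方式可以用下面的简化代码来说明(仅为示意,实际实现请以PaddleVideo中的TenCrop算子为准):
+
+```python
+# 示意:给定缩放后的图像尺寸,计算5个224x224裁剪区域的左上角坐标
+def five_crop_offsets(img_h, img_w, crop_size=224):
+    h, w = img_h - crop_size, img_w - crop_size
+    return [
+        (0, 0),            # 左上角
+        (0, w),            # 右上角
+        (h // 2, w // 2),  # 中心
+        (h, 0),            # 左下角
+        (h, w),            # 右下角
+    ]
+
+# 5个位置 ×(原图 + 水平翻转)= 10个采样结果
+print(five_crop_offsets(256, 340))
+```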
+
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_frames.yaml -p data/ppTSN_k400.pdparams -o inference/ppTSN
+```
+
+上述命令将生成预测所需的模型结构文件`ppTSN.pdmodel`、模型权重文件`ppTSN.pdiparams`以及`ppTSN.pdiparams.info`文件,均存放在`inference/ppTSN/`目录下。
+
+上述bash命令中各个参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/pptsn/pptsn_k400_frames.yaml \
+ --model_file inference/ppTSN/ppTSN.pdmodel \
+ --params_file inference/ppTSN/ppTSN.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+输出示例如下:
+
+```bash
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.998979389667511
+```
+
+可以看到,使用在Kinetics-400上训练好的PP-TSN模型对`data/example.avi`进行预测,输出的top1类别id为`5`,置信度为0.99。通过查阅类别id与名称对应表`data/k400/Kinetics-400_label_list.txt`,可知预测类别名称为`archery`。
+
+## 参考论文
+
+- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/pdf/1608.00859.pdf), Limin Wang, Yuanjun Xiong, Zhe Wang
+- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean
diff --git a/docs/zh-CN/model_zoo/recognition/slowfast.md b/docs/zh-CN/model_zoo/recognition/slowfast.md
new file mode 100644
index 0000000000000000000000000000000000000000..030aaab4b27ccaf9f8546406a494a33a6c1a0947
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/slowfast.md
@@ -0,0 +1,140 @@
+简体中文 | [English](../../../en/model_zoo/recognition/slowfast.md)
+
+# SlowFast视频分类模型
+
+---
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+
+## 模型简介
+
+SlowFast是视频分类领域的高精度模型,使用slow和fast两个分支。slow分支以稀疏采样得到的帧作为输入,捕捉视频中的表观信息。fast分支以高频采样得到的帧作为输入,捕获视频中的运动信息,最终将两个分支的特征拼接得到预测结果。
+
+
+
+SlowFast Overview
+
+
+详细内容请参考ICCV 2019论文[SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982)
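+
+以论文中4x16、α=8的典型配置为例,下面的示意代码给出了slow、fast两个分支各自采样的帧下标,可以直观看出两条通路采样速率的差异(参数数值仅为示意,实际请以配置文件为准):
+
+```python
+clip_len = 64            # 假设一个clip包含64帧
+tau, alpha = 16, 8       # slow分支的采样步长,以及fast/slow两条通路的帧率比
+
+slow_idx = list(range(0, clip_len, tau))           # 4帧:  [0, 16, 32, 48]
+fast_idx = list(range(0, clip_len, tau // alpha))  # 32帧: [0, 2, 4, ..., 62]
+
+print(len(slow_idx), slow_idx)
+print(len(fast_idx), fast_idx[:8], "...")
+```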
+
+
+## 数据准备
+
+SlowFast模型的训练数据采用Kinetics400数据集,数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md)
+
+
+## 模型训练
+
+数据准备完成后,可通过如下方式启动训练:
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+python -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_slowfast main.py --validate -c configs/recognition/slowfast/slowfast.yaml
+```
+
+- 从头开始训练,使用上述启动命令行或者脚本程序即可启动训练,不需要用到预训练模型。
+
+- 建议使用多卡训练方式,单卡由于batch\_size减小,精度可能会有损失。
+
+
+### 训练资源要求
+
+* 8卡V100,总batch\_size=64,单卡batch\_size=8,单卡显存占用约9G。
+* 训练速度相较原始实现提速100%,详细参考[benchmark](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/benchmark.md#实验结果)
+
+### 训练加速
+
+SlowFast为3D模型,训练异常耗时,为进一步加速模型的训练,我们实现了[Multigrid加速策略算法](https://arxiv.org/abs/1912.00998),其训练启动方式如下:
+
+```bash
+python -B -m paddle.distributed.launch --selected_gpus="0,1,2,3,4,5,6,7" --log_dir=log-slowfast main.py --validate --multigrid -c configs/recognition/slowfast/slowfast_multigrid.yaml
+```
+
+性能数据如下:
+
+| 训练策略 | 单个epoch平均耗时/min | 训练总时间/min | 加速比 |
+| :------ | :-----: | :------: |:------: |
+| Multigrid | 27.25 | 9758(6.7天) | 2.89x |
+| Normal | 78.76 | 15438(10.7天) | base |
+
+速度详细数据说明可参考[加速文档](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/tutorials/accelerate.md#%E8%AE%AD%E7%BB%83%E7%AD%96%E7%95%A5%E5%8A%A0%E9%80%9F)。
+
+## 模型测试
+
+可通过如下命令进行模型测试:
+
+```bash
+python -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_slowfast_test main.py --test -c configs/recognition/slowfast/slowfast.yaml -w output/SlowFast/SlowFast_epoch_000196.pdparams
+```
+
+- 通过 `-w`参数指定待测试模型文件的路径,您可以下载我们训练好的模型进行测试[SlowFast.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams)
+
+- 使用`multi_crop`的方式进行评估,因此评估有一定耗时,建议使用多卡评估,加快评估速度。若使用默认方式进行多卡评估,耗时约4小时。
+
+- 模型最终的评估精度会打印在日志文件中。
+
+若使用单卡评估,启动方式如下:
+
+```bash
+python -B main.py --test -c configs/recognition/slowfast/slowfast.yaml -w output/SlowFast/SlowFast_epoch_000196.pdparams
+```
+
+
+在Kinetics400数据集下评估精度及权重文件如下:
+
+| Configs | Acc1 | Acc5 | Weights |
+| :---: | :---: | :---: | :---: |
+| [slowfast.yaml](../../../../configs/recognition/slowfast/slowfast.yaml) | 74.35 | 91.33 | [slowfast_4x16.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams) |
+| [slowfast_multigrid.yaml](../../../../configs/recognition/slowfast/slowfast_multigrid.yaml) | 75.84 | 92.33 | [slowfast_8x8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) |
+
+- 由于Kinetics400数据集部分源文件已缺失,无法下载,我们使用的数据集比官方数据少~5%,因此精度相比于论文公布的结果有一定损失。相同数据下,精度已与原实现对齐。
+
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/slowfast/slowfast.yaml \
+ -p data/SlowFast.pdparams \
+ -o inference/SlowFast
+```
+
+上述命令将生成预测所需的模型结构文件`SlowFast.pdmodel`和模型权重文件`SlowFast.pdiparams`。
+
+- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/slowfast/slowfast.yaml \
+ --model_file inference/SlowFast/SlowFast.pdmodel \
+ --params_file inference/SlowFast/SlowFast.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+输出示例如下:
+
+```
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 1.0
+```
+
+可以看到,使用在Kinetics-400上训练好的SlowFast模型对`data/example.avi`进行预测,输出的top1类别id为`5`,置信度为1.0。通过查阅类别id与名称对应表`data/k400/Kinetics-400_label_list.txt`,可知预测类别名称为`archery`。
+
+
+## 参考论文
+
+- [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982), Feichtenhofer C, Fan H, Malik J, et al.
+- [A Multigrid Method for Efficiently Training Video Models](https://arxiv.org/abs/1912.00998), Chao-Yuan Wu, Ross Girshick, et al.
diff --git a/docs/zh-CN/model_zoo/recognition/stgcn.md b/docs/zh-CN/model_zoo/recognition/stgcn.md
new file mode 100644
index 0000000000000000000000000000000000000000..bd8fd884f8b16a3969c48d05fbda6d6c468d1eb9
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/stgcn.md
@@ -0,0 +1,136 @@
+[English](../../../en/model_zoo/recognition/stgcn.md) | 简体中文
+
+# ST-GCN基于骨骼的行为识别模型
+
+---
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+
+## 模型简介
+
+ST-GCN是AAAI 2018提出的经典的基于骨骼的行为识别模型,通过将图卷积应用在具有拓扑结构的人体骨骼数据上,使用时空图卷积提取时空特征进行行为识别,极大地提升了基于骨骼的行为识别任务精度。
+
+我们提供了详尽理论及代码讲解,并可使用免费在线GPU算力资源,一键运行的AI Studio Notebook项目, 使用链接:[基于飞桨实现花样滑冰选手骨骼点动作识别大赛baseline](https://aistudio.baidu.com/aistudio/projectdetail/2417717?contributionType=1)
+
+
+
+
+
+
+## 数据准备
+
+花样滑冰比赛数据下载及准备请参考[花样滑冰数据准备](../../dataset/fsd.md)
+
+NTU-RGBD数据下载及准备请参考[NTU-RGBD数据准备](../../dataset/ntu-rgbd.md)
+
+
+## 模型训练
+
+### 花样滑冰数据集训练
+
+- 花样滑冰数据集使用单卡训练,启动命令如下:
+
+```bash
+python3.7 main.py -c configs/recognition/stgcn/stgcn_fsd.yaml
+```
+
+- 由于赛事未提供验证集数据,因此训练时不做valid。
+
+- 您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,参数用法请参考[config](../../tutorials/config.md)。
+
+
+### NTU-RGBD数据集训练
+
+- NTU-RGBD数据集使用4卡训练,启动命令如下:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_stgcn main.py --validate -c configs/recognition/stgcn/stgcn_ntucs.yaml
+```
+
+- 配置文件`stgcn_ntucs.yaml`为NTU-RGB+D数据集按cross-subject划分方式对应的训练配置。
+
+
+## 模型测试
+
+### 花样滑冰数据集模型测试
+
+- 模型测试的启动命令如下:
+
+```bash
+python3.7 main.py --test -c configs/recognition/stgcn/stgcn_fsd.yaml -w output/STGCN/STGCN_epoch_00090.pdparams
+```
+
+- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。
+
+- 评估结果保存在submission.csv文件中,可在[评测官网](https://aistudio.baidu.com/aistudio/competition/detail/115)提交查看得分。
+
+模型在花样滑冰数据集上baseline实验精度如下:
+
+| Test_Data | Top-1 | checkpoints |
+| :----: | :----: | :---- |
+| Test_A | 59.07 | [STGCN_fsd.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_fsd.pdparams) |
+
+
+### NTU-RGB+D数据集模型测试
+
+- 模型测试的启动命令如下:
+
+```bash
+python3.7 main.py --test -c configs/recognition/stgcn/stgcn_ntucs.yaml -w output/STGCN/STGCN_best.pdparams
+```
+
+- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。
+
+模型在NTU-RGB+D数据集上实验精度如下:
+
+| split | Top-1 | checkpoints |
+| :----: | :----: | :---- |
+| cross-subject | 82.28 | [STGCN_ntucs.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_ntucs.pdparams) |
+
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/stgcn/stgcn_fsd.yaml \
+ -p data/STGCN_fsd.pdparams \
+ -o inference/STGCN
+```
+
+上述命令将生成预测所需的模型结构文件`STGCN.pdmodel`和模型权重文件`STGCN.pdiparams`。
+
+- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/fsd10/example_skeleton.npy \
+ --config configs/recognition/stgcn/stgcn_fsd.yaml \
+ --model_file inference/STGCN/STGCN.pdmodel \
+ --params_file inference/STGCN/STGCN.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+输出示例如下:
+
+```
+Current video file: data/fsd10/example_skeleton.npy
+ top-1 class: 27
+ top-1 score: 0.9912770986557007
+```
+
+可以看到,使用在花样滑冰数据集上训练好的ST-GCN模型对`data/fsd10/example_skeleton.npy`进行预测,输出的top1类别id为`27`,置信度为0.9912770986557007。
+
+
+## 参考论文
+
+- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin
diff --git a/docs/zh-CN/model_zoo/recognition/timesformer.md b/docs/zh-CN/model_zoo/recognition/timesformer.md
new file mode 100644
index 0000000000000000000000000000000000000000..ae30f957493e36f48be82d264e41b31ff255b11c
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/timesformer.md
@@ -0,0 +1,136 @@
+[English](../../../en/model_zoo/recognition/timesformer.md) | 简体中文
+
+# TimeSformer视频分类模型
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+
+## 模型简介
+
+TimeSformer是基于vision transformer的视频分类模型,具有无卷积、全局感受野、时间序列建模能力强的特点。目前在Kinetics-400数据集上达到了SOTA精度,超过了TSN、TSM以及SlowFast等经典的基于CNN的视频分类模型,并且训练用时更短(Kinetics-400数据集上训练用时约39小时)。**本代码实现的是论文中的时间-空间分离的注意力级联网络**。
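+
+下面用一段极简的numpy代码示意"时间-空间分离注意力"的计算顺序:先在相同空间位置、不同帧的token之间做时间注意力,再在同一帧内的各个patch之间做空间注意力(省略了多头、class token与残差连接等细节,仅帮助理解,并非实际实现):
+
+```python
+import numpy as np
+
+def attention(x):
+    """最简单的单头自注意力,x形状为(批大小, 序列长度, 通道数)"""
+    d = x.shape[-1]
+    scores = x @ x.transpose(0, 2, 1) / np.sqrt(d)
+    weights = np.exp(scores - scores.max(-1, keepdims=True))
+    weights = weights / weights.sum(-1, keepdims=True)
+    return weights @ x
+
+T, N, D = 8, 196, 64                  # 帧数、每帧patch数、通道数(假设值)
+x = np.random.rand(T, N, D).astype("float32")
+
+# 时间注意力:把同一空间位置、不同帧的token作为一个序列
+xt = attention(x.transpose(1, 0, 2)).transpose(1, 0, 2)   # (T, N, D)
+
+# 空间注意力:级联在时间注意力之后,对同一帧内的N个patch做注意力
+out = attention(xt)                                        # (T, N, D)
+print(out.shape)
+```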
+
+
+
+
+
+
+## 数据准备
+
+K400数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md)
+
+UCF101数据下载及准备请参考[UCF-101数据准备](../../dataset/ucf101.md)
+
+
+## 模型训练
+
+### Kinetics-400数据集训练
+
+#### 下载并添加预训练模型
+
+1. 下载图像预训练模型[ViT_base_patch16_224](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams)作为Backbone初始化参数,或通过wget命令下载
+
+ ```bash
+ wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams
+ ```
+
+2. 打开`PaddleVideo/configs/recognition/timesformer/timesformer_k400_videos.yaml`,将下载好的权重存放路径填写到下方`pretrained:`之后
+
+ ```yaml
+ MODEL:
+ framework: "RecognizerTransformer"
+ backbone:
+ name: "VisionTransformer"
+ pretrained: 将路径填写到此处
+ ```
+
+#### 开始训练
+
+- Kinetics400数据集使用8卡训练,训练方式的启动命令如下:
+
+ ```bash
+ # videos数据格式
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --validate -c configs/recognition/timesformer/timesformer_k400_videos.yaml
+ ```
+
+- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下:
+
+ ```bash
+ export FLAGS_conv_workspace_size_limit=800 # MB
+ export FLAGS_cudnn_exhaustive_search=1
+ export FLAGS_cudnn_batchnorm_spatial_persistent=1
+ # videos数据格式
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --amp --validate -c configs/recognition/timesformer/timesformer_k400_videos.yaml
+ ```
+
+- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,建议配置文件的命名方式为`模型_数据集名称_文件格式_数据格式_采样方式.yaml`,参数用法请参考[config](../../tutorials/config.md)。
+
+
+## 模型测试
+
+- TimeSformer模型在训练时同步进行验证,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下:
+
+ ```
+ Already save the best model (top1 acc)0.7258
+ ```
+
+- 由于TimeSformer模型测试模式的采样方式是速度稍慢但精度高一些的**UniformCrop**,与训练过程中验证模式采用的**RandomCrop**不同,所以训练日志中记录的验证指标`topk Acc`不代表最终的测试分数,因此在训练完成之后可以用测试模式对最好的模型进行测试获取最终的指标,命令如下:
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --test -c configs/recognition/timesformer/timesformer_k400_videos.yaml -w "output/TimeSformer/TimeSformer_best.pdparams"
+ ```
+
+
+ 当测试配置采用如下参数时,在Kinetics-400的validation数据集上的测试指标如下:
+
+ | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints |
+ | :----------------: | :-------------: | :-----: | :---------: | :---- | :----------------------------------------------------------: |
+ | Vision Transformer | UniformCrop | 8 | 224 | 77.29 | [TimeSformer_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TimeSformer_k400.pdparams) |
+
+
+- 测试时,TimeSformer视频采样策略为使用Linspace采样:时序上,从待采样视频序列的第一帧到最后一帧区间内,均匀生成`num_seg`个稀疏采样点(包括端点);空间上,选择长边两端及中间位置(左中右 或 上中下)3个区域采样。1个视频共采样1个clip。
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/timesformer/timesformer_k400_videos.yaml \
+ -p data/TimeSformer_k400.pdparams \
+ -o inference/TimeSformer
+```
+
+上述命令将生成预测所需的模型结构文件`TimeSformer.pdmodel`和模型权重文件`TimeSformer.pdiparams`。
+
+- 各参数含义可参考[模型推理方法](../../start.md#2-模型推理)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/timesformer/timesformer_k400_videos.yaml \
+ --model_file inference/TimeSformer/TimeSformer.pdmodel \
+ --params_file inference/TimeSformer/TimeSformer.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+输出示例如下:
+
+```
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9997474551200867
+```
+
+可以看到,使用在Kinetics-400上训练好的TimeSformer模型对`data/example.avi`进行预测,输出的top1类别id为`5`,置信度为0.99。通过查阅类别id与名称对应表`data/k400/Kinetics-400_label_list.txt`,可知预测类别名称为`archery`。
+
+## 参考论文
+
+- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani
diff --git a/docs/zh-CN/model_zoo/recognition/tsm.md b/docs/zh-CN/model_zoo/recognition/tsm.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3591040a2bb3e6e4ceb76a4fa69c038a0b36306
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/tsm.md
@@ -0,0 +1,231 @@
+[English](../../../en/model_zoo/recognition/tsm.md) | 简体中文
+
+# TSM视频分类模型
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [实现细节](#实现细节)
+- [参考论文](#参考论文)
+
+## 模型简介
+
+Temporal Shift Module (TSM) 是当前比较受关注的视频分类模型,通过通道移动的方法在不增加任何额外参数量和计算量的情况下,极大地提升了模型对于视频时间信息的利用能力,并且由于其具有轻量高效的特点,十分适合工业落地。
+
+我们提供了详尽理论及代码讲解,并可使用免费在线GPU算力资源,一键运行的AI Studio Notebook项目,
+使用链接:[Paddle2.1实现视频理解经典模型-TSM](https://aistudio.baidu.com/aistudio/projectdetail/2310889?contributionType=1)
+
+
+
+
+
+
+
+本代码实现的模型为**基于单路RGB图像**的TSM网络,Backbone采用ResNet-50结构。
+
+详细内容请参考ICCV 2019年论文 [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf)
+
+## 数据准备
+
+Kinetics400数据下载及准备请参考[k400数据准备](../../dataset/k400.md)
+
+UCF101数据下载及准备请参考[ucf101数据准备](../../dataset/ucf101.md)
+
+## 模型训练
+
+### Kinetics-400数据集训练
+
+#### 下载并添加预训练模型
+
+1. 加载在ImageNet1000上训练好的ResNet50权重作为Backbone初始化参数[ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams),也可以通过命令行下载
+
+ ```bash
+ wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams
+ ```
+
+2. 打开`PaddleVideo/configs/recognition/tsm/tsm_k400_frames.yaml`,将下载好的权重路径填写到下方`pretrained:`之后
+
+ ```yaml
+ MODEL:
+ framework: "Recognizer2D"
+ backbone:
+ name: "ResNetTSM"
+ pretrained: 将路径填写到此处
+ ```
+
+#### 开始训练
+
+- Kinetics400数据集使用8卡训练,frames格式数据的训练启动命令如下:
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_frames.yaml
+ ```
+
+- Kinetics400数据集使用8卡训练,videos格式数据的训练启动命令如下:
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_videos.yaml
+ ```
+
+- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下:
+
+ ```bash
+ export FLAGS_conv_workspace_size_limit=800 #MB
+ export FLAGS_cudnn_exhaustive_search=1
+ export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames.yaml
+ ```
+
+- 使用amp混合精度训练时,配合`nhwc`的数据格式有更好的加速效果,其训练启动方式如下:
+
+ ```bash
+ export FLAGS_conv_workspace_size_limit=800 #MB
+ export FLAGS_cudnn_exhaustive_search=1
+ export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames_nhwc.yaml
+ ```
+
+- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,配置文件命名方式为`模型_数据集_文件格式_数据格式.yaml`,具体参数用法请参考[config](../../tutorials/config.md)。
+
+
+
+### UCF-101数据集训练
+
+#### 下载并添加预训练模型
+
+1. 加载在Kinetics-400上训练好的权重作为Backbone初始化参数[TSM_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams),也可以通过命令行下载
+
+ ```bash
+ wget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams
+ ```
+
+2. 打开`PaddleVideo/configs/recognition/tsm/tsm_ucf101_frames.yaml`,将下载好的权重路径填写到下方`pretrained:`之后
+
+ ```yaml
+ MODEL:
+ framework: "Recognizer2D"
+ backbone:
+ name: "ResNetTSM"
+ pretrained: 将路径填写到此处
+ ```
+
+#### 开始训练
+
+- UCF-101数据集使用4卡训练,frames格式数据的训练启动命令如下:
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_ucf101_frames.yaml
+ ```
+
+- UCF-101数据集使用4卡训练,videos格式数据的训练启动命令如下:
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_ucf101_videos.yaml
+ ```
+
+- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下:
+
+ ```bash
+ export FLAGS_conv_workspace_size_limit=800 #MB
+ export FLAGS_cudnn_exhaustive_search=1
+ export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_ucf101_frames.yaml
+ ```
+
+- 使用amp混合精度训练时,配合`nhwc`的数据格式有更好的加速效果,其训练启动方式如下:
+
+ ```bash
+ export FLAGS_conv_workspace_size_limit=800 #MB
+ export FLAGS_cudnn_exhaustive_search=1
+ export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_ucf101_frames_nhwc.yaml
+ ```
+
+
+## 模型测试
+
+- TSM模型在训练时同步进行测试,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下:
+
+```txt
+Already save the best model (top1 acc)0.7106
+```
+
+- 若需单独运行测试代码,其启动命令如下:
+
+```bash
+python3.7 main.py --test -c configs/recognition/tsm/tsm_k400_frames.yaml -w output/TSM/TSM_best.pdparams
+```
+- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。
+
+---
+
+当测试配置采用如下参数时,在Kinetics-400的validation数据集上的评估精度如下:
+
+| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints |
+| :--------: | :---------------: | :-------: | :-----------: | :-----: | :-----------: | :-----------: |
+| ResNet50 | Uniform | NCHW | 8 | 224 | 71.06 | [TSM_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams) |
+
+当测试配置采用如下参数时,在UCF-101的validation数据集上的评估精度如下:
+
+| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints |
+| :------: | :-------------: | :-----------------: | :-----: | :---------: | :---: | :---------: |
+| ResNet50 | Uniform | NCHW | 8 | 224 | 94.42 | [TSM_ucf101_nchw.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_nchw.pdparams) |
+| ResNet50 | Uniform | NCHW+AMP | 8 | 224 | 94.40 | [TSM_ucf101_amp_nchw.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_amp_nchw.pdparams) |
+| ResNet50 | Uniform | NHWC+AMP | 8 | 224 | 94.55 | [TSM_ucf101_amp_nhwc.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_amp_nhwc.pdparams) |
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/tsm/tsm_k400_frames.yaml \
+ -p data/TSM_k400.pdparams \
+ -o inference/TSM
+```
+
+上述命令将生成预测所需的模型结构文件`TSM.pdmodel`和模型权重文件`TSM.pdiparams`。
+
+各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/tsm/tsm_k400_frames.yaml \
+ --model_file inference/TSM/TSM.pdmodel \
+ --params_file inference/TSM/TSM.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
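+
+除了使用`tools/predict.py`,也可以直接调用Paddle Inference的Python API加载导出的模型进行预测,下面是一个简化示意(输入张量形状此处假设为`[1, 8, 3, 224, 224]`,且用随机数代替了真实的视频解码与预处理,实际流程请以配置文件和`tools/predict.py`为准):
+
+```python
+import numpy as np
+from paddle.inference import Config, create_predictor
+
+# 加载导出的推理模型
+config = Config("inference/TSM/TSM.pdmodel", "inference/TSM/TSM.pdiparams")
+config.enable_use_gpu(8000, 0)        # 使用GPU;如需纯CPU预测可去掉这一行
+predictor = create_predictor(config)
+
+# 构造一个假设形状的输入(实际应为预处理后的视频帧数据)
+fake_input = np.random.rand(1, 8, 3, 224, 224).astype("float32")
+input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
+input_handle.copy_from_cpu(fake_input)
+
+predictor.run()
+
+output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
+scores = output_handle.copy_to_cpu()
+print("top-1 class:", scores.argmax(), "top-1 score:", scores.max())
+```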
+
+## 实现细节
+
+**数据处理**
+
+- 模型读取Kinetics-400数据集中的`mp4`数据,首先将每条视频数据划分成`num_seg`段,然后均匀地从每段中抽取1帧图像,得到稀疏采样的`num_seg`张视频帧,再对这`num_seg`帧图像做同样的随机数据增强,包括多尺度的随机裁剪、随机左右翻转、数据归一化等,最后缩放至`target_size`。
+
+**训练策略**
+
+- 采用Momentum优化算法训练,momentum=0.9
+- 采用L2_Decay,权重衰减系数为1e-4
+- 采用全局梯度裁剪,裁剪系数为20.0
+- 总epoch数为50,学习率在epoch达到20、40进行0.1倍的衰减
+- FC层的权重与偏置的学习率分别为整体学习率的5倍、10倍,且偏置不设置L2_Decay(设置方式示意见下方代码)
+- Dropout_ratio=0.5
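+
+上面的学习率衰减与FC层分层学习率,可以用如下简化代码示意其设置方式(学习率等数值仅为举例,实际超参数请以对应的yaml配置文件为准):
+
+```python
+import paddle
+
+# 分段衰减:按epoch调用lr.step(),在第20、40个epoch时学习率乘以0.1(数值仅为示意)
+lr = paddle.optimizer.lr.PiecewiseDecay(boundaries=[20, 40],
+                                        values=[0.01, 0.001, 0.0001])
+
+# FC层通过ParamAttr把权重/偏置学习率放大5倍/10倍,且偏置不做L2正则
+fc = paddle.nn.Linear(
+    2048, 400,
+    weight_attr=paddle.ParamAttr(learning_rate=5.0),
+    bias_attr=paddle.ParamAttr(learning_rate=10.0,
+                               regularizer=paddle.regularizer.L2Decay(0.0)))
+
+optimizer = paddle.optimizer.Momentum(
+    learning_rate=lr,
+    momentum=0.9,
+    parameters=fc.parameters(),
+    weight_decay=paddle.regularizer.L2Decay(1e-4),
+    grad_clip=paddle.nn.ClipGradByGlobalNorm(20.0))
+```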
+
+**参数初始化**
+
+- 以Normal(mean=0, std=0.001)的正态分布来初始化FC层的权重,以常数0来初始化FC层的偏置
+
+## 参考论文
+
+- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han
+
diff --git a/docs/zh-CN/model_zoo/recognition/tsn.md b/docs/zh-CN/model_zoo/recognition/tsn.md
new file mode 100644
index 0000000000000000000000000000000000000000..6564e26af5adc1aee47873d3bc23a24a06ae0902
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/tsn.md
@@ -0,0 +1,120 @@
+简体中文 | [English](../../../en/model_zoo/recognition/tsn.md)
+
+# TSN
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [实现细节](#实现细节)
+- [参考论文](#参考论文)
+
+## 模型简介
+
+Temporal Segment Network (TSN) 是视频分类领域经典的基于2D-CNN的解决方案。该方法主要解决视频的长时间行为识别问题,通过稀疏采样视频帧的方式代替稠密采样,既能捕获视频的全局信息,也能去除冗余,降低计算量。核心思想是将每帧的特征做平均融合作为视频的整体特征,再输入分类器进行分类。本代码实现的模型为**基于单路RGB图像**的TSN网络,Backbone采用ResNet-50结构。
+
+我们提供了详尽理论及代码讲解,并可使用免费在线GPU算力资源,一键运行的AI Studio Notebook项目,使用链接:[Paddle 2.1实现视频理解经典模型-TSN](https://aistudio.baidu.com/aistudio/projectdetail/2250682?contributionType=1)
+
+
+
+
+
+详细内容请参考ECCV 2016年的论文[Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859)
+
+## 数据准备
+
+PaddleVideo提供了在Kinetics-400数据集上训练和测试的脚本。Kinetics-400数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md)
+
+## 模型训练
+
+### Kinetics-400数据集训练
+
+#### 下载并添加预训练模型
+
+1. 加载在ImageNet1000上训练好的ResNet50权重作为Backbone初始化参数[ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams),也可以通过命令行下载
+
+ ```bash
+ wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams
+ ```
+
+2. 打开`PaddleVideo/configs/recognition/tsn/tsn_k400_frames.yaml`,将下载好的权重路径填写到下方`pretrained:`之后
+
+ ```yaml
+ MODEL:
+ framework: "Recognizer2D"
+ backbone:
+ name: "ResNet"
+ pretrained: 将路径填写到此处
+ ```
+
+#### 开始训练
+
+- Kinetics-400数据集使用8卡训练,frames格式数据的训练启动命令如下
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsn main.py --validate -c configs/recognition/tsn/tsn_k400_frames.yaml
+ ```
+
+## 模型测试
+
+由于TSN模型测试模式的采样方式是速度稍慢但精度高一些的**TenCrop**,与训练过程中验证模式采用的**CenterCrop**不同,所以训练日志中记录的验证指标`topk Acc`不代表最终的测试分数,因此在训练完成之后可以用测试模式对最好的模型进行测试获取最终的指标,命令如下:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsn main.py --test -c configs/recognition/tsn/tsn_k400_frames.yaml -w "output/TSN/TSN_best.pdparams"
+```
+
+当测试配置采用如下参数时,在Kinetics-400的validation数据集上的测试指标如下:
+
+| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints |
+| :------: | :-------------: | :---------------: | :-----: | :---------: | :---: | ------------------------------------------------------------ |
+| ResNet50 | TenCrop | NCHW | 3 | 224 | 69.81 | [TSN_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400.pdparams) |
+| ResNet50 | TenCrop | NCHW | 8 | 224 | 71.70 | [TSN_k400_8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400_8.pdparams) |
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/tsn/tsn_k400_frames.yaml \
+ -p data/TSN_k400.pdparams \
+ -o inference/TSN
+```
+
+上述命令将生成预测所需的模型结构文件`TSN.pdmodel`和模型权重文件`TSN.pdiparams`。
+
+各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-模型推理)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/tsn/tsn_k400_frames.yaml \
+ --model_file inference/TSN/TSN.pdmodel \
+ --params_file inference/TSN/TSN.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+## 实现细节
+
+**数据处理:**
+
+- 模型读取Kinetics-400数据集中的`mp4`数据,首先将每条视频数据划分成`num_seg`段,然后均匀地从每段中抽取1帧图像,得到稀疏采样的`num_seg`张视频帧,再对这`num_seg`帧图像做同样的随机数据增强,包括多尺度的随机裁剪、随机左右翻转、数据归一化等,最后缩放至`target_size`
+
+**训练策略:**
+
+- 采用Momentum优化算法训练,momentum=0.9
+- 采用L2_Decay,权重衰减系数为1e-4
+- 采用全局梯度裁剪,裁剪系数为40.0
+- 总epoch数为100,学习率在epoch达到40、80进行0.1倍的衰减
+- Dropout_ratio=0.4
+
+**参数初始化**
+
+- TSN模型的卷积层采用Paddle默认的[KaimingNormal](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/nn/initializer/KaimingNormal_cn.html#kaimingnormal)和[Constant](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/nn/initializer/Constant_cn.html#constant)初始化方法,以Normal(mean=0, std=0.01)的正态分布来初始化FC层的权重,以常数0来初始化FC层的偏置
+
+## 参考论文
+
+- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool
diff --git a/docs/zh-CN/model_zoo/recognition/tsn_dali.md b/docs/zh-CN/model_zoo/recognition/tsn_dali.md
new file mode 100644
index 0000000000000000000000000000000000000000..b9b2d1ed67a22994052fa435d653474501c0b0f2
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/tsn_dali.md
@@ -0,0 +1,111 @@
+[English](../../../en/model_zoo/recognition/tsn_dali.md) | 简体中文
+
+# TSN模型-DALI训练加速
+
+- [方案简介](#方案简介)
+- [环境配置](#环境配置)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+## 方案简介
+训练速度慢是视频模型训练的常见问题。PaddleVideo使用飞桨2.0的dataloader接口进行数据读取,凭借其优异的多进程加速能力,可以显著提升模型的训练速度。TSN是视频领域常用的2D模型,我们对其训练速度做了进一步优化:基于[nvidia DALI](https://github.com/NVIDIA/DALI)的GPU解码能力,我们对nvidia DALI进行了二次开发,实现了均匀分段的帧采样方式,进一步提升了TSN模型的训练速度。
+
+### 性能
+
+测试环境:
+```
+机器: Tesla v100
+显存: 4卡16G
+Cuda: 9.0
+单卡batch_size: 32
+```
+
+训练速度对比如下:
+
+| 加速方式 | batch耗时/s | reader耗时/s | ips:instance/sec | 加速比 |
+| :--------------- | :--------: | :------------: | :------------: | :------------: |
+| DALI | 2.083 | 1.804 | 15.36597 | 1.41x |
+| Dataloader: 单卡num_workers=4 | 2.943 | 2.649 | 10.87460| base |
+| pytorch实现 | TODO | TODO | TODO | TODO |
+
+
+## 环境配置
+
+我们提供docker运行环境方便您使用,基础镜像为:
+
+```
+ huangjun12/paddlevideo:tsn_dali_cuda9_0
+```
+
+基于以上docker镜像创建docker容器,运行命令为:
+
+```bash
+nvidia-docker run --name tsn-DALI -v /home:/workspace --network=host -it --shm-size 64g -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,video huangjun12/paddlevideo:tsn_dali_cuda9_0 /bin/bash
+```
+- docker中安装好了飞桨2.0.0-rc1版本和我们二次开发后的DALI,创建容器后您可以在docker环境中直接开始TSN模型训练,无需额外配置环境。
+
+## 数据准备
+
+PaddleVideo提供了在K400和UCF101两个数据集上训练TSN的脚本。
+
+- K400数据下载及准备请参考[K400数据准备](../../dataset/k400.md)
+
+- UCF101数据下载及准备请参考[UCF101数据准备](../../dataset/ucf101.md)
+
+## 模型训练
+
+### 预训练模型下载
+
+加载在ImageNet1000上训练好的ResNet50权重作为Backbone初始化参数,请下载此[模型参数](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams),
+或是通过命令行下载
+
+```bash
+wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams
+```
+
+并将路径添加到configs中backbone字段下
+
+```yaml
+MODEL:
+ framework: "Recognizer2D"
+ backbone:
+ name: "ResNet"
+ pretrained: 将路径填写到此处
+```
+
+### 开始训练
+
+模型训练的启动命令为:
+
+```bash
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsn main.py --train_dali -c configs/recognition/tsn/tsn_dali.yaml -o log_level="INFO"
+```
+
+- 通过`-c`指定模型训练参数配置文件,模型及训练参数配置请参考配置文件```configs/recognition/tsn/tsn_dali.yaml```。
+
+- 如若进行finetune,请下载PaddleVideo的已发布模型[coming soon](),并通过`--weights`指定权重存放路径进行模型finetune。
+
+- 您可以自定义修改参数配置,参数用法请参考[config](../../tutorials/config.md)。
+
+## 模型测试
+
+模型测试方法请参考TSN模型使用文档[模型测试部分](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/tsn.md#模型测试)
+
+## 模型推理
+
+模型推理方法请参考TSN模型使用文档[模型推理部分](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/tsn.md#模型推理)
+
+## 参考论文
+
+- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool
+
new file mode 100644
index 0000000000000000000000000000000000000000..8d6541c1e7a7fec4d4f7ba6a6df612713d0586d8
--- /dev/null
+++ b/docs/zh-CN/model_zoo/recognition/videoswin.md
@@ -0,0 +1,130 @@
+[English](../../../en/model_zoo/recognition/videoswin.md) | 简体中文
+
+# Video-Swin-Transformer视频分类模型
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+
+## 模型简介
+
+Video-Swin-Transformer是基于Swin Transformer的视频分类模型,其利用了Swin Transformer的多尺度建模和高效局部注意力特性,目前在Kinetics-400数据集上达到了SOTA精度,超过了同为transformer结构的TimeSformer模型。
+
+
+
+
+## 数据准备
+
+K400数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md)
+
+
+## 模型训练
+
+### Kinetics-400数据集训练
+
+#### 下载并添加预训练模型
+
+1. 下载图像预训练模型[SwinTransformer_imagenet.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/SwinTransformer_imagenet.pdparams)作为Backbone初始化参数,或通过wget命令下载
+
+ ```bash
+ wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/SwinTransformer_imagenet.pdparams
+ ```
+
+2. 打开`configs/recognition/videoswin/videoswin_k400_videos.yaml`,将下载好的权重存放路径填写到下方`pretrained:`之后
+
+ ```yaml
+ MODEL:
+ framework: "RecognizerTransformer"
+ backbone:
+ name: "SwinTransformer3D"
+ pretrained: 将路径填写到此处
+ ```
+
+#### 开始训练
+
+- Kinetics400数据集使用8卡训练,训练方式的启动命令如下:
+
+ ```bash
+ # videos数据格式
+ python3.7 -u -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin main.py --validate -c configs/recognition/videoswin/videoswin_k400_videos.yaml
+ ```
+
+- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下:
+
+ ```bash
+ export FLAGS_conv_workspace_size_limit=800 # MB
+ export FLAGS_cudnn_exhaustive_search=1
+ export FLAGS_cudnn_batchnorm_spatial_persistent=1
+ # videos数据格式
+ python3.7 -u -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin main.py --amp --validate -c configs/recognition/videoswin/videoswin_k400_videos.yaml
+ ```
+
+- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,建议配置文件的命名方式为`模型_数据集名称_文件格式_数据格式_采样方式.yaml`,参数用法请参考[config](../../tutorials/config.md)。
+
+
+## 模型测试
+
+- Video-Swin-Transformer模型在训练时同步进行验证,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下:
+
+ ```
+ Already save the best model (top1 acc)0.7258
+ ```
+
+- 由于Video-Swin-Transformer模型测试模式的采样方式是速度稍慢但精度高一些的**UniformCrop**,与训练过程中验证模式采用的**CenterCrop**不同,所以训练日志中记录的验证指标`topk Acc`不代表最终的测试分数,因此在训练完成之后可以用测试模式对最好的模型进行测试获取最终的指标,命令如下:
+
+ ```bash
+ python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin main.py --test -c configs/recognition/videoswin/videoswin_k400_videos.yaml -w "output/VideoSwin/VideoSwin_best.pdparams"
+ ```
+
+
+ 当测试配置采用如下参数时,在Kinetics-400的validation数据集上的测试指标如下:
+
+ | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints |
+ | :----------------: | :-------------: | :-----: | :---------: | :---- | :----------------------------------------------------------: |
+  | Swin Transformer | UniformCrop | 32 | 224 | 82.40 | [VideoSwin_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_k400.pdparams) |
+
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/recognition/videoswin/videoswin_k400_videos.yaml \
+ -p data/VideoSwin_k400.pdparams \
+ -o inference/VideoSwin
+```
+
+上述命令将生成预测所需的模型结构文件`VideoSwin.pdmodel`和模型权重文件`VideoSwin.pdiparams`。
+
+- 各参数含义可参考[模型推理方法](../../start.md#2-模型推理)
+
+### 使用预测引擎推理
+
+```bash
+python3.7 tools/predict.py --input_file data/example.avi \
+ --config configs/recognition/videoswin/videoswin_k400_videos.yaml \
+ --model_file inference/VideoSwin/VideoSwin.pdmodel \
+ --params_file inference/VideoSwin/VideoSwin.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+输出示例如下:
+
+```
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9999829530715942
+```
+
+可以看到,使用在Kinetics-400上训练好的Video-Swin-Transformer模型对`data/example.avi`进行预测,输出的top1类别id为`5`,置信度为0.99。通过查阅类别id与名称对应表`data/k400/Kinetics-400_label_list.txt`,可知预测类别名称为`archery`。
+
+## 参考论文
+
+- [Video Swin Transformer](https://arxiv.org/pdf/2106.13230.pdf), Ze Liu, Jia Ning, Yue Cao, Yixuan Wei
diff --git a/docs/zh-CN/model_zoo/segmentation/asrf.md b/docs/zh-CN/model_zoo/segmentation/asrf.md
new file mode 100644
index 0000000000000000000000000000000000000000..8394916d81352b7ab2c714be4dc8f4ce4d65a5f4
--- /dev/null
+++ b/docs/zh-CN/model_zoo/segmentation/asrf.md
@@ -0,0 +1,142 @@
+[English](../../../en/model_zoo/segmentation/asrf.md) | 简体中文
+
+# ASRF 视频动作分割模型
+
+---
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+## 模型简介
+
+ASRF模型是对视频动作分割模型MS-TCN的改进,发表在2021年的WACV上。我们在PaddleVideo中复现了官方实现的pytorch代码,获得了与其相近的结果。
+
+
+
+ASRF Overview
+
+
+## 数据准备
+
+ASRF的训练数据可以选择50salads、breakfast、gtea三个数据集,数据下载及准备请参考[视频动作分割数据集](../../dataset/SegmentationDataset.md)
+
+不同于MS-TCN,ASRF模型需要额外的数据构建,脚本流程如下
+```bash
+python data/50salads/prepare_asrf_data.py --dataset_dir data/
+```
+
+## 模型训练
+
+数据准备完毕后,可以通过如下方式启动训练:
+
+```bash
+# gtea数据集
+export CUDA_VISIBLE_DEVICES=3
+python3.7 main.py --validate -c configs/segmentation/asrf/asrf_gtea.yaml --seed 1538574472
+```
+
+- 从头开始训练,使用上述启动命令行或者脚本程序即可启动训练,不需要用到预训练模型,视频动作分割模型通常为全卷积网络,由于视频的长度不一,故视频动作分割模型的batch_size字段通常设为1,即不需要批量训练,目前也仅支持**单样本**训练
+
+## 模型测试
+
+可通过如下方式进行模型测试:
+
+```bash
+python main.py --test -c configs/segmentation/asrf/asrf_gtea.yaml --weights=./output/ASRF/ASRF_split_1.pdparams
+```
+
+- 指标的具体实现参考MS-TCN作者提供的测试脚本[eval.py](https://github.com/yabufarha/ms-tcn/blob/master/eval.py),计算Acc、Edit和F1分数。
+
+- pytorch的复现来源于官方提供的[代码库](https://github.com/yiskw713/asrf)
+
+- 数据集的评估采用MS-TCN论文中的交叉验证方法,各折的划分方式与MS-TCN论文中相同。
+
+在Breakfast数据集下评估精度如下(采用4折交验证):
+
+| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| paper | 67.6% | 72.4% | 74.3% | 68.9% | 56.1% |
+| pytorch | 65.8% | 71.0% | 72.3% | 66.5% | 54.9% |
+| paddle | 66.1% | 71.9% | 73.3% | 67.9% | 55.7% |
+
+在50salads数据集下评估精度如下(采用5折交验证):
+
+| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| paper | 84.5% | 79.3% | 82.9% | 83.5% | 77.3% |
+| pytorch | 81.4% | 75.6% | 82.7% | 81.2% | 77.2% |
+| paddle | 81.6% | 75.8% | 83.0% | 81.5% | 74.8% |
+
+在gtea数据集下评估精度如下(采用4折交验证):
+
+| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| paper | 77.3% | 83.7% | 89.4% | 87.8% | 79.8% |
+| pytorch | 76.3% | 79.6% | 87.3% | 85.8% | 74.9% |
+| paddle | 77.1% | 83.3% | 88.9% | 87.5% | 79.1% |
+
+下面给出gtea数据集下各折交叉验证对应的模型权重:
+
+| Test_Data | F1@0.5 | checkpoints |
+| :----: | :----: | :---- |
+| gtea_split1 | 72.4409 | [ASRF_gtea_split_1.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_1.pdparams) |
+| gtea_split2 | 76.6666 | [ASRF_gtea_split_2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_2.pdparams) |
+| gtea_split3 | 84.5528 | [ASRF_gtea_split_3.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_3.pdparams) |
+| gtea_split4 | 82.6771 | [ASRF_gtea_split_4.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_4.pdparams) |
+
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/segmentation/asrf/asrf_gtea.yaml \
+ -p data/ASRF_gtea_split_1.pdparams \
+ -o inference/ASRF
+```
+
+上述命令将生成预测所需的模型结构文件`ASRF.pdmodel`和模型权重文件`ASRF.pdiparams`。
+
+- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### 使用预测引擎推理
+
+输入预测模型的txt文件为需要预测的文件列表,如:
+```
+S1_Cheese_C1.npy
+S1_CofHoney_C1.npy
+S1_Coffee_C1.npy
+S1_Hotdog_C1.npy
+...
+```
+
+```bash
+python3.7 tools/predict.py --input_file data/gtea/splits/test.split1.bundle \
+ --config configs/segmentation/asrf/asrf_gtea.yaml \
+ --model_file inference/ASRF/ASRF.pdmodel \
+ --params_file inference/ASRF/ASRF.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+输出示例如下:
+
+```bash
+result write in : ./inference/infer_results/S1_Cheese_C1.txt
+result write in : ./inference/infer_results/S1_CofHoney_C1.txt
+result write in : ./inference/infer_results/S1_Coffee_C1.txt
+result write in : ./inference/infer_results/S1_Hotdog_C1.txt
+result write in : ./inference/infer_results/S1_Pealate_C1.txt
+result write in : ./inference/infer_results/S1_Peanut_C1.txt
+result write in : ./inference/infer_results/S1_Tea_C1.txt
+```
+
+
+## 参考论文
+
+- [Alleviating Over-segmentation Errors by Detecting Action Boundaries](https://arxiv.org/pdf/2007.06866v1.pdf), Yuchi Ishikawa, Seito Kasai, Yoshimitsu Aoki, Hirokatsu Kataoka
diff --git a/docs/zh-CN/model_zoo/segmentation/cfbi.md b/docs/zh-CN/model_zoo/segmentation/cfbi.md
new file mode 100644
index 0000000000000000000000000000000000000000..fac110bce3cbaee3a47c15df712e431814f89c33
--- /dev/null
+++ b/docs/zh-CN/model_zoo/segmentation/cfbi.md
@@ -0,0 +1,49 @@
+[English](../../../en/model_zoo/segmentation/cfbi.md) | 简体中文
+
+# CFBI视频分割模型
+
+---
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型测试](#模型测试)
+- [参考论文](#参考论文)
+
+
+## 模型简介
+
+CFBI是百度在ECCV 2020提出的视频目标分割模型,该模型基于前背景整合的协作式方法,将前景目标对象与背景对象的嵌入特征进行对比,从而提升视频分割的效果。给定参考帧(第一帧)和前一帧的图像和目标分割,模型会预测出当前帧的分割。
+
+
+
+
+
+
+## 数据准备
+
+DAVIS数据下载及准备请参考[DAVIS数据准备](../../../../applications/Ma-Net/dataloaders/DAVIS2017_cn.md)
+
+
+## 模型测试
+
+- 测试启动脚本如下:
+
+```bash
+python3.7 main.py --test -c configs/segmentation/cfbip_davis.yaml -w CFBIp_davis.pdparams
+```
+
+- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。
+
+- 运行上述命令,会将结果保存在配置文件中指定的`result_root`下,获取数值评估指标,请使用[davis2017-evaluation工具](https://github.com/davisvideochallenge/davis2017-evaluation)。
+
+DAVIS数据集测试精度:
+
+| J&F-Mean | J-Mean | J-Recall | J-Decay | F-Mean | F-Recall | F-Decay | checkpoints |
+| :------: | :-----: | :----: | :----: | :----: | :----: | :----: | :----: |
+| 0.823 | 0.793 | 0.885 | 0.083 | 0.852 | 0.932 | 0.100 | [CFBIp_r101_davis.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/CFBIp_r101_davis.pdparams) |
+
+
+## 参考论文
+
+- [Collaborative Video Object Segmentation by Foreground-Background Integration](https://arxiv.org/abs/2003.08333), Zongxin Yang, Yunchao Wei, Yi Yang
diff --git a/docs/zh-CN/model_zoo/segmentation/mstcn.md b/docs/zh-CN/model_zoo/segmentation/mstcn.md
new file mode 100644
index 0000000000000000000000000000000000000000..fac5b7b8764be40801e2c2ba3be0fd38f429a888
--- /dev/null
+++ b/docs/zh-CN/model_zoo/segmentation/mstcn.md
@@ -0,0 +1,131 @@
+[English](../../../en/model_zoo/segmentation/mstcn.md) | 简体中文
+
+# MS-TCN 视频动作分割模型
+
+---
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [模型训练](#模型训练)
+- [模型测试](#模型测试)
+- [模型推理](#模型推理)
+- [参考论文](#参考论文)
+
+## 模型简介
+
+MS-TCN模型是视频动作分割领域的经典模型,发表在2019年的CVPR上。我们对官方实现的pytorch代码进行了一些优化,在PaddleVideo中获得了更高精度的结果。
+
+
+
+MS-TCN Overview
+
+
+## 数据准备
+
+MS-TCN的训练数据可以选择50salads、breakfast、gtea三个数据集,数据下载及准备请参考[视频动作分割数据集](../../dataset/SegmentationDataset.md)
+
+## 模型训练
+
+数据准备完毕后,可以通过如下方式启动训练:
+
+```bash
+# gtea数据集
+export CUDA_VISIBLE_DEVICES=3
+python3.7 main.py --validate -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml --seed 1538574472
+```
+
+- 从头开始训练,使用上述启动命令行或者脚本程序即可启动训练,不需要用到预训练模型,视频动作分割模型通常为全卷积网络,由于视频的长度不一,故视频动作分割模型的batch_size字段通常设为1,即不需要批量训练,目前也仅支持**单样本**训练
+
+## 模型测试
+
+可通过如下方式进行模型测试:
+
+```bash
+python main.py --test -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml --weights=./output/MSTCN/MSTCN_split_1.pdparams
+```
+
+- 指标的具体实现参考了MS-TCN作者提供的测试脚本[eval.py](https://github.com/yabufarha/ms-tcn/blob/master/eval.py),计算Acc、Edit和F1分数。
+
+- 数据集的评估方法采用MS-TCN论文中的交叉验证方法,折的划分方式与MS-TCN论文中相同。
+
+在Breakfast数据集下评估精度如下(采用4折交叉验证):
+
+| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| paper | 66.3% | 61.7% | 48.1% | 48.1% | 37.9% |
+| paddle | 65.2% | 61.5% | 53.7% | 49.2% | 38.8% |
+
+在50salads数据集下评估精度如下(采用5折交叉验证):
+
+| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| paper | 80.7% | 67.9% | 76.3% | 74.0% | 64.5% |
+| paddle | 81.1% | 71.5% | 77.9% | 75.5% | 66.5% |
+
+在gtea数据集下评估精度如下(采用4折交叉验证):
+
+| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| paper | 79.2% | 81.4% | 87.5% | 85.4% | 74.6% |
+| paddle | 76.9% | 81.8% | 86.4% | 84.7% | 74.8% |
+
+下面给出在gtea数据集下各折交叉验证的模型权重:
+
+| Test_Data | F1@0.5 | checkpoints |
+| :----: | :----: | :---- |
+| gtea_split1 | 70.2509 | [MSTCN_gtea_split_1.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_1.pdparams) |
+| gtea_split2 | 70.7224 | [MSTCN_gtea_split_2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_2.pdparams) |
+| gtea_split3 | 80.0 | [MSTCN_gtea_split_3.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_3.pdparams) |
+| gtea_split4 | 78.1609 | [MSTCN_gtea_split_4.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_4.pdparams) |
+
+
+## 模型推理
+
+### 导出inference模型
+
+```bash
+python3.7 tools/export_model.py -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml \
+ -p data/MSTCN_gtea_split_1.pdparams \
+ -o inference/MSTCN
+```
+
+上述命令将生成预测所需的模型结构文件`MSTCN.pdmodel`和模型权重文件`MSTCN.pdiparams`。
+
+- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)
+
+### 使用预测引擎推理
+
+输入预测模型的txt文件为需要预测的文件列表,如:
+```
+S1_Cheese_C1.npy
+S1_CofHoney_C1.npy
+S1_Coffee_C1.npy
+S1_Hotdog_C1.npy
+...
+```
+
+```bash
+python3.7 tools/predict.py --input_file data/gtea/splits/test.split1.bundle \
+ --config configs/segmentation/ms_tcn/ms_tcn_gtea.yaml \
+ --model_file inference/MSTCN/MSTCN.pdmodel \
+ --params_file inference/MSTCN/MSTCN.pdiparams \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+输出示例如下:
+
+```bash
+result write in : ./inference/infer_results/S1_Cheese_C1.txt
+result write in : ./inference/infer_results/S1_CofHoney_C1.txt
+result write in : ./inference/infer_results/S1_Coffee_C1.txt
+result write in : ./inference/infer_results/S1_Hotdog_C1.txt
+result write in : ./inference/infer_results/S1_Pealate_C1.txt
+result write in : ./inference/infer_results/S1_Peanut_C1.txt
+result write in : ./inference/infer_results/S1_Tea_C1.txt
+```
+
+## 参考论文
+
+- [MS-TCN: Multi-Stage Temporal Convolutional Network for Action Segmentation](https://arxiv.org/pdf/1903.01945.pdf), Y. Abu Farha and J. Gall.
diff --git a/docs/zh-CN/tools.md b/docs/zh-CN/tools.md
new file mode 100644
index 0000000000000000000000000000000000000000..50d5c7e47cb6261d70bac312211eb88675a2210c
--- /dev/null
+++ b/docs/zh-CN/tools.md
@@ -0,0 +1,19 @@
+简体中文 | [English](../en/tools.md)
+
+# 小工具
+
+这篇文档主要介绍PaddleVideo的一些小工具
+
+## 统计 Params
+
+```shell
+python3.7 tools/summary.py -c configs/recognition/tsm/tsm.yaml
+```
+
+## 统计FLOPS
+
+```shell
+python3.7 tools/summary.py -c configs/recognition/tsm/tsm.yaml --FLOPs
+```
+
+## 测试导出模型 coming soon
diff --git a/docs/zh-CN/tutorials/I3D.md b/docs/zh-CN/tutorials/I3D.md
new file mode 100644
index 0000000000000000000000000000000000000000..bd2c32a540cefcee43637a2faf87217e14c75ba4
--- /dev/null
+++ b/docs/zh-CN/tutorials/I3D.md
@@ -0,0 +1,90 @@
+# I3D
+
+## 简介
+本文提出了一种基于 2D-ConvNet 扩展的双流膨胀模型 I3D,作者将图像分类的 2D 卷积网络的滤波器和池化核扩展到 3D 中,使得从视频中学习无缝时空特征提取器成为可能。
+
+## 重点贡献
+* 提出了 Kinetics 数据集
+* 提出了双流 3D 卷积模型
+
+## kinetics数据集
+Kinetics 数据集有 400 个人体动作类别,每个类别有 400 多个视频片段,这些数据来自真实有挑战的 YouTube 视频。数据集包括的动作大类别有:
+1. 单人动作:绘画、饮酒、笑
+2. 人与人之间的动作:拥抱、亲吻、握手
+3. 人与物之间的动作:打开礼物、洗碗、除草
+4. 需要细分的动作,比如不同类型的游泳
+5. 侧重于物体的信息,比如不同类型的乐器
+
+## 动机
+图像领域有一个超大规模的 ImageNet 数据集,很多图像任务采用的都是 ImageNet 预训练模型,并且取得了不错的效果。在视频领域中,如果我们也有一个超大规模的数据集,把在该数据集上预训练好的动作分类模型应用到其他时序任务或不同的数据集上,是否会有类似的性能提升呢?为了验证这个猜想,作者将在 Kinetics 上预训练的模型应用到 HMDB-51 和 UCF-101 这类较小的数据集上。实验结果表明,性能总是会得到提升,提升的程度与模型的结构有关。
+
+基于此发现,作者提出了 I3D,基于 InceptionV1 的 I3D 模型在经过 Kinetics 预训练后,其性能远远超过了当时最先进的水平。
+
+## 主要工作
+1. 在 Kinetics 数据集上做模型预训练,将预训练模型应用到 HMDB-51 和 UCF101 数据集上,验证大规模视频数据的有效性;
+2. 基于 2D-ConvNet,提出了新的行为识别模型 I3D;
+
+## 行为识别方法分析
+当前,行为识别模型主要的不同点:
+1. 卷积和层运算使用的是 2D 核还是 3D 核;
+2. 网络的输入仅仅包含的是 RGB 视频还是也包括预计算的光流;
+3. 在 2D-ConvNet 情况下,帧之间的信息如何传播;
+
+## 模型分析
+作者比较和研究了一些模型,这些模型有的基于 2D-ConvNet,有的基于 3D-ConvNet。之前基于 3D-ConvNet 的模型由于可用的训练数据少,网络结构相对较浅。于是本文将非常深的 2D-ConvNet 图像分类网络膨胀为 3D-ConvNet 的时空特征提取网络,同时将其作为 two-stream 框架的主干网络。由于之前的 2D-ConvNet 网络本身比较深,又可以使用 2D-ConvNet 的参数初始化相应 3D-ConvNet 的网络,因此可以解决之前训练数据不足的问题。
+
+这里作者分析五种网络结构,如下图所示。
+
+
+
+(图:网络结构)
+
+
+### The Old I: ConvNet+LSTM
+将图像分类模型应用到视频分析上的一个直接想法是,把视频中的每帧看作一张独立的图像,提取每张图像后,对整个视频求均值。但这样做完全忽略了视频中的时序信息,一个比较合理的方法是在网络的末端添加一个 LSTM 层,用于学习视频帧之间的时序关系。因此 ConvNet+LSTM 的文章在 InceptionV1 的最后一个 average-pooling 后面加了一个包含 512 个隐含节点的 LSTM,最后接了一个用于分类的全连接层。
+
+### The Old II: 3D ConvNets
+3D-ConvNets 是建模视频任务一个很自然的想法,与标准的卷积网络相比,增加了一个时空维度。由于时空维度的增加,使得 3D-ConvNets 比 2D-ConvNets 有更多的参数,增加了网络训练的困难。此外,网络结构是 3D 的,无法直接复用 2D 模型的参数。
+
+### The Old III: Two-Stream Networks
+ConvNet+LSTM 的模型结构仅仅捕获高层信息的变化,对于帧和帧之间底层动作信息的捕获是不够的,而底层动作信息在行为识别中非常重要。于是一些研究人员提出了 Two-Stream 网络。Two-Stream 分为两路,一路用于提取 RGB 信息,一路用于提取光流信息,这样的网络设计对空间维度和时间维度的特征提取都比较好。这种方法比较容易训练和测试,并且在公开数据集上取得了比较不错的效果。
+
+> Two-Stream 中的两个模型是分开训练的。
+
+### The New: Two-Stream Inflated 3D ConvNets
+#### Inflating 2D ConvNets into 3D
+把在 ImageNet 上表现好的 2D 模型直接扩展为 3D 模型,具体做法是将 2D 结构中的二维卷积核与池化核沿时间维扩展一维,即由之前的 N×N 变成 N×N×N。
+
+#### Bootstrapping 3D filters from 2D Filters
+作者将一帧图像沿着时间轴复制 N 次,将其变为一个 boring video。为了保证在这个 boring video 上做卷积得到的池化激活与原始图像经过 2D 卷积后的池化激活相同,具体做法是将 2D 卷积核在时间维度上重复 N 次得到 3D 卷积核,再将权重除以 N,以确保滤波器的响应相同。
+
+#### Pacing receptive field growth in space,time and network depth
+将 2D-ConvNet 扩展到 3D-ConvNet 后,还需要考虑如何设置时间维度上的卷积核大小与步长。目前几乎所有与图像相关的模型都平等地看待空间维度中的水平和垂直两个方向,两个方向上的 kernel 是相等的。当加入时间维度后,再使用完全对称的感受野并不是最好的选择,应该综合考虑帧率和图像尺寸:
+* 如果时间维度比空间维度增长过快,可能会影响物体边缘信息,从而破坏物体的特征检测;
+* 如果时间维度比空间维度增长过慢,可能无法很好的捕捉场景的动态信息,从而影响对动作的检测;
+
+因此,作者对 InceptionV1 进行扩展时,大多数位置保持了对称的设计,如第一个卷积核由 7×7 变成了 7×7×7,stride 也从原来的 (2,2) 变成了 (2,2,2);只对少数位置做了改变,如前面两个 max-pool 使用的不是 3×3×3 而是 1×3×3(时间维上 stride 为 1),这样能够比较好地保留时间维度的信息;最后的 avg-pool 也不是 7×7 而是 2×7×7。
+
+
+
+(图:网络扩展)
+
+
+
+#### Two 3D Streams
+虽然 3D-ConvNet 已经能够比较好地提取视频中的动作特征,但带有光流的 two-stream 结构对动作识别依然有巨大的帮助。因此作者将 3D-ConvNet 设计成 two-stream 形式,训练时两个网络分开训练,预测时对两个网络的预测结果取均值。
+
+## 实验结果
+在 UCF-101,HMDB-51 或 Kinetics 上进行训练和测试时的分类准确度。
+
+
+
+(图:实验结果1)
+
+
+使用与不使用 ImageNet 预训练的模型在 Kinetics 上的表现。
+
+
+
+(图:实验结果2)
+
+
+
+## 参考
+[Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset](https://arxiv.org/abs/1705.07750)
diff --git a/docs/zh-CN/tutorials/SlowFast.md b/docs/zh-CN/tutorials/SlowFast.md
new file mode 100644
index 0000000000000000000000000000000000000000..b80e8d12d3a5a696f5690be3db77e483c0e9f81d
--- /dev/null
+++ b/docs/zh-CN/tutorials/SlowFast.md
@@ -0,0 +1,172 @@
+# SlowFast
+
+## 背景
+SlowFast 由 Facebook FAIR 的何恺明团队提出,用于视频识别。SlowFast 包含两条路径:
+* Slow pathway
+* Fast pathway
+
+Slow pathway 运行低帧率,用于捕捉空间语义信息;Fast pathway 运行高帧率,获取精确的时间运动信息。通过降低通道数量,Fast pathway 分支可以变成轻量的网络,同时也能够学到视频中有用的时域信息。SlowFast 在没有任何预训练的情况下,在 Kinetics 数据集上的准确率达到了 79.0%。
+
+## 动机
+SlowFast 受到灵长类视觉系统中视网膜神经节细胞的生物学研究的启发。研究发现,这些细胞中约80%的都是P-cell,约15~20% 是 M-cell。M-cell 以较高的时间频率工作,能够对快速的时间变化作出响应,但是对空间细节和颜色不敏感。P-cell 则提供良好的空间细节和颜色信息,但时间分辨率较低,对刺激反应比较慢。
+
+SlowFast 与此相似:
+* SlowFast 有两条路径,分别处理低帧率和高帧率;
+* Fast pathway 用于捕捉快速变化的动作,但涉及到的细节信息较少,与M-cell类似;
+* Fast pathway 是轻量的,与M-cell的占比类似。
+
+## 简介
+在图像识别领域,对称处理图像 I(x,y) 中两个空间维度 x 和 y 是常见的做法,自然图像的统计也证明了其合理性。这是由于自然图像具有第一近似各向同性(所有方向具有相同的可能性)和平移不变性。但对于视频信号 I(x,y,t)来说,并不是所有的时空方向都有相同的可能性。因此不能像时空卷积那样对称地处理空间和时间。此时需要将网络结构分开,分别处理空间结构和时间事件。
+
+视觉内容的类别空间语义变化通常比较缓慢。比如,挥手不会在这个动作进行期间改变“手”的类别;一个人从走路变为跑步,识别结果也一直是“人”。因此类别语义的识别(以及颜色、纹理、光照等)可以以较慢的速度刷新。另一方面,正在执行的动作比其主体识别变化的速度要快得多,如拍手、挥手、摇摆、走路或跳跃。因此需要用较快的帧率刷新(高时间分辨率),来对快速变化的动作进行建模。
+
+## 思路
+基于上述想法作者提出了一种用于视频识别的双路径模型 SlowFast 。
+
+
+
+网络结构
+
+
+
+如上图所示,一条路径用于捕获图像或稀疏帧提供的语义信息,以低帧率运行,刷新速度慢。另一条路径用于捕获快速变化的动作,刷新速度快、时间分辨率高,该路径是轻量级的,仅占整体计算量的20%。这是由于这条路径通道较少,处理空间信息的能力较差,但空间信息可以由第一个路径以简洁的方式来处理。
+
+依据两条路径运行的帧率高低不同,作者将第一条路径称为“Slow pathway”;第二条路径称为“Fast pathway”;两条路径通过横向连接进行融合。
+
+## SlowFast
+SlowFast 的两个分支以不同的速率运行,作者通过使用两个分支模拟生物学上的大小细胞。
+
+### Slow Pathway
+Slow pathway 可以是任意在视频片段上做时空卷积的模型,如时空残差网络、C3D、I3D、Non-local网络等。Slow pathway 的关键之处在于对视频帧进行采样时,时间步长 τ 较大,也就是说,每 τ 帧中只处理一帧。作者建议 τ 的取值为 16,对于 30fps 的视频,差不多每秒采样 2 帧。如果 Slow pathway 采样的帧数是 T,那么原始视频片段的长度为 τ×T 帧。
+
+### Fast Pathway
+#### 高帧率
+Fast pathway 的目的是在时间维度上获得良好的特征表示,其时间步长 τ/α 较小,其中 α>1 是 Fast pathway 和 Slow pathway 之间的帧率比,作者建议 α 的取值为 8。由于两条路径在同一个视频片段上进行操作,因此 Fast pathway 采样到的帧数量为 αT,比 Slow pathway 密集 α 倍。
+
+#### 高时间分辨率特征
+Fast pathway 具有较高的输入时间分辨率,并且在整个网络中始终保持高分辨率特征。在最后的分类全局池化层之前作者没有使用任何时间下采样层,因此特征张量在时间维度上一直保持 αT 帧。
+
+#### 低通道容量
+Fast pathway 是一个与 Slow pathway 相似的卷积网络,但通道数只有 Slow pathway 的 β 倍,其中 β<1,作者建议 β 的取值为 1/8。这使得 Fast pathway 比 Slow pathway 的计算更高效。
+
+低通道容量可以理解为表示空间语义信息的能力较弱。由于 Fast pathway 的通道数更少,因此 Fast pathway 的空间建模能力应该弱于 Slow pathway。但 SlowFast 的实验结果表明这反而是有利的,它弱化了空间建模能力,却增强了时间建模能力。
+
+### 横向连接
+作者通过横向连接对两条路径的信息进行融合,使得 Slow pathway 知道 Fast pathway 在学习什么。作者在两条路径中的每个“阶段”上使用一个横向连接,由于两条路径的时间维度不同,因此在进行横向连接时需要通过变换对两条路径的维度进行匹配。最后,将两条路径的输出进行全局平均池化,并将池化后的特征拼接在一起作为全连接分类器层的输入。
+
+### 实例化
+SlowFast 模型的思想是通用的,可以用不同的主干网络来实现。如下图所示是一个 SlowFast 实例化的例子,其中黄色是通道数量,绿色是时序帧分辨率。
+
+
+
+实例化
+
+
+作者用 T×S² 表示输入的时空尺度,其中 T 是时间长度,S 是正方形裁剪区域的宽和高。
+
+#### Slow Pathway
+Slow pathway 是一个带有时间步长的 3D ResNet,网络时间维度的输入帧数 T = 4,从 64 帧的视频片段中以时间步长 τ = 16 稀疏采样得到。在该实例化中作者没有采用时间下采样,因为当输入的时间步长较大时,这样做是有害的。
+
+Slow pathway 与 C3D/I3D 模型不同,从 conv_1 到 res_3 的滤波器本质上都是2D卷积核,只有 res_4 和 res_5 使用的是非退化时间卷积。之所以采用这种设计是由于作者通过实验发现,在早期层使用时间卷积会降低准确率。作者认为是由于当物体快速移动且时间步长较大时,在一个时间感受野内的相关性就很小,除非空间感受野也足够地大。
+
+#### Fast Pathway
+Fast pathway 的时间分辨率较高,通道容量较低。Fast pathway 的每个模块中都使用了非退化时间的卷积,并且没有使用时间下采样层。之所以这样设计是因为作者发现 Fast pathway 的时间卷积有很好的时间分辨率,可以捕捉细节动作。
+
+#### 横向连接
+横向连接是从 Fast pathway 到 Slow pathway,在融合之前需要保证两者的维度是匹配的。Slow pathway 的特征维度是 {T, S², C},Fast pathway 的特征维度是 {αT, S², βC},在连接方案上作者进行了如下实验:
+* Time-to-channel:对 {αT, S², βC} 进行变形和转置,得到 {T, S², αβC},也就是说把每 α 帧放入一帧的多个通道内。
+* Time-strided sampling:每 α 帧采样一帧,于是 {αT, S², βC} 就变成了 {T, S², βC}。
+* Time-strided convolution:使用 3D 卷积,卷积核大小为 5×1²,输出通道数为 2βC,时间步长为 α。
+
+## PaddleVideo
+PaddleVideo 关于采样的核心代码
+```python
+import numpy as np
+
+
+class PackOutput(object):
+ """
+ In slowfast model, we want to get slow pathway from fast pathway based on
+ alpha factor.
+ Args:
+ alpha(int): temporal length of fast/slow
+ """
+ def __init__(self, alpha):
+ self.alpha = alpha
+
+ def __call__(self, results):
+ fast_pathway = results['imgs']
+
+ # sample num points between start and end
+ slow_idx_start = 0
+ slow_idx_end = fast_pathway.shape[0] - 1
+ slow_idx_num = fast_pathway.shape[0] // self.alpha # slow 的采样数量
+ # 在区间[slow_idx_start, slow_idx_end] 内均匀采样
+ slow_idxs_select = np.linspace(slow_idx_start, slow_idx_end,
+ slow_idx_num).astype("int64")
+ slow_pathway = fast_pathway[slow_idxs_select] # 取出采样到的图片
+
+ # T H W C -> C T H W.
+ slow_pathway = slow_pathway.transpose(3, 0, 1, 2) # 对维度做转换
+ fast_pathway = fast_pathway.transpose(3, 0, 1, 2)
+
+ # slow + fast
+ frames_list = [slow_pathway, fast_pathway]
+ results['imgs'] = frames_list
+ return results
+```
+
+PaddleVideo 中关于特征融合的核心代码
+```python
+import paddle
+import paddle.nn.functional as F
+
+
+# 注:get_conv_init、get_bn_param_attr 为 PaddleVideo 源码中的辅助函数
+class FuseFastToSlow(paddle.nn.Layer):
+ """
+ Fuses the information from the Fast pathway to the Slow pathway. Given the
+ tensors from Slow pathway and Fast pathway, fuse information from Fast to
+ Slow, then return the fused tensors from Slow and Fast pathway in order.
+ """
+ def __init__(self,
+ dim_in,
+ fusion_conv_channel_ratio,
+ fusion_kernel,
+ alpha,
+ eps=1e-5,
+ norm_module=paddle.nn.BatchNorm3D):
+ """
+ Args:
+ dim_in (int): the channel dimension of the input.
+ fusion_conv_channel_ratio (int): channel ratio for the convolution
+ used to fuse from Fast pathway to Slow pathway.
+ fusion_kernel (int): kernel size of the convolution used to fuse
+ from Fast pathway to Slow pathway.
+ alpha (int): the frame rate ratio between the Fast and Slow pathway.
+ eps (float): epsilon for batch norm.
+ """
+ super(FuseFastToSlow, self).__init__()
+ fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1)
+ initializer_tmp = get_conv_init(fan)
+
+ self._conv_f2s = paddle.nn.Conv3D(
+ in_channels=dim_in,
+ out_channels=dim_in * fusion_conv_channel_ratio,
+ kernel_size=[fusion_kernel, 1, 1],
+ stride=[alpha, 1, 1],
+ padding=[fusion_kernel // 2, 0, 0],
+ weight_attr=paddle.ParamAttr(initializer=initializer_tmp),
+ bias_attr=False)
+ self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio,
+ epsilon=eps,
+ weight_attr=get_bn_param_attr(),
+ bias_attr=get_bn_param_attr(bn_weight=0.0))
+
+ def forward(self, x):
+ x_s = x[0]
+ x_f = x[1]
+ fuse = self._conv_f2s(x_f)
+ fuse = self._bn(fuse)
+ fuse = F.relu(fuse)
+ x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None)
+
+ return [x_s_fuse, x_f]
+```
+
+
+
+## 参考
+[SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982)
diff --git a/docs/zh-CN/tutorials/TSM.md b/docs/zh-CN/tutorials/TSM.md
new file mode 100644
index 0000000000000000000000000000000000000000..cae140867805a231711e28a2d1989a35d4577109
--- /dev/null
+++ b/docs/zh-CN/tutorials/TSM.md
@@ -0,0 +1,75 @@
+# TSM模型原理及PaddleVideo实践
+# 1. 背景与动机
+目前互联网视频数据日益增多,用户观看短视频、小视频的时长也迅速增长,如何对海量的视频资源快速准确地分析、处理、归类是一个亟待解决的问题。视频理解技术可以多维度解析视频内容,理解视频语义,自动分类打标签,极大节省人工审核效率,节约成本;同时实现精准用户推荐,提升体验效果。
+本文将给大家介绍视频理解领域的经典模型**TSM (Temporal Shift Module)**。TSM是由**MIT**和**IBM Watson AI Lab**的`Ji Lin、Chuang Gan和Song Han`等人提出的模块,通过时间维度上的特征位移来模拟3D建模,在效果和性能之间取得平衡,从而提高视频理解能力。
+
+
+
+跟TSM最相关的视频理解模型当属Limin Wang等人在ECCV2016上发表的Temporal Segment Network (TSN)了。TSN模型从视频中采样N帧图像并通过最简单直接地对N帧图像分类结果进行平均的方式进行时序信息融合,取得了当时State-of-the-art的性能,并得到大规模的应用。考虑到TSN模型对时序信息的建模不够充分,以I3D,S3D, P3D等为代表的一系列工作通过3D卷积进行端到端联合时空建模,这一系列工作尽管能捕获时空特征,但是相比TSN,由2D卷积到3D卷积不可避免地引入了额外计算量。TSM巧妙的通过时间维度特征map移位的想法,理论上用零额外计算开销达到了不同帧之间特征融合联合建模的目的。
+
+论文传送门: [Temporal Shift Module for Efficient VideoUnderstanding](https://arxiv.org/pdf/1811.08383v2.pdf)
+
+先看一下下图的例子:如果图片分别从左往右播放和从右往左播放,测试者会给出不同但是正确的理解结果,说明对视频的理解强依赖于视频的时序关系,你猜对了!这就是TSM提出的动机,即捕捉视频的时间信息。
+
+
+
+看起来好像很有意思,我们下面继续深入解析一下TSM的核心模块。
+
+# 2. TSM关键技术介绍
+
+在传统的图片分析的基础上,视频分析需要研究者补充关于时间信息(temporal information)的建模结构。目前,2D CNN和3D CNN是视频理解中最常用的两个方法:使用2D CNN 模型运算量少但会丧失部分时间信息;而使用3D CNN虽然效果好但运算量极大。面对这样的情况,MIT和IBM Watson AI Lab的Ji Lin,Chuang Gan和Song Han等人提出了Temporal Shift Module (TSM)模块。他们将时间位移模块嵌入2D CNN,从而可以在不添加任何额外的计算量和参数的情况下,轻松地达到与3D CNN效果相当的视频理解能力。
+
+
+
+上图中矩阵的行和列分别表示特征图中的temporal和channel维度。在TSM模块中,将一部分的channel在temporal维度上向前位移一步,一部分的channel在temporal维度上向后位移一步,位移后的空缺补零。通过这种方式在特征图中引入temporal维度上的上下文交互,通过通道移动操作可以使得在当前帧中包含了前后两帧的通道信息,这样再进行2D卷积操作就能像3D卷积一样直接提取视频的时空信息,
+提高了模型在时间维度上的建模能力。在此基础上,研发人员将模块进一步细分为适合在线视频使用的TSM模块和适合离线视频使用的TSM模块。
+
+
+
+
+双向(bi-direction)的TSM模块可获取过去和未来的时空信息,适合高吞吐量的离线视频使用;而单向(uni-direction)的TSM模块仅可比对现在和过去的时空信息,适用于低延迟在线视频的识别。
+此外,论文中作者还考虑了TSM模块插入的位置,对比了两种TSM插入方式:**Residual tsm** 和 **In-place tsm**,作者发现使用**Residual tsm**方式会比 **In-place tsm** 的方式效果更好,文中作者解释为**In-place tsm** 会影响模型对空间信息的提取。
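+
+为了更直观地理解两种插入方式的差别,下面给出一段示意代码(非论文或官方实现,`conv`泛指残差分支中的卷积,输入形状假设为 [N*T, C, H, W]):
+
+```python
+import paddle
+import paddle.nn.functional as F
+
+def in_place_tsm(x, conv, num_seg):
+    # In-place方式:直接在主干特征上移位,原始空间特征被移位结果覆盖
+    x = F.temporal_shift(x, seg_num=num_seg, shift_ratio=1.0 / num_seg)
+    return conv(x)
+
+def residual_tsm(x, conv, num_seg):
+    # Residual方式:只在残差分支上移位,主干的原始特征通过恒等连接保留
+    shifted = F.temporal_shift(x, seg_num=num_seg, shift_ratio=1.0 / num_seg)
+    return x + conv(shifted)
+
+num_seg = 8
+x = paddle.randn([2 * num_seg, 16, 56, 56])                # [N*T, C, H, W],N=2
+conv = paddle.nn.Conv2D(16, 16, kernel_size=3, padding=1)
+print(residual_tsm(x, conv, num_seg).shape)                # [16, 16, 56, 56]
+```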
+
+
+
+
+
+好了,TSM模块基本原理搞清楚了是不是**So Easy !!!**,接下来问题来了,代码该如何实现呢?
+
+# 3. 关键代码解析
+
+原理搞清楚了,下面来看看代码如何实现,首先我们来看看torch版本如何实现的,呃呃呃...,不好意思torch框架并未提供TSM的API,我们只能自己动手啦,具体实现代码如下图所示:
+
+
+
+
+这意味着你只需要在TSN的代码基础上添加4行代码就能将准确率在Something-Something这样的数据集上**翻上一倍!!!** 是不是简单高效的模型 ?不得不向大佬低头!
+
+But...,
+
+
+飞桨框架充分考虑到广大用户的需求已经为各位童鞋实现了TSM的OP
+
+
+
+
+所以各位童鞋再也不用自己实现了,**直接调用就可以啦!!!,直接调用就可以啦!!!,直接调用就可以啦!!!**,重要的事情讲三遍,
+
+是不是以为事情到这里就结束啦 ? 唉! **Too young Too simple !!!**
+
+我们在此基础上还对其进行了性能优化,在降低显存消耗的同时,可以提速5倍以上,详细信息可以参考[加速文档](./accelerate.md)
+
+下面我们来看看使用飞桨如何实现TSM:
+
+`import paddle.nn.functional as F`
+
+
+`shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)`
+
+两行代码就可以实现TSM了,是不是很简单?
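+
+一个可以直接运行的最小示例如下(输入形状等均为假设值,仅用于说明接口用法):
+
+```python
+import paddle
+import paddle.nn.functional as F
+
+num_seg = 8                                  # 视频被均匀划分的段数
+x = paddle.randn([2 * num_seg, 16, 32, 32])  # [N*num_seg, C, H, W],N=2
+out = F.temporal_shift(x, seg_num=num_seg, shift_ratio=1.0 / num_seg)
+print(out.shape)                             # [16, 16, 32, 32],形状不变,仅在时间维上移位
+```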
+
+# Reference
+[1] [Lin Ji , Gan Chuang , Han Song . TSM: Temporal Shift Module for Efficient Video Understanding. arXiv:1811.08383,2018](https://arxiv.org/pdf/1811.08383v2.pdf).
+
+
+[2] [Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, and Luc Van Gool. Temporal segment networks for action recognition in videos. In Proceedings of the European Conference on Computer Vision, pages 20–36. Springer, 2016](https://arxiv.org/abs/1608.00859).
diff --git a/docs/zh-CN/tutorials/TSN.md b/docs/zh-CN/tutorials/TSN.md
new file mode 100644
index 0000000000000000000000000000000000000000..7e6804821d6327366d6f5c642ce754899daafe8a
--- /dev/null
+++ b/docs/zh-CN/tutorials/TSN.md
@@ -0,0 +1,140 @@
+# TSN
+
+## 背景
+TSN 可以看作是对 two-stream 的改进,通过设计有效的卷积网络体系结构 TSN 解决视频动作分类中的两个主要问题:
+* 长距离时序依赖问题(有些动作在视频中持续的时间较长);
+* 解决数据量较少的问题;
+
+## 贡献
+TSN 的贡献可概括为以下两点:
+* TSN 模型基于 long-range temporal structure 建模,结合了 sparse temporal sampling strategy 和 video-level supervision 从而保证对整段视频学习的有效性和高效性;
+* 提出了一系列最佳实践方案;
+
+## 原理
+由于 two-stream 网络处理的是单帧图像(空间网络)或者短片段中的若干帧图像(时序网络),因此 two-stream 网络无法很好地建模时间跨度较长的视频动作。为了能够处理长范围时序结构,可以使用密集帧采样的方式从视频中获取长时间信息,但这样会增加时间成本,同时采样到的连续帧之间存在冗余。于是在 TSN 模型中作者使用稀疏采样的方式来替代密集采样,降低计算量的同时一定程度上也去除了冗余信息。
+
+TSN 采用和 two-stream 相似的结构,网络由空间流卷积网络和时间流卷积网络组成。TSN 使用稀疏采样的方式从整段视频中采出一系列短片段,其中每个片段都会给出一个对自身动作类别的初步预测,之后对这些片段的预测结果进行“融合”得出对整个视频的预测结果。
+
+## 网络结构
+如下图所示,一个视频被分为 K 段(segment);之后对每个段使用稀疏采样的方式采出一个片段(snippet);然后使用“段共识函数”对不同片段的预测结果进行融合生成“段共识”,此时完成了一个视频级的预测;最后对所有模式(RGB、光流等)的预测结果进行融合,生成最终的预测结果。
+
+
+
+
+
+
+> 这里注意 segment 和 snippet 的区别
+
+TSN 采用与 two-stream 类似的结构,使用空间网络操作一帧 RGB 图像,时序卷积网络操作连续的光流图像。但由于更深的网络结构能够提升对物体的识别能力,因此 TSN 中作者采用 BN-Inception 构建网络。
+
+## 损失函数
+
+给定一段视频 V,将其按相等间隔分为 K 段 {S1, S2, ..., SK}。TSN 对一系列片段的建模如下:
+
+TSN(T1, T2, ..., TK) = H(G(F(T1; W), F(T2; W), ..., F(TK; W)))
+
+其中,(T1, T2, ..., TK) 表示片段序列,片段 Tk 从对应的段 Sk 中随机采样得到;F(Tk; W) 表示作用于短片段 Tk 的卷积网络,W 为网络的参数,返回值为 Tk 相对于所有类别的得分;段共识函数 G 用于融合所有片段的预测结果;预测函数 H(论文中采用 Softmax)用于预测整段视频属于每个动作类别的概率,它的输入为段共识函数 G 的结果。
+
+最后,采用标准分类交叉熵计算段共识的损失:
+
+L(y, G) = - Σ_i y_i (g_i - log Σ_j exp(g_j))
+
+其中,C 是类别总数;y_i 是类别 i 的真实标签;论文中段的数量 K 设置为 3;共识函数 G 采用取均值的方式,从所有片段对同一类别的得分中推断出该类别的得分 g_i。
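+
+下面用一小段示意代码(非官方实现)演示“均值段共识 + 交叉熵”的计算过程,其中片段得分用随机数代替网络输出:
+
+```python
+import numpy as np
+
+K, num_classes = 3, 5
+segment_scores = np.random.randn(K, num_classes)  # F(T_k; W):K 个片段的类别得分
+
+G = segment_scores.mean(axis=0)                   # 段共识:对各片段得分取均值
+prob = np.exp(G) / np.exp(G).sum()                # H:Softmax 得到视频级类别概率
+
+y = 2                                             # 假设真实类别为 2
+loss = -(G[y] - np.log(np.exp(G).sum()))          # 标准分类交叉熵损失
+print(prob, loss)
+```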
+
+## 模型输入
+对于图像任务而言,只能够使用图像本身提取特征。但对视频来说,除了每一帧图像外,还有视频中的光流信息。为了探索更多输入形式对模型效果影响,TSN 模型在空间卷积网络中除了使用单一 RGB 图像外,还使用了 RGB difference;在时序卷积网络中除了将连续的光流场作为输入外还采用了扭曲的光流场。
+
+
+
+
+
+单一 RGB 图像只能表示静态信息,缺少上下文信息。但连续两帧之间的差异能够表示动作的改变,因此作者尝试将 RGB difference 作为模型的一种输入。
+
+TSN 将光流场作为输入捕获运动信息;将扭曲光流场作为输入抑制背景运动,使得专注于视频中的人物运动。
+
+## 训练
+由于数据集较小,为了避免过拟合,作者提出了一系列的训练策略。
+
+### 数据增强
+通过数据增强可生成额外的训练样本,一定程度上能够避免模型的过拟合。two-stream 中采用的数据增强方式有随机裁剪和水平翻转,在 TSN 中作者新增了两种数据增强方法:
+* 角裁剪:仅从图片的边角或中心提取区域,避免默认关注图片的中心;
+* 尺度抖动:将输入图像或者光流场的大小固定为 256×340,裁剪区域的宽和高随机从 {256, 224, 192, 168} 中选择。最终,裁剪区域会被缩放到 224×224 用于网络训练。
+
+### 交叉预训练
+由于空间网络以 RGB 图片作为输入,因此作者在空间网络上直接使用 ImageNet 预训练模型初始化网络的参数。对于以 RGB difference 和光流作为输入的模型,作者提出了交叉预训练技术,使用 RGB 预训练模型初始化时序网络。首先,通过线性变换将光流场离散到从 0 到 255 的区间,使得光流场和 RGB 的取值范围相同;之后修改 RGB 模型的第一个卷积层,对 RGB 通道上的权重进行取均值操作;然后依据时序网络的输入通道数复制 RGB 均值。该策略能够有效的避免时序网络出现过拟合现象。
+
+### 正则化技术
+由于光流分布和 RGB 分布不同,因此除了第一个 BN 层,其余 BN 层的参数都被固定。此外,为了进一步降低过拟合产生的影响,作者在 BN-Inception 的全局 pooling 层后添加一个额外的 dropout 层,其中空间卷积网络的 dropout 比例设置为 0.8;时序卷积网络的 dropout 比例设置为 0.7。
+
+## 数据集
+模型在 HMDB51 和 UCF101 两个主流的动作识别数据集上进行。其中,HMDB51 数据集包含 51 个动作分类的 6766 个视频剪辑;UCF101 数据集包含 13320 个视频剪辑,共 101 类动作。
+
+## 实现细节
+* 基于动量的小批量随机梯度下降算法,momentum 设置为 0.9;
+* batch size 为 256;
+* 使用 ImageNet 预训练模型对网络权重进行初始化;
+* learning rate 调整,对于空间网络,初始化为 0.01,并且每 2000 次迭代后降变为原来的 0.1 倍,训练过程共迭代 4500 次;对于时序网络,初始化为 0.005,并且在第 12000 和 18000 次迭代之后降为原来的 0.1 倍,训练过程共迭代 20000 次;
+* 使用 TVL1 光流算法来提取正常光流场和扭曲光流场。
+* 8 块 TITANX GPUs
+
+## PaddleVideo
+为了加快 TSN 模型的推理速度,PaddleVideo 去掉了与 RGB difference、光流以及扭曲光流相关的部分。
+
+PaddleVideo 中实现稀疏采样的关键代码:
+```python
+frames_len = results['frames_len'] # 视频中总的帧数
+average_dur = int(int(frames_len) / self.num_seg) # 每一段包含的帧数
+frames_idx = [] # 存放采样到的索引
+for i in range(self.num_seg):
+ idx = 0 # 采样的起始位置
+ if not self.valid_mode:
+ # 如果训练
+ if average_dur >= self.seg_len:
+ idx = random.randint(0, average_dur - self.seg_len)
+ idx += i * average_dur
+ elif average_dur >= 1:
+ idx += i * average_dur
+ else:
+ idx = i
+ else:
+ # 如果测试
+ if average_dur >= self.seg_len:
+ idx = (average_dur - 1) // 2
+ idx += i * average_dur
+ elif average_dur >= 1:
+ idx += i * average_dur
+ else:
+ idx = i
+ # 从采样位置采连续的帧
+ for jj in range(idx, idx + self.seg_len):
+ if results['format'] == 'video':
+ frames_idx.append(int(jj % frames_len))
+ elif results['format'] == 'frame':
+ frames_idx.append(jj + 1)
+ else:
+ raise NotImplementedError
+```
+
+PaddleVideo 中实现“段共识”的核心代码:
+```python
+# [N * num_segs, in_channels, 7, 7]
+x = self.avgpool2d(x)
+# [N * num_segs, in_channels, 1, 1]
+if self.dropout is not None:
+ x = self.dropout(x)
+# [N * num_seg, in_channels, 1, 1]
+x = paddle.reshape(x, [-1, num_seg, x.shape[1]])
+# [N, num_seg, in_channels]
+x = paddle.mean(x, axis=1)
+# [N, 1, in_channels]
+x = paddle.reshape(x, shape=[-1, self.in_channels])
+# [N, in_channels]
+score = self.fc(x)
+```
+
+## 广告时间
+如果文档对您理解 TSN 模型有帮助,欢迎👍star🌟,👏fork,您的支持是我们前进的动力⛽️。
+
+## 参考
+[Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859)
diff --git a/docs/zh-CN/tutorials/accelerate.md b/docs/zh-CN/tutorials/accelerate.md
new file mode 100644
index 0000000000000000000000000000000000000000..fcf7a655e8db6d54ebcaf6d1f2b0edd765e47a78
--- /dev/null
+++ b/docs/zh-CN/tutorials/accelerate.md
@@ -0,0 +1,242 @@
+简体中文 | [English](../../en/tutorials/accelerate.md)
+
+- [简介](#简介)
+- [模型运算加速](#模型运算加速)
+- [数据读取加速](#数据读取加速)
+- [训练策略加速](#训练策略加速)
+- [分布式训练](#分布式训练)
+
+
+# 简介
+
+视频任务相比于图像任务的训练往往更加耗时,其原因主要有两点:
+- 数据:视频解码耗时。mp4/mkv等视频文件都是经过encode后的压缩文件,通常需要经过解码和抽帧步骤才能得到原始的图像数据流,之后经过图像变换/增强操作才能将其喂入网络进行训练。如果视频帧数多,解码过程极其耗时。
+- 模型:视频任务使用的模型通常有更大的参数量与计算量。为学习时序特征,视频模型一般会使用3D卷积核/(2+1)D/双流网络,这都会使得模型的参数量与计算量大大增加。
+
+本教程介绍如下视频模型训练加速方法:
+
+- 模型上,通过op融合或混合精度训练的方式提升op运算效率
+- 数据上,通过多进程或者并行计算的方式加速数据读取速度
+- 训练策略上,通过multigrid策略减少训练耗时
+- 多机分布式减少训练耗时
+
+以上训练加速方法都已经集成进PaddleVideo中,欢迎试用~
+
+如非特别说明,本教程所有实验的测试环境如下:
+```
+GPU: v100,4卡*16G
+CPU: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+PaddlePaddle: 2.0.0-rc1
+Cuda: 10.2
+```
+
+
+# 模型运算加速
+
+- [OP融合](##OP融合)
+- [混合精度训练](##混合精度训练)
+
+## OP融合
+
+针对[TSM模型](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/tsm.md),我们实现了[temporal shift op](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/fluid/layers/temporal_shift_cn.html#temporal-shift),在节省显存的同时加速训练过程。
+
+测试方法:
+使用不同形状的Tensor,以不同的方式实现temporal shift,记录显存占用和运行时间。
+
+测试代码:
+
+- temporal shift op实现方式
+```python
+import time
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+
+SHAPE = [32, 16, 32, 32]
+#SHAPE = [128, 64, 128, 128]
+
+otl = []
+input = paddle.randn(SHAPE)
+for i in range(10000):
+ t1 = time.time()
+ out1 = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2)
+ t2 = time.time()
+ ot = t2 - t1
+ if i > 1000:
+ otl.append(ot)
+print("op time: ", sum(otl)/len(otl))
+```
+
+- 组合op实现方式
+```python
+import time
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+
+SHAPE = [32, 16, 32, 32]
+#SHAPE = [128, 64, 128, 128]
+
+def temporal_shift(x, seg_num, shift_ratio):
+ shape = x.shape #[N*T, C, H, W]
+ reshape_x = x.reshape((-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W]
+ pad_x = paddle.fluid.layers.pad(reshape_x, [0,0,1,1,0,0,0,0,0,0,]) #[N, T+2, C, H, W]
+ c1 = int(shape[1] * shift_ratio)
+ c2 = int(shape[1] * 2 * shift_ratio)
+ slice1 = pad_x[:, :seg_num, :c1, :, :]
+ slice2 = pad_x[:, 2:seg_num+2, c1:c2, :, :]
+ slice3 = pad_x[:, 1:seg_num+1, c2:, :, :]
+ concat_x = paddle.concat([slice1, slice2, slice3], axis=2) #[N, T, C, H, W]
+ return concat_x.reshape(shape)
+
+ctl = []
+input = paddle.randn(SHAPE)
+for i in range(10000):
+ t2 = time.time()
+ out2 = temporal_shift(x=input, seg_num=2, shift_ratio=0.2)
+ t3 = time.time()
+ ct = t3 - t2
+ if i > 1000:
+ ctl.append(ct)
+print("combine time: ", sum(ctl)/len(ctl))
+```
+
+性能数据如下:
+
+| 输入tensor形状 | 实现方式 | 显存占用/M| 计算时间/s | 加速比 |
+| :------ | :-----: | :------: | :------: | :------: |
+| 32\*16\*32\*32 |op组合方式 | 1074 | 0.00029325 | baseline |
+| 32\*16\*32\*32 | temporal shift op | 1058 | 0.000045770 | **6.4x** |
+| 128\*64\*128\*128 |op组合方式 | 5160 | 0.0099088 | baseline |
+| 128\*64\*128\*128 | temporal shift op | 2588 | 0.0018617 | **5.3x** |
+
+
+
+## 混合精度训练
+
+Coming soon~
+
+# 数据读取加速
+
+- [更优的解码库Decord](##更优的解码库Decord)
+- [多进程加速Dataloader](##多进程加速Dataloader)
+- [数据预处理DALI](##数据预处理DALI)
+- [预先解码存成图像](##预先解码存成图像)
+
+对于单机训练,视频模型的训练瓶颈大多是在数据预处理上,因此本节主要介绍在数据处理上的一些加速经验。
+
+## 更优的解码库Decord
+
+视频在喂入网络之前,需要经过一系列的数据预处理操作得到数据流,这些操作通常包括:
+
+- 解码: 将视频文件解码成数据流
+- 抽帧: 从视频中抽取部分帧用于网络训练
+- 数据增强:缩放、裁剪、随机翻转、正则化
+
+其中解码是最为耗时的。相较于传统的opencv或pyAV解码库,这里推荐使用性能更优的解码库[decord](https://github.com/dmlc/decord)。目前[SlowFast模型](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/slowfast.md)使用decord进行视频解码([源码](https://github.com/PaddlePaddle/PaddleVideo/blob/main/paddlevideo/loader/pipelines/decode_sampler.py)),对单进程的速度提升有较大作用。
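+
+decord 的基本用法如下(示意代码,视频路径与抽帧数量均为假设值):
+
+```python
+import numpy as np
+from decord import VideoReader, cpu
+
+vr = VideoReader("data/example.mp4", ctx=cpu(0))   # 打开并解码视频
+num_frames = len(vr)                               # 视频总帧数
+
+# 均匀抽取8帧,一次性批量解码
+indices = np.linspace(0, num_frames - 1, 8).astype("int64").tolist()
+frames = vr.get_batch(indices).asnumpy()           # [8, H, W, 3] 的uint8数组
+print(frames.shape)
+```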
+
+我们分别以opencv/decord为解码器,实现SlowFast模型数据预处理pipeline,然后随机从kinetics-400数据集中选取200条视频,计算各pipeline处理每条视频的平均时间。
+
+性能测试数据如下:
+
+| 解码库 | 版本 | pipeline处理每条视频的平均时间/s | 加速比 |
+| :------ | :-----: | :------: | :------: |
+| opencv | 4.2.0 | 0.20965035 | baseline |
+| decord | 0.4.2 | 0.13788146 | **1.52x** |
+
+
+## 多进程加速Dataloader
+
+数据准备好后喂入网络进行训练,网络运算使用GPU并行加速相对较快。对于单个进程来说,速度瓶颈大多在数据处理部分,GPU大部分时间是在等待CPU完成数据预处理。
+飞桨2.0使用[Dataloader](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/io/DataLoader_cn.html#dataloader)进行数据加载,DataLoader支持单进程和多进程的数据加载方式,当 num_workers 大于0时,将使用多进程方式异步加载数据。多进程加速协作,可以overlap掉GPU大部分等待的时间,提升GPU利用率,显著加速训练过程。
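+
+一个开启多进程读取的最小示例如下(数据集用随机数据模拟,仅用于说明`num_workers`的用法):
+
+```python
+import numpy as np
+from paddle.io import Dataset, DataLoader
+
+class RandomVideoDataset(Dataset):
+    """用随机数据模拟一个视频数据集"""
+    def __len__(self):
+        return 64
+
+    def __getitem__(self, idx):
+        imgs = np.random.randn(8, 3, 224, 224).astype("float32")  # [T, C, H, W]
+        label = np.array([idx % 4], dtype="int64")
+        return imgs, label
+
+loader = DataLoader(RandomVideoDataset(),
+                    batch_size=8,
+                    shuffle=True,
+                    num_workers=4,   # 大于0时使用多进程异步加载数据
+                    drop_last=True)
+
+for imgs, labels in loader:
+    print(imgs.shape, labels.shape)  # [8, 8, 3, 224, 224] [8, 1]
+    break
+```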
+
+我们分别设置num_workers为0或4,单卡batch_size统一设置为8,统计训练一个batch的平均耗时。
+
+性能测试数据对比如下:
+| 卡数 | 单卡num_workers | batch_cost/s | ips | 加速比 |
+| :------ | :-----: | :------: |:------: |:------: |
+| 单卡 | 0 | 1.763 | 4.53887 | 单卡baseline |
+| 单卡 | 4 | 0.578 | 13.83729 | **3.04x** |
+| 4卡 | 0 | 1.866 | 4.28733 | 多卡baseline |
+| 4卡 | 4 | 0.615 | 13.00625 | **3.03x** |
+
+其中ips = batch_size/batch_cost,即为训练一个instance(一个video)的平均耗时。
+
+**结合使用decord和飞桨dataloader,加上在数据增强部分做一些细节优化,SlowFast模型训练速度增益为100%,详细数据可以参考[benchmark](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/benchmark.md)**。
+
+## 数据预处理DALI
+
+既然GPU等待CPU进行数据处理耗时,能否把数据处理放到GPU上呢?[NVIDIA DALI](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/)将数据预处理pipeline转移到GPU上执行,可以显著提升训练速度。针对视频文件,DALI提供`VideoReader`op进行解码抽帧操作,但目前其仅支持连续采样的方式进行抽帧。而视频领域常用的2D模型TSN或TSM,它们均采用分段采样方式,即把视频均匀分成N段segument,然后在每个segument内随机选取一帧,最后把选取的帧组合作为输入张量。为此,我们基于DALI进行了二次开发,实现了支持分段采样方式的`VideoReader`op。为方便用户使用,我们提供了配置好的docker运行环境,具体使用方法参考[TSN-DALI使用教程](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/tsn_dali.md)。
+
+测试环境:
+```
+机器: Tesla v100
+显存: 4卡16G
+Cuda: 9.0
+单卡batch_size: 32
+```
+
+性能测试数据如下:
+
+| 加速方式 | batch耗时/s | reader耗时/s | ips:instance/sec | 加速比 |
+| :--------------- | :--------: | :------------: | :------------: | :------------: |
+| DALI | 2.083 | 1.804 | 15.36597 | **1.41x** |
+| Dataloader: 单卡num_workers=4 | 2.943 | 2.649 | 10.87460| baseline |
+| pytorch实现 | TODO | TODO | TODO | TODO |
+
+
+## 预先解码存成图像
+
+这是一种简单直接的方法,既然视频解码耗时,那可以事先将视频解码好,存成图片,模型训练时直接读取图像即可。这种方法可以显著提升视频模型训练速度,但它也有一个很明显的缺点,就是需要耗费大量的内存空间。以kinetics-400数据集为例,共包含24万个训练样本,mp4文件约130G,解码存成图像后,占用的内存空间约为2T,所以这种方法比较适用于较小规模的数据集,如ucf-101。PaddleVideo提供了[预先解码](https://github.com/PaddlePaddle/PaddleVideo/blob/main/data/ucf101/extract_rawframes.py)的脚本,并且[TSN模型](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/tsn.md)和[TSM模型](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/tsm.md)均支持直接使用frame格式的数据进行训练,详细实现参考[源码](https://github.com/PaddlePaddle/PaddleVideo/blob/main/paddlevideo/loader/dataset/frame.py)。
+
+
+测试方法: 数据集选用UCF-101,模型为ppTSM,模型参数参考默认配置[pptsm.yaml](https://github.com/PaddlePaddle/PaddleVideo/blob/main/configs/recognition/tsm/pptsm.yaml),Dataloader的num_workers参数设为0,分别以video和frame格式作为输入,单卡训练,性能数据如下:
+
+| 数据格式 | batch耗时/s | reader耗时/s | ips:instance/sec | reader加速比 | 加速比 |
+| :--------------- | :--------: | :------------: | :------------: | :------------: | :------------: |
+| frame | 1.008 | 0.591 | 15.87405 | 4.79x | **3.22x** |
+| video | 3.249 | 2.832 | 4.92392| baseline | baseline |
+
+
+# 训练策略加速
+
+前述方法大多从工程的角度思考训练速度的提升,在算法策略上,FAIR在CVPR 2020中提出了[Multigrid加速策略算法](https://arxiv.org/abs/1912.00998),它的基本思想如下:
+
+在图像分类任务中,若经过预处理后图像的高度和宽度分别为H和W,batch_size为N,则网络输入batch的Tensor形状为`[N, C, H, W]`,其中C等于3,指RGB三个通道。
+对应到视频任务,由于增加了时序通道,输入batch的Tensor形状为`[N, C, T, H, W]`。
+传统的训练策略中,每个batch的输入Tensor形状都是固定的,即都是`[N, C, T, H, W]`。若以高分辨的图像作为输入,即设置较大的`[T, H, W]`,则模型精度会高一些,但训练会更慢;若以低分辨的图像作为输入,即设置较小的`[T, H, W]`,则可以使用更大的batch size,训练更快,但模型精度会降低。在一个epoch中,能否让不同batch的输入Tensor的形状动态变化,既能提升训练速度,又能保证模型精度?
+
+基于以上思想,FAIR在实验的基础上提出了Multigrid训练策略: 固定`N*C*T*H*W`的值,降低`T*H*W`时增大`N`的值,增大`T*H*W`时减小`N`的值。具体包含两种策略:
+
+- Long cycle: 设完整训练需要N个epoch,将整个训练过程分4个阶段,每个阶段对应的输入tensor形状为:
+```
+[8N, T/4, H/sqrt(2), W/sqrt(2)], [4N, T/2, H/sqrt(2), W/sqrt(2)], [2N, T/2, H, W], [N, T, H, W]
+```
+
+- Short cycle: 在Long cycle的基础上,Short-cycle让每个iter的输入Tensor形状都会发生变化,变化策略为:
+```
+[H/2, W/2], [H/sqrt(2), W/sqrt(2)], [H, W]
+```
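+
+下面的小例子(示意代码,非PaddleVideo源码)按照Long cycle的规则枚举各阶段的输入形状,可以验证`N*T*H*W`近似保持不变:
+
+```python
+import math
+
+def long_cycle_shapes(N, T, H, W):
+    # (batch放大倍数, 时间缩小倍数, 空间缩小倍数)
+    factors = [(8, 4, math.sqrt(2)), (4, 2, math.sqrt(2)), (2, 2, 1), (1, 1, 1)]
+    return [(N * b, T // t, int(H / s), int(W / s)) for b, t, s in factors]
+
+for shape in long_cycle_shapes(N=8, T=32, H=224, W=224):
+    print(shape)  # (64, 8, 158, 158) (32, 16, 158, 158) (16, 16, 224, 224) (8, 32, 224, 224)
+```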
+
+我们基于飞桨实现了Multigrid训练加速策略,对SlowFast模型训练进行加速,使用方法请参考文档[SlowFast训练加速](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/slowfast.md#%E8%AE%AD%E7%BB%83%E5%8A%A0%E9%80%9F)。
+
+测试环境:
+```
+机器: Tesla v100
+显存: 8卡32G
+Cuda: 9.0
+单卡batch_size: 8
+数据集: Kinetics-400
+Paddle版本: 2.0-rc0
+```
+
+性能数据如下:
+
+| 训练策略 | 单个epoch平均耗时/min | 训练总时间/min | 加速比 |
+| :------ | :-----: | :------: |:------: |
+| Multigrid | 27.25 | 9758(6.7天) | 2.89x |
+| Normal | 78.76 | 15438(10.7天) | base |
+
+# 分布式训练
+
+Coming soon~
diff --git a/docs/zh-CN/tutorials/deployment.md b/docs/zh-CN/tutorials/deployment.md
new file mode 100644
index 0000000000000000000000000000000000000000..28084055aeb57efef109eabc538a0a166c8c6551
--- /dev/null
+++ b/docs/zh-CN/tutorials/deployment.md
@@ -0,0 +1,58 @@
+简体中文 | [English](../../en/tutorials/deployment.md)
+
+# 推理
+
+## 如何导出一个用于预测的模型?
+
+为了之后的模型预测和部署,我们需要导出模型结构和模型参数,这里应用了PaddlePaddle最新的动转静能力。
+执行脚本 ```tools/export_model.py```:
+```bash
+python3.7 tools/export_model.py -c 配置文件 -o 输出地址 -p 权重文件
+```
+
+`export_model.py` 中,首先会重新build一个网络,这里注意,有些用于预测的模型初始化参数可能和训练时不一致,请注意更改。
+`export_model.py` 添加了针对TSM的`num_seg`等参数,会用to_static进行动转静转换,并调用jit.save来保存预测模型。注意:这里的InputSpec需要指定一个`假`输入来运行网络。
+
+具体原理请参考 [动转静](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/04_dygraph_to_static/index_cn.html) 官方文档。
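+
+动转静导出的大致过程可以用下面的简化示意代码表示(非`export_model.py`源码,模型与输入形状均为假设值):
+
+```python
+import paddle
+from paddle.static import InputSpec
+
+# 假设net是训练好的动态图模型,这里用一个图像分类模型占位
+net = paddle.vision.models.resnet18()
+net.eval()
+
+# InputSpec用来指定一个“假”输入的形状与类型,None表示batch维可变
+input_spec = [InputSpec(shape=[None, 3, 224, 224], dtype="float32", name="data")]
+static_net = paddle.jit.to_static(net, input_spec=input_spec)
+
+# 保存后会生成 inference/example.pdmodel 与 inference/example.pdiparams
+paddle.jit.save(static_net, "inference/example")
+```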
+
+## 如何检查保存的预测模型正确性?
+
+这里我们提供了```tools/test_export_model.py```脚本用于检查预测模型的正确性。
+
+```bash
+python3 tools/test_export_model.py -p 权重文件 -i 导出的模型文件夹地址 -c 配置文件
+```
+
+`test_export_model.py`只是打印了输出的shape信息,可根据实际需求进行更改,完整的测试流程应该包含下一步:使用预测引擎进行推理
+
+## 如何使用预测引擎进行推理?
+
+这里我们提供了```tools/predict.py``` 进行模型推理。
+
+```bash
+ python3.7 tools/predict.py -v example.avi --model_file "./inference/example.pdmodel" --params_file "./inference/example.pdiparams" --enable_benchmark=False --model="example" --num_seg=8
+ ```
+
+ 对example.avi进行预测并返回预测结果
+
+ ## 如何测试推理速度
+ 我们提供了统一的测试脚本
+
+ ```bash
+ python3.7 tools/predict.py --enable_benchmark=True --model_file=模型文件 --params_file=参数文件
+ ```
+
+ ## 如何使用服务器端C++推理?
+
+ coming soon
+
+ # 部署
+
+ ## 如何使用PaddleHub Serving进行部署?
+
+ coming soon
+
+ ## 如何使用PaddleLite进行端上部署?
+
+ coming soon
+
diff --git a/docs/zh-CN/tutorials/pp-tsm.md b/docs/zh-CN/tutorials/pp-tsm.md
new file mode 100644
index 0000000000000000000000000000000000000000..119db69e3536905caf7d90d51d53b7401f521ca0
--- /dev/null
+++ b/docs/zh-CN/tutorials/pp-tsm.md
@@ -0,0 +1,45 @@
+# PP-TSM高效实用视频识别模型
+
+PP-TSM是PaddleVideo基于TSM优化和改进的视频模型,
+其精度(UCF101和Kinetics400数据集top1)和推理速度分别领先TSM论文及其他开源TSM模型5%和3%以上,
+使用时要求PaddlePaddle 2.0(可使用pip安装)或适当的develop版本。
+
+在仅用ImageNet pretrain情况下,PP-TSM在UCF101和Kinetics400数据集top1分别达到89.5%和73.5%,
+在单卡V100上FP32推理速度为147 VPS(基于Kinetics400数据集)。
+在单卡V100上开启TensorRT下FP16推理速度为TODO。
+
+pp-TSM在Kinetics400上top1精度为73.5%,是至今为止开源的2D视频模型中在相同条件下的最高性能。
+
+PP-TSM从如下方面优化和提升TSM模型的精度和速度:
+1. 基于知识蒸馏的预训练模型,+1.3%
+2. 网络结构微调,+2.5%
+3. 更优的batch size,+0.2%
+4. 更优的L2正则化,+0.3%
+5. label_smoothing,+0.2%
+6. 更优的lr decay,+0.15%
+7. 数据增广,+0.3%
+8. 更优的epoch num,+0.15%
+9. bn策略,+0.4%
+10. 集成PaddleInference进行预测推理
+11. 知识蒸馏、优化器等更多TODO策略
+其中,每项策略的精度提升指标参考上述数据(基于ucf101及k400上进行实验)。
+
+## preciseBN
+
+在介绍preciseBN之前,我们先回顾一下BN(Batch Norm)。BN层是一种正则化层,训练时它根据当前batch的数据按通道计算均值和方差,然后进行归一化运算:`y = gamma * (x - mean) / sqrt(var + eps) + beta`,其中mean、var为当前batch按通道统计的均值和方差,gamma、beta为可学习的缩放和平移参数。
+
+详细介绍可参考[BatchNorm文档](https://paddlepaddle.org.cn/documentation/docs/zh/2.0-rc1/api/paddle/fluid/dygraph/BatchNorm_cn.html#batchnorm)。
+
+假设训练数据的分布和测试数据的分布是一致的,在训练时我们会计算并保存滑动均值和滑动方差,供测试时使用。滑动均值和滑动方差的计算方式如下:`moving_mean = momentum * moving_mean + (1 - momentum) * batch_mean`,`moving_var = momentum * moving_var + (1 - momentum) * batch_var`。
+
+简单的说,moving_mean等于当前batch计算的均值与历史保存的moving_mean的加权和,即为滑动均值。**但滑动均值并不等于真实的均值**,因此测试时的精度仍会受到一定影响。
+为了提升测试精度,我们需要重新计算一个更加精确的均值,这就是preciseBN的目的。
+
+真实的均值如何计算?最直观的想法是,把所有训练数据组成一个batch,输入网络进行前向传播,每经过一个BN层,计算一下当前特征的均值和方差。
+由于训练样本过多,实际操作中不可能这么做。
+所以近似做法是,网络训练完成后,固定住网络中的参数不动,将所有训练数据分成N个batch,依次输入网络进行前向计算,在这个过程中保存下来每个iter的均值和方差,最终得到所有训练样本精确的均值和方差。
+这就是preciseBN的计算方法。具体实现参考[preciseBN](https://github.com/PaddlePaddle/PaddleVideo/blob/main/paddlevideo/utils/precise_bn.py)。
+
+实际使用时,由于迭代所有训练样本比较耗费时间,一般只会跑200个iter左右。
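+
+下面给出一个简化的preciseBN示意代码(非PaddleVideo源码;`_mean`/`_variance`为飞桨BN层内部保存滑动统计量的变量,不同版本可能略有差异,实际使用请以上面链接中的实现为准):
+
+```python
+import paddle
+
+@paddle.no_grad()
+def compute_precise_bn(model, data_loader, num_iters=200):
+    bn_layers = [m for m in model.sublayers()
+                 if isinstance(m, paddle.nn.BatchNorm3D)]
+    stats = {id(bn): [] for bn in bn_layers}
+
+    def make_hook(bn):
+        def hook(layer, inputs):
+            x = inputs[0]                      # [N, C, T, H, W]
+            axes = [0, 2, 3, 4]                # 在通道维以外的维度上统计
+            stats[id(bn)].append((paddle.mean(x, axis=axes),
+                                  paddle.var(x, axis=axes, unbiased=False)))
+        return hook
+
+    hooks = [bn.register_forward_pre_hook(make_hook(bn)) for bn in bn_layers]
+    model.train()                              # 保证前向时BN使用batch统计量
+    for i, (imgs, _) in enumerate(data_loader):
+        if i >= num_iters:
+            break
+        model(imgs)                            # 只做前向,不更新网络参数
+    for h in hooks:
+        h.remove()
+
+    # 用各batch统计量的算术平均覆盖BN层的滑动均值/方差
+    for bn in bn_layers:
+        means, variances = zip(*stats[id(bn)])
+        bn._mean.set_value(paddle.mean(paddle.stack(means), axis=0))
+        bn._variance.set_value(paddle.mean(paddle.stack(variances), axis=0))
+```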
+
+
diff --git a/docs/zh-CN/tutorials/ppagcn.md b/docs/zh-CN/tutorials/ppagcn.md
new file mode 100644
index 0000000000000000000000000000000000000000..f2807847a5b4289144eec5c69f2cfdfae12e89a6
--- /dev/null
+++ b/docs/zh-CN/tutorials/ppagcn.md
@@ -0,0 +1,23 @@
+# PP-AGCN模型详解
+
+---
+
+## 内容
+
+- [ST-GCN模型简介](#ST-GCN模型简介)
+- [PP-AGCN模型改进](#PP-AGCN模型改进)
+
+## ST-GCN模型简介
+
+ST-GCN模型由香港中文大学-商汤科技联合实验室在AAAI 2018中提出,不仅为解决基于人体骨架关键点的人类动作识别问题提供了新颖的思路,在标准的动作识别数据集上也取得了较大的性能提升。
+时空图卷积网络模型ST-GCN通过将图卷积网络(GCN)和时间卷积网络(TCN)结合起来,扩展到时空图模型,设计出了用于行为识别的骨骼点序列通用表示,
+该模型将人体骨骼表示为图,如图2所示,其中图的每个节点对应于人体的一个关节点。图中存在两种类型的边,即符合关节的自然连接的空间边(spatial edge)和在连续的时间步骤中连接相同关节的
+时间边(temporal edge)。在此基础上构建多层的时空图卷积,它允许信息沿着空间和时间两个维度进行整合。
+
+ST-GCN的网络结构大致可以分为三个部分。首先,网络的输入为一个五维矩阵(N, C, T, V, M),其中N为视频数据量;C为关节特征向量,包括(x, y, acc);T为视频中抽取的关键帧数量;V表示关节的数量,在本项目中采用25个关节;M则是一个视频中的人数。随后对输入数据进行Batch Normalization批量归一化。接着,通过设计的ST-GCN单元,引入ATT注意力模型并交替使用GCN图卷积网络和TCN时间卷积网络,对时间和空间维度进行变换,在这一过程中对关节的特征维度进行升维,对关键帧维度进行降维。最后,通过平均池化层、全连接层,并后接SoftMax层输出,对特征进行分类。
+
+
+## PP-AGCN模型改进
diff --git a/docs/zh-CN/tutorials/reletive_issues b/docs/zh-CN/tutorials/reletive_issues
new file mode 100644
index 0000000000000000000000000000000000000000..6db03e0746244ede15351c469afb11fa9d7784cd
--- /dev/null
+++ b/docs/zh-CN/tutorials/reletive_issues
@@ -0,0 +1,77 @@
+video_path is what ? #4510
+https://github.com/PaddlePaddle/models/issues/4510
+
+关于BSN/BMN模型 #4411
+https://github.com/PaddlePaddle/models/issues/4411
+
+微调nextvald的参数,如何加载部分参数呢 #4367
+https://github.com/PaddlePaddle/models/issues/4367
+
+用TSN视频分类模型进行finetune时的问题 #4358
+https://github.com/PaddlePaddle/models/issues/4358
+
+用paddle视频分类模型进行finetune开发报错。 #4353
+https://github.com/PaddlePaddle/models/issues/4353
+
+BMN/BSN模型评估时报错 #4110
+https://github.com/PaddlePaddle/models/issues/4110
+
+The avg losses are not same for training and validation when the same data are used. #4973
+https://github.com/PaddlePaddle/models/issues/4973
+
+How can I load a pretrained model in AttentionCluster to train my own data? #4972
+https://github.com/PaddlePaddle/models/issues/4972
+
+ETS模型的数据处理 #4957
+https://github.com/PaddlePaddle/models/issues/4957
+
+BMN模型推理出现错误 #4881
+https://github.com/PaddlePaddle/models/issues/4881
+
+C-TCN模型数据集不支持MP4格式,MP4转为pickle文件格式需要提供相应处理工具脚本 #4782
+https://github.com/PaddlePaddle/models/issues/4782
+
+CTCN 有没有使用I3D特征的demo #4756
+https://github.com/PaddlePaddle/models/issues/4756
+
+ctcn的数据集.pkl文件的b'feats'和b'scores'是什么?我注意到ctcn_reader.py只用到了b'scores',是否b'scores'才是需要的特征?还有对应的txt文件是什么?假设我需要把BMN的数据集转化为ctcn的数据集,该怎么做? #4750
+https://github.com/PaddlePaddle/models/issues/4750
+
+使用BMN预训练模型训练的时候报错 #4749
+https://github.com/PaddlePaddle/models/issues/4749
+
+使用BMN进行预测时,输出的json文件 视频ID少了两个字符,所有的文件都是这样 #4745
+https://github.com/PaddlePaddle/models/issues/4745
+
+BMN模型batch_size调小之后loss为nan #4738
+https://github.com/PaddlePaddle/models/issues/4738
+
+BMN的输入问题 #4724
+https://github.com/PaddlePaddle/models/issues/4724
+
+报一个video_tag的BUG #4698
+https://github.com/PaddlePaddle/models/issues/4698
+
+PaddleCV-video-ctcn 训练到Epoch21,iter1365停止不动 #4719
+https://github.com/PaddlePaddle/models/issues/4719
+
+STNET跑模型推断,显卡显存充足,提示显存不足 #4608
+https://github.com/PaddlePaddle/models/issues/4608
+
+训练stnet读取kinetics数据集时出线错误 求解决 #4529
+https://github.com/PaddlePaddle/models/issues/4529
+
+有关CTCN视频动作定位的问题 #4508
+https://github.com/PaddlePaddle/models/issues/4508
+
+谁有这个yt8m 的tfrecord? #4506
+https://github.com/PaddlePaddle/models/issues/4506
+
+The NeXtVLAD final model couldn't be used ??? #4502
+https://github.com/PaddlePaddle/models/issues/4502
+
+Hi, I'm wondering if there is an end-to-end solution for the youtube8M attention_lstm model? #4201
+https://github.com/PaddlePaddle/models/issues/4201
+
+CTCN模型训练一段时间后NAN #4123
+https://github.com/PaddlePaddle/models/issues/4123
diff --git a/docs/zh-CN/tutorials/summarize.md b/docs/zh-CN/tutorials/summarize.md
new file mode 100644
index 0000000000000000000000000000000000000000..c37dc240365b3006f5bd41154a9cc00ab38bdabc
--- /dev/null
+++ b/docs/zh-CN/tutorials/summarize.md
@@ -0,0 +1,138 @@
+# 视频分类和动作识别介绍
+
+## 广泛的应用场景
+视频分类在多个领域上都有广泛的应用,如短视频、推荐、搜索、电视台、广告,安防,监控等领域。
+
+## 多种细分任务
+与图像任务相似,视频任务也可以分为分类(识别)和检测任务两大类,结合不同的场景还可以对这两类任务具体进行细分:
+
++ Task1:修剪视频识别(Trimmed Action Recognition)。输入一段只包含一个动作的修剪视频,输出视频分类,如下图所示:
+
+
+ 行为分类
+
+
+ 从使用的数据模态上区分,分类任务还可以继续细分为基于单模态数据的分类和基于多模态数据的分类,基于RGB图像的分类和基于人体骨架的分类等等,如下图所示:
+
+
+
+ 多种模态
+
+从视频的视角上分还可以分为第一人称视角的行为识别和第三人称视角的行为识别,单一视角的识别和多视角融合的识别,有兴趣的用户可自行查阅相关文献。
+
++ Task2:未修剪视频分类(Untrimmed Video Classification)。与修剪视频识别不同的是,未修剪的视频中通常含有多个动作,而且视频很长。有许多动作或许都不是我们所关注的。通过对输入的长视频进行全局分析,然后软分类到多个类别。
+
++ Task3:时序行为提名(Temporal Action Proposal)。类似于图像目标检测任务中的候选框提取。在一段长视频中通常含有很多动作,任务是从视频中找出可能含有动作的视频段。
+
++ Task4:时序行为定位(Temporal Action Localization)。相比于上面的时序行为提名而言,时序行为定位和我们常说的目标检测一致,要求从视频中找到可能存在行为的视频段,并且给视频段分类,如下图所示:
+
+
+ 行为检测
+
+
++ Task5:密集行为描述(Dense-Captioning Events)。之所以称为密集行为描述,主要是因为该任务要求在时序行为定位(检测)的基础上进行视频行为描述。也就是说,该任务需要将一段**未修剪的视频**进行**时序行为定位**得到许多包含行为的视频段后,并对该视频段进行**行为描述**。
+
+
+## 数据集简介
+
+### 视频分类数据集
+
+模型的训练和验证离不开全面、大量以及具有较好标注的数据集。随着视频行为识别研究的不断深入,越来越多的数据集应用于这一领域的研究。典型的数据集如下:
++ KTH数据集[1](#1)
+
+KTH数据集是一个早期的小型行为识别数据集,包括599段视频、6类动作(行走、慢跑、跑步、击拳、挥手、拍手),背景相对静止,除了镜头的拉近拉远,摄像机的运动比较轻微。由于该数据集比较小,训练较大型的3D网络时很容易过拟合,因此当前的大部分研究训练过程多数不基于此数据集。
++ UCF101数据集[2](#2)
+
+UCF101是一个中型数据集视频主要来自于YouTube,包含13320段视频,共101类动作,每类动作由25个人完成,每个人做4-7组动作。在Kinetics数据集发布之前UCF101和HMDB51数据集在很长的一段时间里被作为benchmark用于评估行为识别方法的效果。
++ HMDB51数据集[3](#3)
+
+布朗大学(Brown University)提出的HMDB51数据集于2011年发布,视频多数来源于电影,还有一部分来自公共数据库以及YouTube等网络视频库。数据库包含6849段样本,分为51类,每类至少包含101段样本。
++ Kinetics数据集[4](#4)
+
+Kinetics是当前最为重要的一个大型行为识别数据集,该数据集在2017年由Google的Deepmind团队提出,视频数据同样来自于YouTube,总共400个类别(现已经扩充到700类),30多万段视频数据(Kinetics-700已经扩充到了60多万段视频),每段视频持续10秒左右。动作类别主要分为三大类:“人”,“人与动物”,“人与人互动”。Kinetics数据集可以训练3D-Resnet达到152层而不发生过拟合,解决了之前训练数据集过小难以训练深层3D网络的困境。当前Kinetics已经取代了UCF101和HMDB51成为了行为识别领域的benchmark。当前,大多数研究都采用此数据集进行效果评估和预训练。
++ Something-Something数据集[5](#5)
+
+SomethingV1包含108499段标注视频(V2已经扩展到了220847),每一个时长都在2到6秒之间。这些视频包含了174种类别的动作,与前面的数据集不同此数据集的识别需要更强的时间信息,因此在检验模型时域建模能力方面此数据集具有很重要的参考价值。
+除了以上的主流数据集外目前还有复杂动作识别的Charades[6](#6)数据集、Breakfast Action[7](#7)数据集、以及百万级别的体育视频数据集Sports 1M[8](#8)。
+
+### 检测任务数据集
+
++ THUMOS 2014
+
+来自于THUMOS Challenge 2014。它的训练集为UCF101数据集,验证集和测试集分别包括1010和1574个未分割的视频片段。在行为检测任务中只有20类动作的未分割视频是有时序行为片段标注的,包括200个验证集视频(3007个行为片段)和213个测试集视频(包含3358个行为片段)。
+
++ MEXaction2
+
+MEXaction2数据集中包含两类动作:骑马和斗牛。该数据集由三个部分组成:YouTube视频,UCF101中的骑马视频以及INA视频。其中YouTube视频片段和UCF101中的骑马视频是分割好的短视频片段,被用于训练集。而INA视频为多段长的未分割的视频,时长共计77小时,且被分为训练,验证和测试集三部分。训练集中共有1336个行为片段,验证集中有310个行为片段,测试集中有329个行为片断。且MEXaction2数据集的特点是其中的未分割视频长度都非常长,被标注的行为片段仅占视频总长的很低比例。
+
++ ActivityNet
+
+目前最大的数据集,同样包含分类和检测两个任务。这个数据集仅提供视频的YouTube链接,而不能直接下载视频,所以还需要用python中的YouTube下载工具来自动下载。该数据集包含200个动作类别,约20000个视频(训练+验证+测试集),视频时长共计约700小时。
+
+
+## 经典模型简介
+如图所示,动作识别框架主要包括三个步骤:特征提取、运动表示和分类。其中,如何提取视频的时空特征是行为识别和视频分类的核心问题。
+
+
+行为识别框架
+
+依据使用方法的不同,总体上可以将行为识别(视频分类)方法概括为基于手工特征的方法阶段和基于深度学习的方法阶段。基于手工特征的方法阶段比较典型的运动描述子有DT(Dense Trajectories)和IDT(Improved Dense Trajectories),这也是深度学习应用于这一领域之前为大家所公认的最为优秀的运动描述子,感兴趣的读者可以自行查阅文末的相关参考文献。从2014年起,深度学习的方法逐渐开始应用于视频分类领域,目前基于深度学习的方法已经成为了学术界的研究热点,并且从实际的应用效果上看也远远超越了手工设计的运动特征。从2014年至今,围绕着如何表征运动特征这一问题,学术界提出了许多经典的网络结构,如下图所示:
+
+
+典型的方法
+
+
+目前PaddleVideo模型库中已经囊括了TSN[9](#9)、TSM[10](#10)、SlowFast[11](#11)等经典的行为识别网络,我们后续会陆续对视频领域的经典模型和论文进行详细解析,敬请期待!
+
+
+## 相关比赛介绍
++ [ActivityNet](http://activity-net.org/challenges/2020/challenge.html)
+
+ActivityNet是一个大规模行为识别竞赛,自2016年开始,每年与CVPR同时进行,到今年为止已经连续举办了4届。它侧重于从用户产生的视频中识别出日常生活,高层次,面向目标的活动,视频取自互联网视频门户Youtube。目前,ActivityNet比赛已经成为了行为识别领域影响力最大的比赛。
+
+
+## Reference
+
+
+[1] Schuldt C, Laptev I, Caputo B.Recognizing Human Actions: A Local SVM Approach Proceedings of International Conference on Pattern Recognition. Piscataway, NJ: IEEE, 2004:23-26
+
+
+
+[2] Soomro K, Zamir A R, Shah M. UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild. arXiv:1212.0402,2012.
+
+
+
+[3] Kuehne H, Jhuang H, Garrote E, et al. HMDB: a large video database for human motion recognition Proceedings of IEEE International Conference on Computer Vision. Piscataway, NJ: IEEE, 2011:2556-2563.
+
+
+
+[4] Carreira J , Zisserman A . Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2017:6299-6308.
+
+
+
+[5] Goyal R, Kahou S E, Michalski V. The “something something” video database for learning and evaluating visual common sense. arXiv:1706.04261,2017.
+
+
+
+[6] Sigurdsson G A, Varol Gül, Wang Xiaolong, et al. Hollywood in Homes: Crowdsourcing Data Collection for Activity Understanding. arXiv:1604.01753, 2016
+
+
+
+[7] Kuehne H, Arslan A, Serre T. The Language of Actions Recovering the Syntax and Semantics of Goal-Directed Human Activities Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2014.
+
+
+
+[8] Karpathy A , Toderici G , Shetty S , et al. Large-Scale Video Classification with Convolutional Neural Networks Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2014:1725-1732.
+
+
+
+[9] Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, and Luc Van Gool. Temporal segment networks for action recognition in videos. In Proceedings of the European Conference on Computer Vision, pages 20–36. Springer, 2016.
+
+
+
+[10] Lin Ji , Gan Chuang , Han Song . TSM: Temporal Shift Module for Efficient Video Understanding. arXiv:1811.08383,2018.
+
+
+
+[11] Feichtenhofer C , Fan Haoqi , Malik J , et al. SlowFast Networks for Video Recognition. arXiv:1812.03982,2018.
+
diff --git a/docs/zh-CN/usage.md b/docs/zh-CN/usage.md
new file mode 100644
index 0000000000000000000000000000000000000000..816999993e1edbd64bbb989940b2778ca2695be9
--- /dev/null
+++ b/docs/zh-CN/usage.md
@@ -0,0 +1,189 @@
+简体中文 | [English](../en/start.md)
+
+# 使用指南
+---
+
+* [1. 模型训练](#1)
+* [2. 模型恢复训练](#2)
+* [3. 模型微调](#3)
+* [4. 模型测试](#4)
+* [5. 模型推理](#5)
+
+
+请参考[安装指南](./install.md)配置运行环境,PaddleVideo目前支持Linux下的GPU单卡和多卡运行环境。
+
+
+
+
+## 1. 模型训练
+
+PaddleVideo支持单机单卡和单机多卡训练,单卡训练和多卡训练的启动方式略有不同。
+
+### 1.1 单卡训练
+
+启动脚本示例:
+
+```bash
+export CUDA_VISIBLE_DEVICES=0 #指定使用的GPU显卡id
+python3.7 main.py --validate -c configs_path/your_config.yaml
+```
+- `-c` 必选参数,指定运行的配置文件路径,具体配置参数含义参考[配置文档](./contribute/config.md#config-yaml-details)
+- `--validate` 可选参数,指定训练时是否评估
+- `-o`: 可选参数,指定重写参数,例如: `-o DATASET.batch_size=16` 用于重写train时batch size大小
+
+### 1.2 多卡训练
+
+通过`paddle.distributed.launch`启动,启动脚本示例:
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=your_log_dir main.py --validate -c configs_path/your_config.yaml
+```
+- `--gpus`参数指定使用的GPU显卡id
+- `--log_dir`参数指定日志保存目录
+多卡训练详细说明可以参考[单机多卡训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.1/guides/02_paddle2.0_develop/06_device_cn.html#danjiduokaxunlian)
+
+
+我们将所有标准的启动命令都放在了```run.sh```中,直接运行`sh run.sh`即可方便地启动多卡训练与测试,注意选择想要运行的脚本:
+```shell
+sh run.sh
+```
+
+### 1.3 输出日志
+
+运行训练命令,将会输出运行日志,并默认保存在./log目录下,如:`worker.0` , `worker.1` ... , worker日志文件对应每张卡上的输出
+
+【train阶段】打印当前时间,当前epoch/epoch总数,当前batch id,评估指标,耗时,ips等信息:
+```txt
+[09/24 14:13:00] epoch:[ 1/1 ] train step:100 loss: 5.31382 lr: 0.000250 top1: 0.00000 top5: 0.00000 batch_cost: 0.73082 sec, reader_cost: 0.38075 sec, ips: 5.47330 instance/sec.
+```
+
+【eval阶段】打印当前时间,当前epoch/epoch总数,当前batch id,评估指标,耗时,ips等信息:
+```txt
+[09/24 14:16:55] epoch:[ 1/1 ] val step:0 loss: 4.42741 top1: 0.00000 top5: 0.00000 batch_cost: 1.37882 sec, reader_cost: 0.00000 sec, ips: 2.90104 instance/sec.
+```
+
+【epoch结束】打印当前时间,评估指标,耗时,ips等信息:
+```txt
+[09/24 14:18:46] END epoch:1 val loss_avg: 5.21620 top1_avg: 0.02215 top5_avg: 0.08808 avg_batch_cost: 0.04321 sec, avg_reader_cost: 0.00000 sec, batch_cost_sum: 112.69575 sec, avg_ips: 8.41203 instance/sec.
+```
+
+当前为评估结果最好的epoch时,打印最优精度:
+```txt
+[09/24 14:18:47] Already save the best model (top1 acc)0.0221
+```
+
+### 1.4 输出存储路径
+
+- PaddleVideo各文件夹的默认存储路径如下:
+
+```
+PaddleVideo
+ ├── paddlevideo
+ ├── ... #other source codes
+ ├── output #output 权重、优化器参数等存储路径
+ | ├── example
+ | | ├── example_best.pdparams #path_to_weights
+ | | └── ...
+ | └── ...
+ ├── log #log存储路径
+ | ├── worker.0
+ | ├── worker.1
+ | └── ...
+ └── inference #预测文件存储路径
+ ├── example.pdiparams file
+ ├── example.pdmodel file
+ └── example.pdiparams.info file
+```
+
+- 训练Epoch默认从1开始计数,参数文件的保存格式为`ModelName_epoch_00001.pdparams`,命名中的数字对应Epoch编号。
+
+
+
+
+## 2. 模型恢复训练
+
+如果训练任务终止,可以加载断点权重文件(优化器-学习率参数,断点文件)继续训练。
+需要指定`-o resume_epoch`参数,该参数表示从```resume_epoch```轮开始继续训练.
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python3 -m paddle.distributed.launch \
+ --gpus="0,1,2,3" \
+ main.py \
+ -c ./configs/example.yaml \
+ --validate \
+ -o resume_epoch=5
+```
+
+
+
+## 3. 模型微调
+
+进行模型微调(Finetune),对自定义数据集进行模型微调,需要指定 `--weights` 参数来加载预训练模型。
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python3 -m paddle.distributed.launch \
+ --gpus="0,1,2,3" \
+ main.py \
+ -c ./configs/example.yaml \
+ --validate \
+ --weights=./output/example/path_to_weights
+```
+
+PaddleVideo会自动**不加载**shape不匹配的参数
+
+
+
+
+## 4. 模型测试
+
+需要指定 `--test`来启动测试模式,并指定`--weights`来加载预训练模型。
+
+```bash
+python3 -m paddle.distributed.launch \
+ --gpus="0,1,2,3" \
+ main.py \
+ -c ./configs/example.yaml \
+ --test \
+ --weights=./output/example/path_to_weights
+```
+
+
+
+## 5. 模型推理
+
+通过导出inference模型,PaddlePaddle支持使用预测引擎进行预测推理。接下来介绍如何用预测引擎进行推理:
+首先,对训练好的模型进行转换
+指定`-c`参数加载配置文件,指定`-p`参数加载模型权重,指定`-o`用于指定转换后模型的存储路径。
+
+```bash
+python tools/export_model.py \
+ -c ./configs/example.yaml \
+ -p ./output/example/path_to_weights \
+ -o ./inference
+```
+
+
+上述命令将生成模型结构文件(`model_name.pdmodel`)和模型权重文件(`model_name.pdiparams`),然后可以使用预测引擎进行推理:
+
+```bash
+python tools/predict.py \
+ --input_file "data/example.avi" \
+ --model_file "./inference/TSN.pdmodel" \
+ --params_file "./inference/TSN.pdiparams" \
+ --use_gpu=True \
+ --use_tensorrt=False
+```
+
+其中:
+
++ `input_file`:待预测的文件路径或文件夹路径,如 `./test.avi`
++ `model_file`:模型结构文件路径,如 `./inference/TSN.pdmodel`
++ `params_file`:模型权重文件路径,如 `./inference/TSN.pdiparams`
++ `use_tensorrt`:是否使用 TesorRT 预测引擎,默认值:`False`
++ `use_gpu`:是否使用 GPU 预测,默认值:`True`
+
+各模型详细的使用文档,可以参考[Models](./model_zoo/README.md)
diff --git a/docs/zh-CN/whl_zh.md b/docs/zh-CN/whl_zh.md
new file mode 100644
index 0000000000000000000000000000000000000000..e586ffe58a1227b59b33338b98aba3fd7f5189cc
--- /dev/null
+++ b/docs/zh-CN/whl_zh.md
@@ -0,0 +1,181 @@
+简体中文 | [English](../en/whl_en.md)
+# paddlevideo包使用教程
+
+## 快速开始
+
+### 安装
+
+使用pypi安装
+```bash
+python3.7 -m pip install paddlevideo==0.0.1
+```
+**注意:** 在下载opencv-python的过程中你可能遇到困难,可以尝试使用其他源进行安装,例如:
+```
+python3.7 -m pip install opencv-python==4.2.0.32 -i https://pypi.doubanio.com/simple
+```
+
+本地打包whl文件并安装
+```bash
+python3.7 setup.py bdist_wheel
+python3.7 -m pip install dist/paddlevideo-0.0.1-py3-none-any.whl
+```
+
+### 1. 快速开始
+
+* 指定 `video_file='data/example.avi'`, 使用飞桨提供的推理模型 `model_name='ppTSM'`
+
+
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_name='ppTSM',use_gpu=False,use_tensorrt=False)
+video_file='data/example.avi'
+result=clas.predict(video_file)
+print(result)
+```
+
+```
+ >>> result
+ [{'videoname': 'data/example.avi', 'class_ids': [5], 'scores': [0.9621570706367493], 'label_names': ['archery']}]
+```
+
+* 使用命令行方式启动程序
+```bash
+ppvideo --model_name='ppTSM' --video_file='data/example.avi'
+```
+
+```
+ >>> result
+ **********data/example.avi**********
+ [{'videoname': 'data/example.avi', 'class_ids': [5], 'scores': [0.9621570706367493], 'label_names': ['archery']}]
+```
+
+### 2. 参数介绍
+* model_name(str): 模型的名字. 如果不指定`model_file`和`params_file`你需要指定这个参数来使用飞桨提供的在K400数据集上预训练的模型,默认设置为ppTSM
+* video_file(str): 视频文件路径. 支持:本地单一视频文件,包含多个视频文件的文件夹,numpy数组。
+* use_gpu(bool): 是否使用GPU,默认为不使用。
+* num_seg(int): TSN提出的分段采样策略中分段的数量。
+* seg_len(int): 每个分段上采样的帧数。
+* short_size(int): 将帧的短边调整为多少像素,默认为256。
+* target_size(int): 调整帧的尺寸为目标尺寸,默认为224。
+* normalize(bool): 是否对帧进行归一化。默认为True。
+* model_file(str): 推理模型的模型文件(inference.pdmodel)的路径,如果不指定这个参数,你需要指定`model_name`来进行下载。
+* params_file(str): 推理模型的参数文件(inference.pdiparams)的路径,如果不指定这个参数,你需要指定`model_name`来进行下载。
+* batch_size(int): Batch size, 默认为1。
+* use_fp16(bool): 是否使用float16,默认为False。
+* use_tensorrt(bool): 是否使用Tensorrt,默认为False。
+* gpu_mem(int): GPU使用显存大小,默认为8000。
+* top_k(int): 指定返回的top_k,默认为1。
+* enable_mkldnn(bool): 是否使用MKLDNN,默认为False。
+
+
+### 3. 不同使用方式介绍
+
+**我们提供两种不同的使用方式:1.使用python交互式编程 2.使用命令行方式**
+
+* 查看帮助信息
+```bash
+ppvideo -h
+```
+
+* 使用用户指定的模型,你需要指定模型文件的路径 `model_file` 和参数文件路径 `params_file`
+
+###### python
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_file='user-specified model path',
+ params_file='params path', use_gpu=False, use_tensorrt=False)
+video_file = ''
+result=clas.predict(video_file)
+print(result)
+```
+
+###### bash
+```bash
+ppvideo --model_file='user-specified model path' --params_file='params path' --video_file='video path'
+```
+
+* 使用飞桨提供的推理模型进行预测,你需要通过指定 `model_name`参数来选择一个模型对ppvideo进行初始化,这时你不需要指定 `model_file`文件,你所选择的model预训练模型会自动下载到 `BASE_INFERENCE_MODEL_DIR`目录中以 `model_name`命名的文件夹下
+
+###### python
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_name='ppTSM',use_gpu=False, use_tensorrt=False)
+video_file = ''
+result = clas.predict(video_file)
+print(result)
+```
+
+###### bash
+```bash
+ppvideo --model_name='ppTSM' --video_file='video path'
+```
+
+* 你可以将 `np.ndarray`形式的数组作为输入,同样以 `--video_file=np.ndarray`方式指定即可
+
+###### python
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_name='ppTSM',use_gpu=False, use_tensorrt=False)
+video_file = np.ndarray
+result = clas.predict(video_file)
+```
+
+###### bash
+```bash
+ppvideo --model_name='ppTSM' --video_file=np.ndarray
+```
+
+* 你可以将 `video_file`指定为一个包含多个视频文件的路径,同样也可以指定 `top_k`参数
+
+###### python
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_name='ppTSM',use_gpu=False, use_tensorrt=False,top_k=5)
+video_file = '' # it can be video_file folder path which contains all of videos you want to predict.
+result = clas.predict(video_file)
+print(result)
+```
+
+###### bash
+```bash
+ppvideo --model_name='ppTSM' --video_file='video path' --top_k=5
+```
+
+* 你可以指定 `--label_name_path`为你自己的标签文件,**注意** 格式必须为(类别ID 类别名)
+
+```
+0 abseiling
+1 air_drumming
+2 answering_questions
+3 applauding
+4 applying_cream
+5 archery
+......
+```
+
+* 如果你使用的是飞桨提供的推理模型,你不需要指定`label_name_path`,程序将默认使用`data/k400/Kinetics-400_label_list.txt`;如果你想使用你自己训练的模型,你需要提供你自己的label文件,否则模型只能输出预测的分数而没有类别名称
+
+###### python
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_file= './inference.pdmodel',params_file = './inference.pdiparams',label_name_path='./data/k400/Kinetics-400_label_list.txt',use_gpu=False)
+video_file = '' # it can be video_file folder path which contains all of videos you want to predict.
+result = clas.predict(video_file)
+print(result)
+```
+###### bash
+```bash
+ppvideo --model_file='./inference.pdmodel' --params_file='./inference.pdiparams' --video_file='video path' --label_name_path='./data/k400/Kinetics-400_label_list.txt'
+```
+###### python
+```python
+from ppvideo import PaddleVideo
+clas = PaddleVideo(model_name='ppTSM',use_gpu=False)
+video_file = '' # it can be video_file folder path which contains all of videos you want to predict.
+result = clas.predict(video_file)
+print(result)
+```
+###### bash
+```bash
+ppvideo --model_name='ppTSM' --video_file='video path'
+```
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..feb782a18b3563e0fff20befd21f3e3700f529d0
--- /dev/null
+++ b/main.py
@@ -0,0 +1,139 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import random
+
+import numpy as np
+import paddle
+
+from paddlevideo.tasks import (test_model, train_dali, train_model,
+ train_model_multigrid)
+from paddlevideo.utils import get_config, get_dist_info
+
+
+def parse_args():
+ parser = argparse.ArgumentParser("PaddleVideo train script")
+ parser.add_argument('-c',
+ '--config',
+ type=str,
+ default='configs/example.yaml',
+ help='config file path')
+ parser.add_argument('-o',
+ '--override',
+ action='append',
+ default=[],
+ help='config options to be overridden')
+ parser.add_argument('--test',
+ action='store_true',
+ help='whether to test a model')
+ parser.add_argument('--train_dali',
+ action='store_true',
+ help='whether to use dali to speed up training')
+ parser.add_argument('--multigrid',
+ action='store_true',
+ help='whether to use multigrid training')
+ parser.add_argument('-w',
+ '--weights',
+ type=str,
+ help='weights for finetuning or testing')
+ parser.add_argument('--fleet',
+ action='store_true',
+ help='whether to use fleet run distributed training')
+ parser.add_argument('--amp',
+ action='store_true',
+ help='whether to open amp training.')
+ parser.add_argument(
+ '--amp_level',
+ type=str,
+ default=None,
+ help="optimize level when open amp training, can only be 'O1' or 'O2'.")
+ parser.add_argument(
+ '--validate',
+ action='store_true',
+ help='whether to evaluate the checkpoint during training')
+ parser.add_argument(
+ '--seed',
+ type=int,
+ default=1234,
+ help='fixed all random seeds when the program is running')
+ parser.add_argument(
+ '--max_iters',
+ type=int,
+ default=None,
+ help='max iterations when training(this arg only used in test_tipc)')
+ parser.add_argument(
+ '-p',
+ '--profiler_options',
+ type=str,
+ default=None,
+ help='The option of profiler, which should be in format '
+ '\"key1=value1;key2=value2;key3=value3\".')
+ parser.add_argument('--use_npu',
+ type=bool,
+ default=False,
+ help='whether use npu.')
+
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+ cfg = get_config(args.config, overrides=args.override)
+
+ # set seed if specified
+ seed = args.seed
+ if seed is not None:
+ assert isinstance(
+ seed, int), f"seed must be a integer when specified, but got {seed}"
+ random.seed(seed)
+ np.random.seed(seed)
+ paddle.seed(seed)
+
+ # set amp_level if amp is enabled
+ if args.amp:
+ if args.amp_level is None:
+ args.amp_level = 'O1' # set defaualt amp_level to 'O1'
+ else:
+ assert args.amp_level in [
+ 'O1', 'O2'
+ ], f"amp_level must be 'O1' or 'O2' when amp enabled, but got {args.amp_level}."
+
+ _, world_size = get_dist_info()
+ parallel = world_size != 1
+ if parallel:
+ paddle.distributed.init_parallel_env()
+
+ if args.test:
+ test_model(cfg, weights=args.weights, parallel=parallel)
+ elif args.train_dali:
+ train_dali(cfg, weights=args.weights, parallel=parallel)
+ elif args.multigrid:
+ train_model_multigrid(cfg,
+ world_size=world_size,
+ validate=args.validate)
+ else:
+ train_model(cfg,
+ weights=args.weights,
+ parallel=parallel,
+ validate=args.validate,
+ use_fleet=args.fleet,
+ use_amp=args.amp,
+ amp_level=args.amp_level,
+ max_iters=args.max_iters,
+ profiler_options=args.profiler_options)
+
+
+if __name__ == '__main__':
+ main()
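
For quick reference, the flow above can also be driven programmatically. A minimal sketch, assuming the bundled configs/example.yaml is valid, a single-card non-distributed run, and that train_model's remaining options keep their defaults:

# Programmatic equivalent of: python main.py -c configs/example.yaml --validate
# (sketch only; options not passed here are assumed to have sensible defaults)
from paddlevideo.tasks import train_model
from paddlevideo.utils import get_config

cfg = get_config('configs/example.yaml', overrides=[])
train_model(cfg, weights=None, parallel=False, validate=True)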
diff --git a/paddlevideo/__init__.py b/paddlevideo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b03acf29696e71c21ed2d7bfc3a908b7f7c9c48
--- /dev/null
+++ b/paddlevideo/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .version import paddlevideo_version
diff --git a/paddlevideo/loader/__init__.py b/paddlevideo/loader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ed9b11a7018369a4df0253eb625d4bf88284f15
--- /dev/null
+++ b/paddlevideo/loader/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .builder import build_dataset, build_dataloader, build_batch_pipeline
+from .dataset import VideoDataset
+from .dali_loader import TSN_Dali_loader, get_input_data
+
+__all__ = [
+ 'build_dataset', 'build_dataloader', 'build_batch_pipeline', 'VideoDataset',
+ 'TSN_Dali_loader', 'get_input_data'
+]
diff --git a/paddlevideo/loader/builder.py b/paddlevideo/loader/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..23a65c3bf497881f355f85fb518d3f47e03b46aa
--- /dev/null
+++ b/paddlevideo/loader/builder.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import signal
+import os
+import paddle
+from paddle.io import DataLoader, DistributedBatchSampler
+from .registry import DATASETS, PIPELINES
+from ..utils.build_utils import build
+from .pipelines.compose import Compose
+from paddlevideo.utils import get_logger
+from paddlevideo.utils.multigrid import DistributedShortSampler
+import numpy as np
+
+logger = get_logger("paddlevideo")
+
+
+def build_pipeline(cfg):
+ """Build pipeline.
+ Args:
+ cfg (dict): root config dict.
+ """
+    if cfg is None:
+ return
+ return Compose(cfg)
+
+
+def build_dataset(cfg):
+ """Build dataset.
+ Args:
+ cfg (dict): root config dict.
+
+ Returns:
+ dataset: dataset.
+ """
+ #XXX: ugly code here!
+ cfg_dataset, cfg_pipeline = cfg
+ cfg_dataset.pipeline = build_pipeline(cfg_pipeline)
+ dataset = build(cfg_dataset, DATASETS, key="format")
+ return dataset
+
+
+def build_batch_pipeline(cfg):
+
+ batch_pipeline = build(cfg, PIPELINES)
+ return batch_pipeline
+
+
+def build_dataloader(dataset,
+ batch_size,
+ num_workers,
+ places,
+ shuffle=True,
+ drop_last=True,
+ multigrid=False,
+ collate_fn_cfg=None,
+ **kwargs):
+ """Build Paddle Dataloader.
+
+    The dataset is wrapped in a DistributedBatchSampler (or a
+    DistributedShortSampler when multigrid training is enabled), and an
+    optional mix-style batch pipeline can be applied via collate_fn_cfg.
+
+    Args:
+        dataset (paddle.dataset): A PaddlePaddle dataset object.
+        batch_size (int): batch size on a single card.
+        num_workers (int): number of worker subprocesses for data loading.
+        places (paddle.Place|list): device(s) on which to place the batches.
+        shuffle (bool): whether to shuffle the data at every epoch.
+        drop_last (bool): whether to drop the last incomplete batch.
+ """
+ if multigrid:
+ sampler = DistributedShortSampler(dataset,
+ batch_sizes=batch_size,
+ shuffle=True,
+ drop_last=True)
+ else:
+ sampler = DistributedBatchSampler(dataset,
+ batch_size=batch_size,
+ shuffle=shuffle,
+ drop_last=drop_last)
+
+    #NOTE(shipping): when a mix operator such as mixup or cutmix is switched on,
+    # a batch like [[img, label, attribute, ...], [img, label, attribute, ...], ...] is recollated to
+    # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...], similar to a numpy.transpose.
+
+ def mix_collate_fn(batch):
+ pipeline = build_batch_pipeline(collate_fn_cfg)
+ batch = pipeline(batch)
+ slots = []
+ for items in batch:
+ for i, item in enumerate(items):
+ if len(slots) < len(items):
+ slots.append([item])
+ else:
+ slots[i].append(item)
+ return [np.stack(slot, axis=0) for slot in slots]
+
+ #if collate_fn_cfg is not None:
+ #ugly code here. collate_fn is mix op config
+ # collate_fn = mix_collate_fn(collate_fn_cfg)
+
+ data_loader = DataLoader(
+ dataset,
+ batch_sampler=sampler,
+ places=places,
+ num_workers=num_workers,
+ collate_fn=mix_collate_fn if collate_fn_cfg is not None else None,
+ return_list=True,
+ **kwargs)
+
+ return data_loader
+
+
+def term_mp(sig_num, frame):
+ """ kill all child processes
+ """
+ pid = os.getpid()
+ pgid = os.getpgid(os.getpid())
+ logger.info("main proc {} exit, kill process group " "{}".format(pid, pgid))
+ os.killpg(pgid, signal.SIGKILL)
+ return
+
+
+signal.signal(signal.SIGINT, term_mp)
+signal.signal(signal.SIGTERM, term_mp)
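
To make the recollation described in the NOTE above concrete, here is a self-contained sketch (toy shapes, illustration only) of how a list of per-sample tuples is regrouped into per-field arrays, which is what mix_collate_fn does after running the batch pipeline:

import numpy as np

def regroup(batch):
    # [[img, label], [img, label], ...] -> [stacked_imgs, stacked_labels]
    slots = []
    for items in batch:
        for i, item in enumerate(items):
            if len(slots) < len(items):
                slots.append([item])
            else:
                slots[i].append(item)
    return [np.stack(slot, axis=0) for slot in slots]

batch = [(np.zeros((3, 224, 224)), np.array([1])),
         (np.ones((3, 224, 224)), np.array([0]))]
imgs, labels = regroup(batch)
print(imgs.shape, labels.shape)  # (2, 3, 224, 224) (2, 1)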
diff --git a/paddlevideo/loader/dali_loader.py b/paddlevideo/loader/dali_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..73fe64fcc5d3a0340b937f6128fdbc8c82cd71c7
--- /dev/null
+++ b/paddlevideo/loader/dali_loader.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import math
+
+from paddle.distributed import ParallelEnv
+import paddle.distributed as dist
+from paddle.fluid.dygraph import to_variable
+from paddlevideo.utils import get_logger
+logger = get_logger("paddlevideo")
+
+try:
+ from nvidia.dali.pipeline import Pipeline
+ import nvidia.dali.ops as ops
+ import nvidia.dali.types as types
+ import tempfile
+ from nvidia.dali.plugin.paddle import DALIGenericIterator
+except Exception:
+    Pipeline = object
+    logger.info(
+        "DALI is not installed, you can improve performance by installing and using DALI")
+
+
+def get_input_data(data):
+ return to_variable(data[0]['image']), to_variable(data[0]['label'])
+
+
+class TSN_Dali_loader(object):
+ def __init__(self, cfg):
+ self.batch_size = cfg.batch_size
+ self.file_path = cfg.file_path
+
+ self.num_seg = cfg.num_seg
+ self.seglen = cfg.seglen
+ self.short_size = cfg.short_size
+ self.target_size = cfg.target_size
+
+ # set num_shards and shard_id when distributed training is implemented
+ self.num_shards = dist.get_world_size()
+ self.shard_id = ParallelEnv().local_rank
+ self.dali_mean = cfg.mean * (self.num_seg * self.seglen)
+ self.dali_std = cfg.std * (self.num_seg * self.seglen)
+
+ def build_dali_reader(self):
+ """
+ build dali training reader
+ """
+ def reader_():
+ with open(self.file_path) as flist:
+ full_lines = [line for line in flist]
+ if (not hasattr(reader_, 'seed')):
+ reader_.seed = 0
+ random.Random(reader_.seed).shuffle(full_lines)
+ logger.info(f"reader shuffle seed: {reader_.seed}.")
+ if reader_.seed is not None:
+ reader_.seed += 1
+
+ per_node_lines = int(
+ math.ceil(len(full_lines) * 1.0 / self.num_shards))
+ total_lines = per_node_lines * self.num_shards
+
+            # pad full_lines so that it is evenly divisible by num_shards
+ full_lines += full_lines[:(total_lines - len(full_lines))]
+ assert len(full_lines) == total_lines
+
+ # trainer get own sample
+ lines = full_lines[self.shard_id:total_lines:self.num_shards]
+ assert len(lines) == per_node_lines
+
+ logger.info(
+ f"shard_id: {self.shard_id}, trainer_count: {self.num_shards}"
+ )
+ logger.info(
+ f"read videos from {self.shard_id * per_node_lines}, "
+ f"length: {per_node_lines}, "
+ f"lines length: {len(lines)}, "
+ f"total: {len(full_lines)}")
+
+ video_files = ''
+ for item in lines:
+ video_files += item
+ tf = tempfile.NamedTemporaryFile()
+ tf.write(str.encode(video_files))
+ tf.flush()
+ video_files = tf.name
+
+ device_id = ParallelEnv().local_rank
+ logger.info(f'---------- device_id: {device_id} -----------')
+
+ pipe = VideoPipe(batch_size=self.batch_size,
+ num_threads=1,
+ device_id=device_id,
+ file_list=video_files,
+ sequence_length=self.num_seg * self.seglen,
+ num_seg=self.num_seg,
+ seg_length=self.seglen,
+ resize_shorter_scale=self.short_size,
+ crop_target_size=self.target_size,
+ is_training=True,
+ num_shards=self.num_shards,
+ shard_id=self.shard_id,
+ dali_mean=self.dali_mean,
+ dali_std=self.dali_std)
+
+ logger.info(
+                'initializing dataset, this may take several minutes if the dataset is large ... '
+ )
+ video_loader = DALIGenericIterator([pipe], ['image', 'label'],
+ len(lines),
+ dynamic_shape=True,
+ auto_reset=True)
+
+ return video_loader
+
+ dali_reader = reader_()
+ return dali_reader
+
+
+class VideoPipe(Pipeline):
+ def __init__(self,
+ batch_size,
+ num_threads,
+ device_id,
+ file_list,
+ sequence_length,
+ num_seg,
+ seg_length,
+ resize_shorter_scale,
+ crop_target_size,
+ is_training=False,
+ initial_prefetch_size=20,
+ num_shards=1,
+ shard_id=0,
+ dali_mean=0.,
+ dali_std=1.0):
+ super(VideoPipe, self).__init__(batch_size, num_threads, device_id)
+ self.input = ops.VideoReader(device="gpu",
+ file_list=file_list,
+ sequence_length=sequence_length,
+ num_seg=num_seg,
+ seg_length=seg_length,
+ is_training=is_training,
+ num_shards=num_shards,
+ shard_id=shard_id,
+ random_shuffle=is_training,
+ initial_fill=initial_prefetch_size)
+        # the sequence data read by ops.VideoReader is of shape [F, H, W, C]
+ # Because the ops.Resize does not support sequence data,
+ # it will be transposed into [H, W, F, C],
+ # then reshaped to [H, W, FC], and then resized like a 2-D image.
+ self.transpose = ops.Transpose(device="gpu", perm=[1, 2, 0, 3])
+ self.reshape = ops.Reshape(device="gpu",
+ rel_shape=[1.0, 1.0, -1],
+ layout='HWC')
+ self.resize = ops.Resize(device="gpu",
+ resize_shorter=resize_shorter_scale)
+        # crop and mirror are applied by ops.CropMirrorNormalize.
+        # Normalization is implemented in Paddle because of the difficulty of dimension broadcasting here;
+        # it is unclear whether DALI can broadcast the dimensions correctly, so the Paddle op is used instead.
+ self.pos_rng_x = ops.Uniform(range=(0.0, 1.0))
+ self.pos_rng_y = ops.Uniform(range=(0.0, 1.0))
+ self.mirror_generator = ops.Uniform(range=(0.0, 1.0))
+ self.cast_mirror = ops.Cast(dtype=types.DALIDataType.INT32)
+ self.crop_mirror_norm = ops.CropMirrorNormalize(
+ device="gpu",
+ crop=[crop_target_size, crop_target_size],
+ mean=dali_mean,
+ std=dali_std)
+ self.reshape_back = ops.Reshape(
+ device="gpu",
+ shape=[num_seg, seg_length * 3, crop_target_size, crop_target_size],
+ layout='FCHW')
+ self.cast_label = ops.Cast(device="gpu", dtype=types.DALIDataType.INT64)
+
+ def define_graph(self):
+ output, label = self.input(name="Reader")
+ output = self.transpose(output)
+ output = self.reshape(output)
+
+ output = self.resize(output)
+ output = output / 255.
+ pos_x = self.pos_rng_x()
+ pos_y = self.pos_rng_y()
+ mirror_flag = self.mirror_generator()
+ mirror_flag = (mirror_flag > 0.5)
+ mirror_flag = self.cast_mirror(mirror_flag)
+ output = self.crop_mirror_norm(output,
+ crop_pos_x=pos_x,
+ crop_pos_y=pos_y,
+ mirror=mirror_flag)
+ output = self.reshape_back(output)
+ label = self.cast_label(label)
+ return output, label
+
+ def __len__(self):
+ return self.epoch_size()
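
The shard-alignment arithmetic inside build_dali_reader is easier to follow in isolation; a standalone sketch with made-up numbers (10 samples, 4 shards):

import math

full_lines = [f"video_{i}.mp4 0\n" for i in range(10)]   # 10 toy index lines
num_shards, shard_id = 4, 1

per_node_lines = int(math.ceil(len(full_lines) * 1.0 / num_shards))  # 3
total_lines = per_node_lines * num_shards                            # 12
full_lines += full_lines[:total_lines - len(full_lines)]             # pad to 12
lines = full_lines[shard_id:total_lines:num_shards]                  # strided slice per shard
assert len(lines) == per_node_lines
print(lines)  # this shard reads samples 1, 5 and 9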
diff --git a/paddlevideo/loader/dataset/MRI.py b/paddlevideo/loader/dataset/MRI.py
new file mode 100644
index 0000000000000000000000000000000000000000..990cb87bd4f164b8b9eafc91250ffe90f0673649
--- /dev/null
+++ b/paddlevideo/loader/dataset/MRI.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import copy
+import random
+import numpy as np
+
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@DATASETS.register()
+class MRIDataset(BaseDataset):
+ """Rawframe dataset for action recognition.
+    The dataset loads raw frames from frame files, and applies the specified transform operations to them.
+    The index file is a text file with multiple lines, and each line indicates the directory of frames of a video, the total number of frames of the video, and its label, separated by a whitespace.
+ Example of an index file:
+
+ .. code-block:: txt
+
+ file_path-1 150 1
+ file_path-2 160 1
+ file_path-3 170 2
+ file_path-4 180 2
+
+ Args:
+ file_path (str): Path to the index file.
+        pipeline (Sequence): a sequence of data transforms.
+        data_prefix (str): directory path of the data. Default: None.
+        test_mode (bool): Whether to build the test dataset. Default: False.
+ suffix (str): suffix of file. Default: 'img_{:05}.jpg'.
+
+ """
+ def __init__(self,
+ file_path,
+ pipeline,
+ num_retries=5,
+ data_prefix=None,
+ test_mode=False,
+ suffix='img_{:05}.jpg'):
+ self.num_retries = num_retries
+ self.suffix = suffix
+ super().__init__(file_path, pipeline, data_prefix, test_mode)
+
+ def load_file(self):
+ """Load index file to get video information."""
+ info = []
+ with open(self.file_path, 'r') as fin:
+ for line in fin:
+ line_split = line.strip().split()
+ frame_dir, frames_len, labels = line_split
+ if self.data_prefix is not None:
+ frame_dir = osp.join(self.data_prefix, frame_dir)
+ info.append(
+ dict(
+ frame_dir=frame_dir,
+ #suffix=self.suffix,
+ frames_len=frames_len,
+ labels=int(labels)))
+ return info
+
+ def prepare_train(self, idx):
+        """Prepare the frames for training/valid given index. """
+ #Try to catch Exception caused by reading missing frames files
+ for ir in range(self.num_retries):
+ try:
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ except Exception as e:
+ #logger.info(e)
+ if ir < self.num_retries - 1:
+                    logger.info(
+                        "Error when loading {}, have tried {} times, will try again".
+                        format(results['frame_dir'], ir))
+ idx = random.randint(0, len(self.info) - 1)
+ continue
+ return np.array(results['imgs']), np.array([results['labels']])
+
+ def prepare_test(self, idx):
+ """Prepare the frames for test given index. """
+ #Try to catch Exception caused by reading missing frames files
+ for ir in range(self.num_retries):
+ try:
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ except Exception as e:
+ #logger.info(e)
+ if ir < self.num_retries - 1:
+                    logger.info(
+                        "Error when loading {}, have tried {} times, will try again".
+                        format(results['frame_dir'], ir))
+ idx = random.randint(0, len(self.info) - 1)
+ continue
+ return np.array(results['imgs']), np.array([results['labels']])
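
A toy example of the index-line parsing performed by load_file (values are made up):

line = "file_path-1 150 1"                  # "<frame_dir> <total_frames> <label>"
frame_dir, frames_len, labels = line.strip().split()
sample = dict(frame_dir=frame_dir, frames_len=frames_len, labels=int(labels))
print(sample)  # {'frame_dir': 'file_path-1', 'frames_len': '150', 'labels': 1}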
diff --git a/paddlevideo/loader/dataset/MRI_SlowFast.py b/paddlevideo/loader/dataset/MRI_SlowFast.py
new file mode 100644
index 0000000000000000000000000000000000000000..db905e4e4bd6bd527609cc2c52aaf7f6c6b96e3f
--- /dev/null
+++ b/paddlevideo/loader/dataset/MRI_SlowFast.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import copy
+import random
+import numpy as np
+
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@DATASETS.register()
+class SFMRIDataset(BaseDataset):
+ """Rawframe dataset for action recognition.
+    The dataset loads raw frames from frame files, and applies the specified transform operations to them.
+    The index file is a text file with multiple lines, and each line indicates the directory of frames of a video, the total number of frames of the video, and its label, separated by a whitespace.
+ Example of an index file:
+
+ .. code-block:: txt
+
+ file_path-1 150 1
+ file_path-2 160 1
+ file_path-3 170 2
+ file_path-4 180 2
+
+ Args:
+ file_path (str): Path to the index file.
+        pipeline (Sequence): a sequence of data transforms.
+        data_prefix (str): directory path of the data. Default: None.
+        test_mode (bool): Whether to build the test dataset. Default: False.
+ suffix (str): suffix of file. Default: 'img_{:05}.jpg'.
+
+ """
+ def __init__(self,
+ file_path,
+ pipeline,
+ num_retries=5,
+ data_prefix=None,
+ test_mode=False,
+ suffix='img_{:05}.jpg'):
+ self.num_retries = num_retries
+ self.suffix = suffix
+ super().__init__(file_path, pipeline, data_prefix, test_mode)
+
+ def load_file(self):
+ """Load index file to get video information."""
+ info = []
+ with open(self.file_path, 'r') as fin:
+ for line in fin:
+ line_split = line.strip().split()
+ frame_dir, frames_len, labels = line_split
+ if self.data_prefix is not None:
+ frame_dir = osp.join(self.data_prefix, frame_dir)
+ info.append(
+ dict(
+ frame_dir=frame_dir,
+ #suffix=self.suffix,
+ frames_len=frames_len,
+ labels=int(labels)))
+ return info
+
+ def prepare_train(self, idx):
+        """Prepare the frames for training/valid given index. """
+ #Try to catch Exception caused by reading missing frames files
+ for ir in range(self.num_retries):
+ try:
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ except Exception as e:
+ #logger.info(e)
+ if ir < self.num_retries - 1:
+                    logger.info(
+                        "Error when loading {}, have tried {} times, will try again".
+                        format(results['frame_dir'], ir))
+ idx = random.randint(0, len(self.info) - 1)
+ continue
+ return np.array(results['imgs'][0]), np.array(
+ results['imgs'][1]), np.array([results['labels']])
+
+ def prepare_test(self, idx):
+ """Prepare the frames for test given index. """
+ #Try to catch Exception caused by reading missing frames files
+ for ir in range(self.num_retries):
+ try:
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ except Exception as e:
+ #logger.info(e)
+ if ir < self.num_retries - 1:
+                    logger.info(
+                        "Error when loading {}, have tried {} times, will try again".
+                        format(results['frame_dir'], ir))
+ idx = random.randint(0, len(self.info) - 1)
+ continue
+ return np.array(results['imgs'][0]), np.array(
+ results['imgs'][1]), np.array([results['labels']])
diff --git a/paddlevideo/loader/dataset/__init__.py b/paddlevideo/loader/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..78a1f7aa6876ac3d33bb46edb7e0c9118a5858c6
--- /dev/null
+++ b/paddlevideo/loader/dataset/__init__.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .actbert_dataset import ActBertDataset
+from .ava_dataset import AVADataset
+from .bmn_dataset import BMNDataset
+from .davis_dataset import DavisDataset
+from .feature import FeatureDataset
+from .frame import FrameDataset, FrameDataset_Sport
+from .MRI import MRIDataset
+from .MRI_SlowFast import SFMRIDataset
+from .msrvtt import MSRVTTDataset
+from .asrf_dataset import ASRFDataset
+from .ms_tcn_dataset import MSTCNDataset
+from .oxford import MonoDataset
+from .skeleton import SkeletonDataset
+from .slowfast_video import SFVideoDataset
+from .video import VideoDataset
+
+__all__ = [
+ 'VideoDataset', 'FrameDataset', 'SFVideoDataset', 'BMNDataset',
+ 'FeatureDataset', 'SkeletonDataset', 'AVADataset', 'MonoDataset',
+ 'MSRVTTDataset', 'ActBertDataset', 'DavisDataset', 'MRIDataset',
+ 'SFMRIDataset', 'FrameDataset_Sport', 'MSTCNDataset', 'ASRFDataset'
+]
diff --git a/paddlevideo/loader/dataset/actbert_dataset.py b/paddlevideo/loader/dataset/actbert_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..51ceb80953b017fc5dbf6e282f13a0e264fb27f1
--- /dev/null
+++ b/paddlevideo/loader/dataset/actbert_dataset.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import copy
+import random
+import numpy as np
+try:
+ import lmdb
+except ImportError as e:
+    print(f"{e}, the [lmdb] package and its dependencies are required for ActBERT.")
+import pickle
+import json
+try:
+ from paddlenlp.transformers import BertTokenizer
+except ImportError as e:
+ print(
+        f"{e}, the [paddlenlp] package and its dependencies are required for ActBERT."
+ )
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@DATASETS.register()
+class ActBertDataset(BaseDataset):
+ """ActBert dataset.
+ """
+ def __init__(
+ self,
+ file_path,
+ pipeline,
+ bert_model="bert-base-uncased",
+ data_prefix=None,
+ test_mode=False,
+ ):
+ self.bert_model = bert_model
+ super().__init__(file_path, pipeline, data_prefix, test_mode)
+
+ def load_file(self):
+ """Load index file to get video information."""
+ feature_data = np.load(self.file_path, allow_pickle=True)
+ self.tokenizer = BertTokenizer.from_pretrained(self.bert_model,
+ do_lower_case=True)
+ self.info = []
+ for item in feature_data:
+ self.info.append(dict(feature=item, tokenizer=self.tokenizer))
+ return self.info
+
+ def prepare_train(self, idx):
+ """Prepare the frames for training/valid given index. """
+ results = copy.deepcopy(self.info[idx])
+ #print('==results==', results)
+ results = self.pipeline(results)
+ return results['features']
+
+ def prepare_test(self, idx):
+ """Prepare the frames for test given index. """
+ pass
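
A short sketch of the tokenizer set-up that load_file performs; it requires paddlenlp, and the sample sentence is made up:

from paddlenlp.transformers import BertTokenizer

# load_file builds one tokenizer and attaches it to every sample dict,
# so the text pipeline can reuse it without re-instantiating it per sample.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
print(tokenizer.tokenize("a man is cooking noodles"))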
diff --git a/paddlevideo/loader/dataset/asrf_dataset.py b/paddlevideo/loader/dataset/asrf_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..15bd35a3c6e3317778d1cef7942f6a8e7a56de6a
--- /dev/null
+++ b/paddlevideo/loader/dataset/asrf_dataset.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+
+import os
+import numpy as np
+
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@DATASETS.register()
+class ASRFDataset(BaseDataset):
+ """Video dataset for action segmentation.
+ """
+
+ def __init__(
+ self,
+ file_path,
+ pipeline,
+ feature_path,
+ label_path,
+ boundary_path,
+ **kwargs,
+ ):
+ super().__init__(file_path, pipeline, **kwargs)
+ self.label_path = label_path
+ self.boundary_path = boundary_path
+ self.feature_path = feature_path
+
+ def load_file(self):
+ """Load index file to get video information."""
+ file_ptr = open(self.file_path, 'r')
+ info = file_ptr.read().split('\n')[:-1]
+ file_ptr.close()
+ return info
+
+ def prepare_train(self, idx):
+ """TRAIN & VALID: Prepare data for training/valid given the index."""
+ results = {}
+ video_name = self.info[idx]
+ # load video feature
+ file_name = video_name.split('.')[0] + ".npy"
+ feat_file_path = os.path.join(self.feature_path, file_name)
+ #TODO: check path
+ video_feat = np.load(feat_file_path)
+
+ # load label
+ file_name = video_name.split('.')[0] + ".npy"
+ label_file_path = os.path.join(self.label_path, file_name)
+ label = np.load(label_file_path).astype(np.int64)
+
+ # load boundary
+ file_name = video_name.split('.')[0] + ".npy"
+ boundary_file_path = os.path.join(self.boundary_path, file_name)
+ boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32)
+
+ results['video_feat'] = copy.deepcopy(video_feat)
+ results['video_label'] = copy.deepcopy(label)
+ results['video_boundary'] = copy.deepcopy(boundary)
+
+ results = self.pipeline(results)
+ return results['video_feat'], results['video_label'], results['video_boundary']
+
+ def prepare_test(self, idx):
+ """TEST: Prepare the data for test given the index."""
+ results = {}
+ video_name = self.info[idx]
+ # load video feature
+ file_name = video_name.split('.')[0] + ".npy"
+ feat_file_path = os.path.join(self.feature_path, file_name)
+ #TODO: check path
+ video_feat = np.load(feat_file_path)
+
+ # load label
+ file_name = video_name.split('.')[0] + ".npy"
+ label_file_path = os.path.join(self.label_path, file_name)
+ label = np.load(label_file_path).astype(np.int64)
+
+ # load boundary
+ file_name = video_name.split('.')[0] + ".npy"
+ boundary_file_path = os.path.join(self.boundary_path, file_name)
+ boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32)
+
+ results['video_feat'] = copy.deepcopy(video_feat)
+ results['video_label'] = copy.deepcopy(label)
+ results['video_boundary'] = copy.deepcopy(boundary)
+
+ results = self.pipeline(results)
+ return results['video_feat'], results['video_label'], results['video_boundary']
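
The three per-video arrays that prepare_train/prepare_test assemble can be sketched with synthetic data (toy sizes; the feature dimension of 2048 is only an assumption):

import numpy as np

T = 100                                                      # number of frames (toy)
video_feat = np.random.rand(2048, T).astype(np.float32)      # feature .npy: (feat_dim, T)
label = np.random.randint(0, 5, size=(T,)).astype(np.int64)  # frame-level label .npy: (T,)
boundary = np.expand_dims(np.random.rand(T), axis=0).astype(np.float32)  # boundary .npy -> (1, T)
print(video_feat.shape, label.shape, boundary.shape)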
diff --git a/paddlevideo/loader/dataset/ava_dataset.py b/paddlevideo/loader/dataset/ava_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..744e15bb6808d7900d2fd9af9d8e3f6c40ed08c4
--- /dev/null
+++ b/paddlevideo/loader/dataset/ava_dataset.py
@@ -0,0 +1,249 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import copy
+import random
+import numpy as np
+import sys
+import os
+import pickle
+from datetime import datetime
+from ...metrics.ava_utils import ava_evaluate_results
+from ..registry import DATASETS
+from .base import BaseDataset
+from collections import defaultdict
+
+
+@DATASETS.register()
+class AVADataset(BaseDataset):
+ """AVA dataset for spatial temporal detection.
+    The dataset loads raw frames, bounding boxes and proposals, and applies
+ transformations to return the frame tensors and other information.
+ """
+
+ _FPS = 30
+
+ def __init__(self,
+ pipeline,
+ file_path=None,
+ exclude_file=None,
+ label_file=None,
+ suffix='{:05}.jpg',
+ proposal_file=None,
+ person_det_score_thr=0.9,
+ num_classes=81,
+ data_prefix=None,
+ test_mode=False,
+ num_max_proposals=1000,
+ timestamp_start=900,
+ timestamp_end=1800):
+ self.custom_classes = None
+ self.exclude_file = exclude_file
+ self.label_file = label_file
+ self.proposal_file = proposal_file
+ assert 0 <= person_det_score_thr <= 1, (
+ 'The value of '
+            'person_det_score_thr should be in [0, 1]. ')
+ self.person_det_score_thr = person_det_score_thr
+ self.num_classes = num_classes
+ self.suffix = suffix
+ self.num_max_proposals = num_max_proposals
+ self.timestamp_start = timestamp_start
+ self.timestamp_end = timestamp_end
+ super().__init__(
+ file_path,
+ pipeline,
+ data_prefix,
+ test_mode,
+ )
+ if self.proposal_file is not None:
+ self.proposals = self._load(self.proposal_file)
+ else:
+ self.proposals = None
+ if not test_mode:
+ valid_indexes = self.filter_exclude_file()
+            self.info = [self.info[i] for i in valid_indexes]
+
+ def _load(self, path):
+ f = open(path, 'rb')
+ res = pickle.load(f)
+ f.close()
+ return res
+
+ def parse_img_record(self, img_records):
+ bboxes, labels, entity_ids = [], [], []
+ while len(img_records) > 0:
+ img_record = img_records[0]
+ num_img_records = len(img_records)
+ selected_records = list(
+ filter(
+ lambda x: np.array_equal(x['entity_box'], img_record[
+ 'entity_box']), img_records))
+ num_selected_records = len(selected_records)
+ img_records = list(
+ filter(
+ lambda x: not np.array_equal(x['entity_box'], img_record[
+ 'entity_box']), img_records))
+ assert len(img_records) + num_selected_records == num_img_records
+
+ bboxes.append(img_record['entity_box'])
+ valid_labels = np.array([
+ selected_record['label'] for selected_record in selected_records
+ ])
+
+ label = np.zeros(self.num_classes, dtype=np.float32)
+ label[valid_labels] = 1.
+
+ labels.append(label)
+ entity_ids.append(img_record['entity_id'])
+
+ bboxes = np.stack(bboxes)
+ labels = np.stack(labels)
+ entity_ids = np.stack(entity_ids)
+ return bboxes, labels, entity_ids
+
+ def filter_exclude_file(self):
+ valid_indexes = []
+ if self.exclude_file is None:
+ valid_indexes = list(range(len(self.info)))
+ else:
+ exclude_video_infos = [
+ x.strip().split(',') for x in open(self.exclude_file)
+ ]
+ for i, video_info in enumerate(self.info):
+ valid_indexes.append(i)
+ for video_id, timestamp in exclude_video_infos:
+ if (video_info['video_id'] == video_id
+ and video_info['timestamp'] == int(timestamp)):
+ valid_indexes.pop()
+ break
+ return valid_indexes
+
+ def load_file(self):
+ """Load index file to get video information."""
+ info = []
+ records_dict_by_img = defaultdict(list)
+ with open(self.file_path, 'r') as fin:
+ for line in fin:
+ line_split = line.strip().split(',')
+
+ video_id = line_split[0]
+ timestamp = int(line_split[1])
+ img_key = f'{video_id},{timestamp:04d}'
+
+ entity_box = np.array(list(map(float, line_split[2:6])))
+ label = int(line_split[6])
+ entity_id = int(line_split[7])
+ shot_info = (0, (self.timestamp_end - self.timestamp_start) *
+ self._FPS)
+
+ video_info = dict(video_id=video_id,
+ timestamp=timestamp,
+ entity_box=entity_box,
+ label=label,
+ entity_id=entity_id,
+ shot_info=shot_info)
+ records_dict_by_img[img_key].append(video_info)
+
+ for img_key in records_dict_by_img:
+ video_id, timestamp = img_key.split(',')
+ bboxes, labels, entity_ids = self.parse_img_record(
+ records_dict_by_img[img_key])
+ ann = dict(gt_bboxes=bboxes,
+ gt_labels=labels,
+ entity_ids=entity_ids)
+ frame_dir = video_id
+ if self.data_prefix is not None:
+ frame_dir = osp.join(self.data_prefix, frame_dir)
+ video_info = dict(frame_dir=frame_dir,
+ video_id=video_id,
+ timestamp=int(timestamp),
+ img_key=img_key,
+ shot_info=shot_info,
+ fps=self._FPS,
+ ann=ann)
+ info.append(video_info)
+
+ return info
+
+ def prepare_train(self, idx):
+ results = copy.deepcopy(self.info[idx])
+ img_key = results['img_key']
+
+ results['suffix'] = self.suffix
+ results['timestamp_start'] = self.timestamp_start
+ results['timestamp_end'] = self.timestamp_end
+
+ if self.proposals is not None:
+ if img_key not in self.proposals:
+ results['proposals'] = np.array([[0, 0, 1, 1]])
+ results['scores'] = np.array([1])
+ else:
+ proposals = self.proposals[img_key]
+ assert proposals.shape[-1] in [4, 5]
+ if proposals.shape[-1] == 5:
+ thr = min(self.person_det_score_thr, max(proposals[:, 4]))
+ positive_inds = (proposals[:, 4] >= thr)
+ proposals = proposals[positive_inds]
+ proposals = proposals[:self.num_max_proposals]
+ results['proposals'] = proposals[:, :4]
+ results['scores'] = proposals[:, 4]
+ else:
+ proposals = proposals[:self.num_max_proposals]
+ results['proposals'] = proposals
+
+ ann = results.pop('ann')
+ results['gt_bboxes'] = ann['gt_bboxes']
+ results['gt_labels'] = ann['gt_labels']
+ results['entity_ids'] = ann['entity_ids']
+
+ #ret = self.pipeline(results, "")
+ ret = self.pipeline(results)
+ #padding for dataloader
+ len_proposals = ret['proposals'].shape[0]
+ len_gt_bboxes = ret['gt_bboxes'].shape[0]
+ len_gt_labels = ret['gt_labels'].shape[0]
+ len_scores = ret['scores'].shape[0]
+ len_entity_ids = ret['entity_ids'].shape[0]
+ padding_len = 128
+ ret['proposals'] = self.my_padding_2d(ret['proposals'], padding_len)
+ ret['gt_bboxes'] = self.my_padding_2d(ret['gt_bboxes'], padding_len)
+ ret['gt_labels'] = self.my_padding_2d(ret['gt_labels'], padding_len)
+ ret['scores'] = self.my_padding_1d(ret['scores'], padding_len)
+ ret['entity_ids'] = self.my_padding_1d(ret['entity_ids'], padding_len)
+ return ret['imgs'][0], ret['imgs'][1], ret['proposals'], ret[
+ 'gt_bboxes'], ret['gt_labels'], ret['scores'], ret[
+ 'entity_ids'], np.array(
+ ret['img_shape'], dtype=int
+ ), idx, len_proposals, len_gt_bboxes, len_gt_labels, len_scores, len_entity_ids
+
+ def my_padding_2d(self, feat, max_len):
+ feat_add = np.zeros((max_len - feat.shape[0], feat.shape[1]),
+ dtype=np.float32)
+ feat_pad = np.concatenate((feat, feat_add), axis=0)
+ return feat_pad
+
+ def my_padding_1d(self, feat, max_len):
+ feat_add = np.zeros((max_len - feat.shape[0]), dtype=np.float32)
+ feat_pad = np.concatenate((feat, feat_add), axis=0)
+ return feat_pad
+
+ def prepare_test(self, idx):
+ return self.prepare_train(idx)
+
+ def evaluate(self, results):
+ return ava_evaluate_results(self.info, len(self), results,
+ self.custom_classes, self.label_file,
+ self.file_path, self.exclude_file)
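
The fixed-length padding used in prepare_train can be illustrated on its own; a self-contained sketch of the my_padding_2d idea with a toy proposal array:

import numpy as np

def pad_2d(feat, max_len):
    # zero-pad an (n, d) array to (max_len, d) so batches have a uniform shape
    pad = np.zeros((max_len - feat.shape[0], feat.shape[1]), dtype=np.float32)
    return np.concatenate((feat, pad), axis=0)

proposals = np.random.rand(7, 4).astype(np.float32)   # 7 detected person boxes
padded = pad_2d(proposals, 128)
print(padded.shape)  # (128, 4); the true length 7 is returned alongside the padded array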
diff --git a/paddlevideo/loader/dataset/base.py b/paddlevideo/loader/dataset/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..2549dc4111a3ba78a85f0088d07458d3907e7abd
--- /dev/null
+++ b/paddlevideo/loader/dataset/base.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import copy
+import numpy as np
+from abc import ABC, abstractmethod
+
+import paddle
+from paddle.io import Dataset
+
+
+class BaseDataset(Dataset, ABC):
+ """Base class for datasets
+
+ All datasets should subclass it.
+    All subclasses should overwrite:
+
+ - Method: `load_file`, load info from index file.
+ - Method: `prepare_train`, providing train data.
+ - Method: `prepare_test`, providing test data.
+
+ Args:
+ file_path (str): index file path.
+        pipeline (Sequence): a sequence of data transforms.
+ data_prefix (str): directory path of the data. Default: None.
+ test_mode (bool): whether to build test dataset. Default: False.
+
+ """
+ def __init__(self, file_path, pipeline, data_prefix=None, test_mode=False):
+ super().__init__()
+ self.file_path = file_path
+ self.data_prefix = osp.realpath(data_prefix) if \
+ data_prefix is not None and osp.isdir(data_prefix) else data_prefix
+ self.test_mode = test_mode
+ self.pipeline = pipeline
+ self.info = self.load_file()
+
+ @abstractmethod
+ def load_file(self):
+ """load the video information from the index file path."""
+ pass
+
+ def prepare_train(self, idx):
+ """TRAIN & VALID. Prepare the data for training/valid given the index."""
+ #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ #unsqueeze label to list
+ return results['imgs'], np.array([results['labels']])
+
+ def prepare_test(self, idx):
+ """TEST: Prepare the data for test given the index."""
+ #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ #unsqueeze label to list
+ return results['imgs'], np.array([results['labels']])
+
+ def __len__(self):
+ """get the size of the dataset."""
+ return len(self.info)
+
+ def __getitem__(self, idx):
+ """ Get the sample for either training or testing given index"""
+ if self.test_mode:
+ return self.prepare_test(idx)
+ else:
+ return self.prepare_train(idx)
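
A hypothetical subclass illustrating the contract BaseDataset defines: implement load_file, prepare_train and prepare_test, and register the class so it can be built from a config (ToyDataset and its hard-coded samples are invented for this sketch):

import copy
import numpy as np
from paddlevideo.loader.dataset.base import BaseDataset
from paddlevideo.loader.registry import DATASETS


@DATASETS.register()
class ToyDataset(BaseDataset):
    def load_file(self):
        # normally parsed from self.file_path; hard-coded here for illustration
        return [dict(filename=f"video_{i}.mp4", labels=i % 2) for i in range(4)]

    def prepare_train(self, idx):
        # assumes the configured pipeline fills results['imgs'] from 'filename'
        results = self.pipeline(copy.deepcopy(self.info[idx]))
        return results['imgs'], np.array([results['labels']])

    def prepare_test(self, idx):
        return self.prepare_train(idx)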
diff --git a/paddlevideo/loader/dataset/bmn_dataset.py b/paddlevideo/loader/dataset/bmn_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..44c7651914233b7dd659b10d55e24aaafd71f555
--- /dev/null
+++ b/paddlevideo/loader/dataset/bmn_dataset.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import json
+
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+logger = get_logger("paddlevideo")
+
+
+@DATASETS.register()
+class BMNDataset(BaseDataset):
+ """Video dataset for action localization.
+ """
+ def __init__(
+ self,
+ file_path,
+ pipeline,
+ subset,
+ **kwargs,
+ ):
+ self.subset = subset
+ super().__init__(file_path, pipeline, **kwargs)
+
+ def load_file(self):
+ """Load index file to get video information."""
+ info = []
+ annos = json.load(open(self.file_path))
+ for video_name in annos.keys():
+ video_subset = annos[video_name]["subset"]
+ if self.subset in video_subset:
+ info.append(
+ dict(
+ video_name=video_name,
+ video_info=annos[video_name],
+ ))
+ #sort by video_name
+ sort_f = lambda elem: elem['video_name']
+ info.sort(key=sort_f)
+ #add video_idx to info
+ for idx, elem in enumerate(info):
+ info[idx]['video_idx'] = idx
+ logger.info("{} subset video numbers: {}".format(
+ self.subset, len(info)))
+ return info
+
+ def prepare_train(self, idx):
+ """TRAIN & VALID: Prepare data for training/valid given the index."""
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ return results['video_feat'], results['gt_iou_map'], results['gt_start'],\
+ results['gt_end']
+
+ def prepare_test(self, idx):
+ """TEST: Prepare the data for test given the index."""
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ return results['video_feat'], results['gt_iou_map'], results['gt_start'], \
+ results['gt_end'], results['video_idx']
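
The subset filtering and video_idx assignment in load_file, shown on a made-up annotation dict:

annos = {
    "v_b": {"subset": "validation"},
    "v_a": {"subset": "training"},
    "v_c": {"subset": "training"},
}
subset = "training"
info = [dict(video_name=name, video_info=anno)
        for name, anno in annos.items() if subset in anno["subset"]]
info.sort(key=lambda elem: elem["video_name"])            # sort by video_name
for idx, elem in enumerate(info):
    elem["video_idx"] = idx                               # stable index per video
print([(e["video_name"], e["video_idx"]) for e in info])  # [('v_a', 0), ('v_c', 1)]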
diff --git a/paddlevideo/loader/dataset/davis_dataset.py b/paddlevideo/loader/dataset/davis_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..20a27597149898d39e0a8426ab0d66aaf4fe4137
--- /dev/null
+++ b/paddlevideo/loader/dataset/davis_dataset.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import os.path as osp
+import copy
+import random
+import numpy as np
+import shutil
+from PIL import Image
+import cv2
+from paddle.io import Dataset
+
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+class VOS_Test(Dataset):
+ """process frames in each video
+ """
+ def __init__(self,
+ image_root,
+ label_root,
+ seq_name,
+ images,
+ labels,
+ pipeline=None,
+ rgb=False,
+ resolution=None):
+ self.image_root = image_root
+ self.label_root = label_root
+ self.seq_name = seq_name
+ self.images = images # image file list
+ self.labels = labels
+ self.obj_num = 1
+ self.num_frame = len(self.images)
+ self.pipeline = pipeline
+ self.rgb = rgb
+ self.resolution = resolution
+
+ self.obj_nums = []
+ temp_obj_num = 0
+ for img_name in self.images:
+ self.obj_nums.append(temp_obj_num)
+ current_label_name = img_name.split('.')[0] + '.png'
+ if current_label_name in self.labels:
+ current_label = self.read_label(current_label_name)
+ if temp_obj_num < np.unique(
+ current_label)[-1]: #get object number from label_id
+ temp_obj_num = np.unique(current_label)[-1]
+
+ def __len__(self):
+ return len(self.images)
+
+ def read_image(self, idx):
+ img_name = self.images[idx]
+ img_path = os.path.join(self.image_root, self.seq_name, img_name)
+ img = cv2.imread(img_path)
+ img = np.array(img, dtype=np.float32)
+ if self.rgb:
+ img = img[:, :, [2, 1, 0]]
+ return img
+
+ def read_label(self, label_name):
+ label_path = os.path.join(self.label_root, self.seq_name, label_name)
+ label = Image.open(label_path)
+ label = np.array(label, dtype=np.uint8)
+ return label
+
+ def __getitem__(self, idx):
+ img_name = self.images[idx]
+ current_img = self.read_image(idx)
+ current_img = np.array(current_img)
+ height, width, channels = current_img.shape
+ if self.resolution is not None:
+ width = int(np.ceil(float(width) * self.resolution / float(height)))
+ height = int(self.resolution)
+
+ current_label_name = img_name.split('.')[0] + '.png'
+ obj_num = self.obj_nums[idx]
+
+ if current_label_name in self.labels:
+ current_label = self.read_label(current_label_name)
+ current_label = np.array(current_label)
+ sample = {
+ 'current_img': current_img,
+ 'current_label': current_label
+ }
+ else:
+ sample = {
+ 'current_img': current_img
+ } #only the first frame contains label
+
+ sample['meta'] = {
+ 'seq_name': self.seq_name,
+ 'frame_num': self.num_frame,
+ 'obj_num': obj_num,
+ 'current_name': img_name,
+ 'height': height,
+ 'width': width,
+ 'flip': False
+ }
+ if self.pipeline is not None:
+ sample = self.pipeline(sample)
+ for s in sample:
+ s['current_img'] = np.array(s['current_img'])
+ if 'current_label' in s.keys():
+ s['current_label'] = s['current_label']
+ return sample
+
+
+@DATASETS.register()
+class DavisDataset(BaseDataset):
+ """Davis 2017 dataset.
+ """
+ def __init__(
+ self,
+ file_path,
+ result_root,
+ pipeline,
+ data_prefix=None,
+ test_mode=False,
+ year=2017,
+ rgb=False,
+ resolution='480p',
+ ):
+ self.rgb = rgb
+ self.result_root = result_root
+ self.resolution = resolution
+ self.year = year
+ self.spt = 'val' if test_mode else 'train'
+ super().__init__(file_path, pipeline, data_prefix, test_mode)
+
+ def load_file(self):
+ self.image_root = os.path.join(self.file_path, 'JPEGImages',
+ self.resolution)
+ self.label_root = os.path.join(self.file_path, 'Annotations',
+ self.resolution)
+ seq_names = []
+ with open(
+ os.path.join(self.file_path, 'ImageSets', str(self.year),
+ self.spt + '.txt')) as f:
+ seqs_tmp = f.readlines()
+ seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))
+ seq_names.extend(seqs_tmp)
+ self.info = list(np.unique(seq_names))
+ return self.info
+
+ def prepare_test(self, idx):
+ seq_name = self.info[idx] #video name
+ images = list(
+ np.sort(os.listdir(os.path.join(self.image_root, seq_name))))
+ labels = [images[0].replace('jpg', 'png')] #we have first frame target
+
+ # copy first frame target
+ if not os.path.isfile(
+ os.path.join(self.result_root, seq_name, labels[0])):
+ if not os.path.exists(os.path.join(self.result_root, seq_name)):
+ os.makedirs(os.path.join(self.result_root, seq_name))
+ source_label_path = os.path.join(self.label_root, seq_name,
+ labels[0])
+ result_label_path = os.path.join(self.result_root, seq_name,
+ labels[0])
+
+ shutil.copy(source_label_path, result_label_path)
+
+ seq_dataset = VOS_Test(self.image_root,
+ self.label_root,
+ seq_name,
+ images,
+ labels,
+ self.pipeline,
+ rgb=self.rgb,
+ resolution=480)
+ return seq_dataset
diff --git a/paddlevideo/loader/dataset/feature.py b/paddlevideo/loader/dataset/feature.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bf4cd604a801f2b9dc7584ad614fdb1b85356fb
--- /dev/null
+++ b/paddlevideo/loader/dataset/feature.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import os.path as osp
+
+from ..registry import DATASETS
+from .base import BaseDataset
+
+
+@DATASETS.register()
+class FeatureDataset(BaseDataset):
+ """Feature dataset for action recognition
+ Example:(TODO)
+ Args:(TODO)
+ """
+ def __init__(
+ self,
+ file_path,
+ pipeline,
+ data_prefix=None,
+ test_mode=False,
+ suffix=None,
+ ):
+ self.suffix = suffix
+ super().__init__(file_path, pipeline, data_prefix, test_mode)
+
+ def load_file(self):
+ """Load index file to get video information."""
+ info = []
+ with open(self.file_path, 'r') as fin:
+ for line in fin:
+ filename = line.strip()
+ if self.data_prefix is not None:
+ filename = osp.join(self.data_prefix, filename)
+ if self.suffix is not None:
+ filename = filename + self.suffix
+
+ info.append(dict(filename=filename))
+ return info
+
+ def prepare_train(self, idx):
+ """TRAIN & VALID. Prepare the data for training/valid given the index."""
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+
+ return results['rgb_data'], results['rgb_len'], results[
+ 'rgb_mask'], results['audio_data'], results['audio_len'], results[
+ 'audio_mask'], results['labels']
+
+ def prepare_test(self, idx):
+ """TEST. Prepare the data for testing given the index."""
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+
+ return results['rgb_data'], results['rgb_len'], results[
+ 'rgb_mask'], results['audio_data'], results['audio_len'], results[
+ 'audio_mask'], results['labels']
diff --git a/paddlevideo/loader/dataset/frame.py b/paddlevideo/loader/dataset/frame.py
new file mode 100644
index 0000000000000000000000000000000000000000..b02f526595527a101a34e5e1a7fdd2de092111a3
--- /dev/null
+++ b/paddlevideo/loader/dataset/frame.py
@@ -0,0 +1,177 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import copy
+import random
+import numpy as np
+
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@DATASETS.register()
+class FrameDataset(BaseDataset):
+ """Rawframe dataset for action recognition.
+    The dataset loads raw frames from frame files, and applies the specified transform operations to them.
+    The index file is a text file with multiple lines, and each line indicates the directory of frames of a video, the total number of frames of the video, and its label, separated by a whitespace.
+ Example of an index file:
+
+ .. code-block:: txt
+
+ file_path-1 150 1
+ file_path-2 160 1
+ file_path-3 170 2
+ file_path-4 180 2
+
+ Args:
+ file_path (str): Path to the index file.
+        pipeline (Sequence): a sequence of data transforms.
+        data_prefix (str): directory path of the data. Default: None.
+        test_mode (bool): Whether to build the test dataset. Default: False.
+ suffix (str): suffix of file. Default: 'img_{:05}.jpg'.
+
+ """
+ def __init__(self,
+ file_path,
+ pipeline,
+ num_retries=5,
+ data_prefix=None,
+ test_mode=False,
+ suffix='img_{:05}.jpg'):
+ self.num_retries = num_retries
+ self.suffix = suffix
+ super().__init__(file_path, pipeline, data_prefix, test_mode)
+
+ def load_file(self):
+ """Load index file to get video information."""
+ info = []
+ with open(self.file_path, 'r') as fin:
+ for line in fin:
+ line_split = line.strip().split()
+ frame_dir, frames_len, labels = line_split
+ if self.data_prefix is not None:
+ frame_dir = osp.join(self.data_prefix, frame_dir)
+ info.append(
+ dict(frame_dir=frame_dir,
+ suffix=self.suffix,
+ frames_len=frames_len,
+ labels=int(labels)))
+ return info
+
+ def prepare_train(self, idx):
+ """Prepare the frames for training/valid given index. """
+ #Try to catch Exception caused by reading missing frames files
+ for ir in range(self.num_retries):
+ try:
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ except Exception as e:
+ #logger.info(e)
+ if ir < self.num_retries - 1:
+                    logger.info(
+                        "Error when loading {}, have tried {} times, will try again".
+                        format(results['frame_dir'], ir))
+ idx = random.randint(0, len(self.info) - 1)
+ continue
+ return results['imgs'], np.array([results['labels']])
+
+ def prepare_test(self, idx):
+ """Prepare the frames for test given index. """
+ #Try to catch Exception caused by reading missing frames files
+ for ir in range(self.num_retries):
+ try:
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ except Exception as e:
+ #logger.info(e)
+ if ir < self.num_retries - 1:
+                    logger.info(
+                        "Error when loading {}, have tried {} times, will try again".
+                        format(results['frame_dir'], ir))
+ idx = random.randint(0, len(self.info) - 1)
+ continue
+ return results['imgs'], np.array([results['labels']])
+
+
+@DATASETS.register()
+class FrameDataset_Sport(BaseDataset):
+ """Video dataset for action recognition
+    The dataset loads raw videos and applies the specified transforms to them.
+    The index file is a file with multiple lines, and each line indicates
+    a sample video with its filepath and label, which are split by a whitespace.
+    Example of an index file:
+ .. code-block:: txt
+ path/000.mp4 1
+ path/001.mp4 1
+ path/002.mp4 2
+ path/003.mp4 2
+ Args:
+ file_path(str): Path to the index file.
+ pipeline(XXX): A sequence of data transforms.
+ **kwargs: Keyword arguments for ```BaseDataset```.
+ """
+ def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs):
+ self.num_retries = num_retries
+ self.suffix = suffix
+ super().__init__(file_path, pipeline, **kwargs)
+
+ def load_file(self):
+ """Load index file to get video information."""
+ info = []
+ with open(self.file_path, 'r') as fin:
+ for line in fin:
+ line_split = line.strip().split()
+ frame_dir = line_split[0]
+ if self.data_prefix is not None:
+ frame_dir = osp.join(self.data_prefix, frame_dir)
+ info.append(dict(frame_dir=frame_dir, suffix=self.suffix))
+ return info
+
+ def prepare_train(self, idx):
+ """TRAIN & VALID. Prepare the data for training/valid given the index."""
+ #Try to catch Exception caused by reading corrupted video file
+ for ir in range(self.num_retries):
+ try:
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ except Exception as e:
+ #logger.info(e)
+ if ir < self.num_retries - 1:
+                    logger.info(
+                        "Error when loading {}, have tried {} times, will try again".
+                        format(results['filename'], ir))
+ idx = random.randint(0, len(self.info) - 1)
+ continue
+ return results['imgs'], np.array([results['labels']])
+
+ def prepare_test(self, idx):
+ """TEST. Prepare the data for test given the index."""
+ #Try to catch Exception caused by reading corrupted video file
+ for ir in range(self.num_retries):
+ try:
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ except Exception as e:
+ #logger.info(e)
+ if ir < self.num_retries - 1:
+                    logger.info(
+                        "Error when loading {}, have tried {} times, will try again".
+                        format(results['filename'], ir))
+ idx = random.randint(0, len(self.info) - 1)
+ continue
+ return results['imgs'], np.array([results['labels']])
diff --git a/paddlevideo/loader/dataset/ms_tcn_dataset.py b/paddlevideo/loader/dataset/ms_tcn_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..56e3b7bbba14fa94f4d029794c68ee3746c022bb
--- /dev/null
+++ b/paddlevideo/loader/dataset/ms_tcn_dataset.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+
+import os
+import numpy as np
+
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@DATASETS.register()
+class MSTCNDataset(BaseDataset):
+ """Video dataset for action segmentation.
+ """
+
+ def __init__(
+ self,
+ file_path,
+ pipeline,
+ feature_path,
+ gt_path,
+ actions_map_file_path,
+ **kwargs,
+ ):
+ super().__init__(file_path, pipeline, **kwargs)
+ self.gt_path = gt_path
+ self.actions_map_file_path = actions_map_file_path
+ self.feature_path = feature_path
+
+ # actions dict generate
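+        # each line of the mapping file has the format "<class_id> <action_name>"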
+ file_ptr = open(self.actions_map_file_path, 'r')
+ actions = file_ptr.read().split('\n')[:-1]
+ file_ptr.close()
+ self.actions_dict = dict()
+ for a in actions:
+ self.actions_dict[a.split()[1]] = int(a.split()[0])
+
+ self.num_classes = len(self.actions_dict.keys())
+
+ def load_file(self):
+ """Load index file to get video information."""
+ file_ptr = open(self.file_path, 'r')
+ info = file_ptr.read().split('\n')[:-1]
+ file_ptr.close()
+ return info
+
+ def prepare_train(self, idx):
+ """TRAIN & VALID: Prepare data for training/valid given the index."""
+ results = {}
+ video_name = self.info[idx]
+ # load video feature
+ file_name = video_name.split('.')[0] + ".npy"
+ feat_file_path = os.path.join(self.feature_path, file_name)
+ #TODO: check path
+ video_feat = np.load(feat_file_path)
+
+ # load label
+ target_file_path = os.path.join(self.gt_path, video_name)
+ file_ptr = open(target_file_path, 'r')
+ content = file_ptr.read().split('\n')[:-1]
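+        # truncate labels to the shorter of feature length and label length so they stay aligned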
+ classes = np.zeros(min(np.shape(video_feat)[1], len(content)), dtype='int64')
+ for i in range(len(classes)):
+ classes[i] = self.actions_dict[content[i]]
+ # classes = classes * (-100)
+
+ results['video_feat'] = copy.deepcopy(video_feat)
+ results['video_gt'] = copy.deepcopy(classes)
+
+ results = self.pipeline(results)
+ return results['video_feat'], results['video_gt']
+
+ def prepare_test(self, idx):
+ """TEST: Prepare the data for test given the index."""
+ results = {}
+ video_name = self.info[idx]
+ # load video feature
+ file_name = video_name.split('.')[0] + ".npy"
+ feat_file_path = os.path.join(self.feature_path, file_name)
+ #TODO: check path
+ video_feat = np.load(feat_file_path)
+
+ # load label
+ target_file_path = os.path.join(self.gt_path, video_name)
+ file_ptr = open(target_file_path, 'r')
+ content = file_ptr.read().split('\n')[:-1]
+        classes = np.zeros(min(np.shape(video_feat)[1], len(content)), dtype='int64')
+ for i in range(len(classes)):
+ classes[i] = self.actions_dict[content[i]]
+ # classes = classes * (-100)
+
+ results['video_feat'] = copy.deepcopy(video_feat)
+ results['video_gt'] = copy.deepcopy(classes)
+
+ results = self.pipeline(results)
+ return results['video_feat'], results['video_gt']
diff --git a/paddlevideo/loader/dataset/msrvtt.py b/paddlevideo/loader/dataset/msrvtt.py
new file mode 100644
index 0000000000000000000000000000000000000000..accd6c6d154904e266526727d25c8a39d8cac208
--- /dev/null
+++ b/paddlevideo/loader/dataset/msrvtt.py
@@ -0,0 +1,218 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import copy
+import random
+import numpy as np
+try:
+ import lmdb
+except ImportError as e:
+ print(f"{e}, [lmdb] package and it's dependencies is required for ActBERT.")
+import pickle
+try:
+ from paddlenlp.transformers import BertTokenizer
+except ImportError as e:
+ print(
+ f"{e}, [paddlenlp] package and it's dependencies is required for ActBERT."
+ )
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@DATASETS.register()
+class MSRVTTDataset(BaseDataset):
+ """MSR-VTT dataset for text-video clip retrieval.
+ """
+ def __init__(
+ self,
+ file_path,
+ pipeline,
+ features_path,
+ bert_model="bert-base-uncased",
+ padding_index=0,
+ max_seq_length=36,
+ max_region_num=36,
+ max_action_num=5,
+ vision_feature_dim=2048,
+ action_feature_dim=2048,
+ spatials_dim=5,
+ data_prefix=None,
+ test_mode=False,
+ ):
+ self.features_path = features_path
+ self.bert_model = bert_model
+ self.padding_index = padding_index
+ self.max_seq_length = max_seq_length
+ self.max_region_num = max_region_num
+ self._max_action_num = max_action_num
+ self.vision_feature_dim = vision_feature_dim
+ self.action_feature_dim = action_feature_dim
+ self.spatials_dim = spatials_dim
+ self._tokenizer = BertTokenizer.from_pretrained(bert_model,
+ do_lower_case=True)
+ super().__init__(file_path, pipeline, data_prefix, test_mode)
+ self.tokenize()
+ self.gen_feature()
+
+ def load_file(self):
+ """Load index file to get video information."""
+ with open(self.file_path) as fin:
+ self.image_entries = []
+ self.caption_entries = []
+ for line in fin.readlines():
+ line = line.strip()
+ vid_id = line.split(',')[0]
+ self.image_entries.append(vid_id)
+ self.caption_entries.append({
+ "caption": line.split(',')[1],
+ "vid_id": vid_id
+ })
+ self.env = lmdb.open(self.features_path)
+
+ def tokenize(self):
+ for entry in self.caption_entries:
+ tokens = []
+ tokens.append("[CLS]")
+ for token in self._tokenizer.tokenize(entry["caption"]):
+ tokens.append(token)
+ tokens.append("[SEP]")
+ tokens = self._tokenizer.convert_tokens_to_ids(tokens)
+
+ segment_ids = [0] * len(tokens)
+ input_mask = [1] * len(tokens)
+
+ if len(tokens) < self.max_seq_length:
+ padding = [self.padding_index
+ ] * (self.max_seq_length - len(tokens))
+ tokens = tokens + padding
+ input_mask += padding
+ segment_ids += padding
+
+ entry["token"] = np.array(tokens).astype('int64')
+ entry["input_mask"] = np.array(input_mask)
+ entry["segment_ids"] = np.array(segment_ids).astype('int64')
+
+ def get_image_feature(self, video_id):
+ video_id = str(video_id).encode()
+ with self.env.begin(write=False) as txn:
+ item = pickle.loads(txn.get(video_id))
+ video_id = item["video_id"]
+ image_h = int(item["image_h"])
+ image_w = int(item["image_w"])
+
+ features = item["features"].reshape(-1, self.vision_feature_dim)
+ boxes = item["boxes"].reshape(-1, 4)
+
+ num_boxes = features.shape[0]
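+            # prepend a global feature (mean pooled over all boxes) as an extra region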
+ g_feat = np.sum(features, axis=0) / num_boxes
+ num_boxes = num_boxes + 1
+ features = np.concatenate(
+ [np.expand_dims(g_feat, axis=0), features], axis=0)
+
+ action_features = item["action_features"].reshape(
+ -1, self.action_feature_dim)
+
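+            # image_location stores [x1, y1, x2, y2, relative_area] per box, normalized below by image size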
+ image_location = np.zeros((boxes.shape[0], self.spatials_dim),
+ dtype=np.float32)
+ image_location[:, :4] = boxes
+ image_location[:,
+ 4] = ((image_location[:, 3] - image_location[:, 1]) *
+ (image_location[:, 2] - image_location[:, 0]) /
+ (float(image_w) * float(image_h)))
+
+ image_location[:, 0] = image_location[:, 0] / float(image_w)
+ image_location[:, 1] = image_location[:, 1] / float(image_h)
+ image_location[:, 2] = image_location[:, 2] / float(image_w)
+ image_location[:, 3] = image_location[:, 3] / float(image_h)
+
+ g_location = np.array([0, 0, 1, 1, 1])
+ image_location = np.concatenate(
+ [np.expand_dims(g_location, axis=0), image_location], axis=0)
+ return features, num_boxes, image_location, action_features
+
+ def gen_feature(self):
+ num_inst = len(self.image_entries) #1000
+ self.features_all = np.zeros(
+ (num_inst, self.max_region_num, self.vision_feature_dim))
+ self.action_features_all = np.zeros(
+ (num_inst, self._max_action_num, self.action_feature_dim))
+ self.spatials_all = np.zeros(
+ (num_inst, self.max_region_num, self.spatials_dim))
+ self.image_mask_all = np.zeros((num_inst, self.max_region_num))
+ self.action_mask_all = np.zeros((num_inst, self._max_action_num))
+
+ for i, image_id in enumerate(self.image_entries):
+ features, num_boxes, boxes, action_features = self.get_image_feature(
+ image_id)
+
+ mix_num_boxes = min(int(num_boxes), self.max_region_num)
+ mix_boxes_pad = np.zeros((self.max_region_num, self.spatials_dim))
+ mix_features_pad = np.zeros(
+ (self.max_region_num, self.vision_feature_dim))
+
+ image_mask = [1] * (int(mix_num_boxes))
+ while len(image_mask) < self.max_region_num:
+ image_mask.append(0)
+ action_mask = [1] * (self._max_action_num)
+ while len(action_mask) < self._max_action_num:
+ action_mask.append(0)
+
+ mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]
+ mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]
+
+ self.features_all[i] = mix_features_pad
+ x = action_features.shape[0]
+ self.action_features_all[i][:x] = action_features[:]
+ self.image_mask_all[i] = np.array(image_mask)
+ self.action_mask_all[i] = np.array(action_mask)
+ self.spatials_all[i] = mix_boxes_pad
+
+ self.features_all = self.features_all.astype("float32")
+ self.action_features_all = self.action_features_all.astype("float32")
+ self.image_mask_all = self.image_mask_all.astype("int64")
+ self.action_mask_all = self.action_mask_all.astype("int64")
+ self.spatials_all = self.spatials_all.astype("float32")
+
+ def prepare_train(self, idx):
+ pass
+
+ def prepare_test(self, idx):
+ entry = self.caption_entries[idx]
+ caption = entry["token"]
+ input_mask = entry["input_mask"]
+ segment_ids = entry["segment_ids"]
+
+ target_all = np.zeros(1000)
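+        # one-hot style target over the candidate videos: 1 marks the ground-truth video for this caption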
+ for i, image_id in enumerate(self.image_entries):
+ if image_id == entry["vid_id"]:
+ target_all[i] = 1
+
+ return (
+ caption,
+ self.action_features_all,
+ self.features_all,
+ self.spatials_all,
+ segment_ids,
+ input_mask,
+ self.image_mask_all,
+ self.action_mask_all,
+ target_all,
+ )
+
+ def __len__(self):
+ return len(self.caption_entries)
diff --git a/paddlevideo/loader/dataset/oxford.py b/paddlevideo/loader/dataset/oxford.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9e65c6981d59b69408ebea268cb235e4dfd37b0
--- /dev/null
+++ b/paddlevideo/loader/dataset/oxford.py
@@ -0,0 +1,62 @@
+# Copyright Niantic 2019. Patent Pending. All rights reserved.
+#
+# This software is licensed under the terms of the Monodepth2 licence
+# which allows for non-commercial use only, the full terms of which are made
+# available in the LICENSE file.
+
+from __future__ import absolute_import, division, print_function
+
+import copy
+from os import path as osp
+
+from PIL import Image
+
+from ..registry import DATASETS
+from .base import BaseDataset
+
+
+def pil_loader(path):
+ # open path as file to avoid ResourceWarning
+ # (https://github.com/python-pillow/Pillow/issues/835)
+ with open(path, 'rb') as f:
+ with Image.open(f) as img:
+ return img.convert('RGB')
+
+
+@DATASETS.register()
+class MonoDataset(BaseDataset):
+ def __init__(self,
+ file_path,
+ data_prefix,
+ pipeline,
+ num_retries=0,
+ suffix='.png',
+ **kwargs):
+ self.num_retries = num_retries
+ self.suffix = suffix
+ super().__init__(file_path, pipeline, data_prefix, **kwargs)
+
+ def load_file(self):
+ info = []
+ with open(self.file_path, 'r') as f:
+ for line in f:
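+                # each index line is expected to look like "<folder>/<frame_index>"; the image suffix is appended to build the filename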
+ filename = line.strip() + self.suffix
+ folder = osp.dirname(filename)
+ frame_index = line.strip().split('/')[1]
+ info.append(
+ dict(data_path=self.data_prefix,
+ filename=filename,
+ folder=folder,
+ frame_index=int(frame_index)))
+ return info
+
+ def prepare_train(self, idx):
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ results['imgs']['idx'] = idx
+ return results['imgs'], results['day_or_night']
+
+ def prepare_test(self, idx):
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ return results['imgs'], results['day_or_night']
diff --git a/paddlevideo/loader/dataset/skeleton.py b/paddlevideo/loader/dataset/skeleton.py
new file mode 100644
index 0000000000000000000000000000000000000000..30a3f3e70fa2869d663b0d089a8521be9bfaaf1d
--- /dev/null
+++ b/paddlevideo/loader/dataset/skeleton.py
@@ -0,0 +1,78 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import copy
+import random
+import numpy as np
+import pickle
+
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@DATASETS.register()
+class SkeletonDataset(BaseDataset):
+ """
+ Skeleton dataset for action recognition.
+    The dataset loads skeleton features and applies normalization operations.
+ Args:
+ file_path (str): Path to the index file.
+ pipeline(obj): Define the pipeline of data preprocessing.
+ data_prefix (str): directory path of the data. Default: None.
+        test_mode (bool): Whether to build the test dataset. Default: False.
+ """
+ def __init__(self, file_path, pipeline, label_path=None, test_mode=False):
+ self.label_path = label_path
+ super().__init__(file_path, pipeline, test_mode=test_mode)
+
+ def load_file(self):
+ """Load feature file to get skeleton information."""
+ logger.info("Loading data, it will take some moment...")
+ self.data = np.load(self.file_path)
+ if self.label_path:
+ if self.label_path.endswith('npy'):
+ self.label = np.load(self.label_path)
+ elif self.label_path.endswith('pkl'):
+ with open(self.label_path, 'rb') as f:
+ sample_name, self.label = pickle.load(f)
+ else:
+ logger.info(
+ "Label path not provided when test_mode={}, here just output predictions."
+ .format(self.test_mode))
+ logger.info("Data Loaded!")
+ return self.data # used for __len__
+
+ def prepare_train(self, idx):
+ """Prepare the feature for training/valid given index. """
+ results = dict()
+ results['data'] = copy.deepcopy(self.data[idx])
+ results['label'] = copy.deepcopy(self.label[idx])
+ results = self.pipeline(results)
+ return results['data'], results['label']
+
+ def prepare_test(self, idx):
+ """Prepare the feature for test given index. """
+ results = dict()
+ results['data'] = copy.deepcopy(self.data[idx])
+ if self.label_path:
+ results['label'] = copy.deepcopy(self.label[idx])
+ results = self.pipeline(results)
+ return results['data'], results['label']
+ else:
+ results = self.pipeline(results)
+ return [results['data']]
diff --git a/paddlevideo/loader/dataset/slowfast_video.py b/paddlevideo/loader/dataset/slowfast_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..1adf89c5404acd864fa9b9392e8fcae2598894af
--- /dev/null
+++ b/paddlevideo/loader/dataset/slowfast_video.py
@@ -0,0 +1,143 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import copy
+import random
+import numpy as np
+
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+@DATASETS.register()
+class SFVideoDataset(BaseDataset):
+ """Video dataset for action recognition
+    The dataset loads raw videos and applies specified transforms on them.
+
+    The index file is a file with multiple lines, and each line indicates
+    a sample video with the filepath and label, which are split by a whitespace.
+    Example of an index file:
+
+ .. code-block:: txt
+
+ path/000.mp4 1
+ path/001.mp4 1
+ path/002.mp4 2
+ path/003.mp4 2
+
+ Args:
+ file_path(str): Path to the index file.
+ pipeline(XXX): A sequence of data transforms.
+ num_ensemble_views(int): temporal segment when multi-crop test
+ num_spatial_crops(int): spatial crop number when multi-crop test
+ **kwargs: Keyword arguments for ```BaseDataset```.
+
+ """
+ def __init__(
+ self,
+ file_path,
+ pipeline,
+ num_ensemble_views=1,
+ num_spatial_crops=1,
+ num_retries=5,
+ num_samples_precise_bn=None,
+ **kwargs,
+ ):
+ self.num_ensemble_views = num_ensemble_views
+ self.num_spatial_crops = num_spatial_crops
+ self.num_retries = num_retries
+ self.num_samples_precise_bn = num_samples_precise_bn
+ super().__init__(file_path, pipeline, **kwargs)
+ #set random seed
+ random.seed(0)
+ np.random.seed(0)
+
+ def load_file(self):
+ """Load index file to get video information."""
+ info = []
+ with open(self.file_path, 'r') as fin:
+ for line in fin:
+ line_split = line.strip().split()
+ filename, labels = line_split
+ if self.data_prefix is not None:
+ filename = osp.join(self.data_prefix, filename)
+ for tidx in range(self.num_ensemble_views):
+ for sidx in range(self.num_spatial_crops):
+ info.append(
+ dict(
+ filename=filename,
+ labels=int(labels),
+ temporal_sample_index=tidx,
+ spatial_sample_index=sidx,
+ temporal_num_clips=self.num_ensemble_views,
+ spatial_num_clips=self.num_spatial_crops,
+ ))
+ return info
+
+ def prepare_train(self, idx):
+ """TRAIN & VALID. Prepare the data for training given the index."""
+ #Try to catch Exception caused by reading corrupted video file
+ short_cycle = False
+ if isinstance(idx, tuple):
+ idx, short_cycle_idx = idx
+ short_cycle = True
+ for ir in range(self.num_retries):
+ try:
+ #Multi-grid short cycle
+ if short_cycle:
+ results = copy.deepcopy(self.info[idx])
+ results['short_cycle_idx'] = short_cycle_idx
+ else:
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ except Exception as e:
+ #logger.info(e)
+ if ir < self.num_retries - 1:
+ logger.info(
+ "Error when loading {}, have {} trys, will try again".
+ format(results['filename'], ir))
+ idx = random.randint(0, len(self.info) - 1)
+ continue
+
+ return results['imgs'][0], results['imgs'][1], np.array(
+ [results['labels']])
+
+ def prepare_test(self, idx):
+ """TEST. Prepare the data for test given the index."""
+ #Try to catch Exception caused by reading corrupted video file
+ for ir in range(self.num_retries):
+ try:
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ except Exception as e:
+ logger.info(e)
+ if ir < self.num_retries - 1:
+ logger.info(
+ "Error when loading {}, have {} trys, will try again".
+ format(results['filename'], ir))
+ idx = random.randint(0, len(self.info) - 1)
+ continue
+ return results['imgs'][0], results['imgs'][1], np.array(
+ [results['labels']]), np.array([idx])
+
+ def __len__(self):
+ """get the size of the dataset."""
+ if self.num_samples_precise_bn is None:
+ return len(self.info)
+ else:
+ random.shuffle(self.info)
+ return min(self.num_samples_precise_bn, len(self.info))
diff --git a/paddlevideo/loader/dataset/video.py b/paddlevideo/loader/dataset/video.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2d8f897a2ce00796f85354161c026205fe6001e
--- /dev/null
+++ b/paddlevideo/loader/dataset/video.py
@@ -0,0 +1,95 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import copy
+import random
+import numpy as np
+
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@DATASETS.register()
+class VideoDataset(BaseDataset):
+ """Video dataset for action recognition
+    The dataset loads raw videos and applies specified transforms on them.
+    The index file is a file with multiple lines, and each line indicates
+    a sample video with the filepath and label, which are split by a whitespace.
+    Example of an index file:
+ .. code-block:: txt
+ path/000.mp4 1
+ path/001.mp4 1
+ path/002.mp4 2
+ path/003.mp4 2
+ Args:
+ file_path(str): Path to the index file.
+ pipeline(XXX): A sequence of data transforms.
+ **kwargs: Keyword arguments for ```BaseDataset```.
+ """
+ def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs):
+ self.num_retries = num_retries
+ self.suffix = suffix
+ super().__init__(file_path, pipeline, **kwargs)
+
+ def load_file(self):
+ """Load index file to get video information."""
+ info = []
+ with open(self.file_path, 'r') as fin:
+ for line in fin:
+ line_split = line.strip().split()
+ filename, labels = line_split
+                #TODO(hj): Required suffix format: may be mp4/avi/wmv
+ filename = filename + self.suffix
+ if self.data_prefix is not None:
+ filename = osp.join(self.data_prefix, filename)
+ info.append(dict(filename=filename, labels=int(labels)))
+ return info
+
+ def prepare_train(self, idx):
+ """TRAIN & VALID. Prepare the data for training/valid given the index."""
+ #Try to catch Exception caused by reading corrupted video file
+ for ir in range(self.num_retries):
+ try:
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ except Exception as e:
+ #logger.info(e)
+ if ir < self.num_retries - 1:
+ logger.info(
+ "Error when loading {}, have {} trys, will try again".
+ format(results['filename'], ir))
+ idx = random.randint(0, len(self.info) - 1)
+ continue
+ return results['imgs'], np.array([results['labels']])
+
+ def prepare_test(self, idx):
+ """TEST. Prepare the data for test given the index."""
+ #Try to catch Exception caused by reading corrupted video file
+ for ir in range(self.num_retries):
+ try:
+ results = copy.deepcopy(self.info[idx])
+ results = self.pipeline(results)
+ except Exception as e:
+ #logger.info(e)
+ if ir < self.num_retries - 1:
+ logger.info(
+ "Error when loading {}, have {} trys, will try again".
+ format(results['filename'], ir))
+ idx = random.randint(0, len(self.info) - 1)
+ continue
+ return results['imgs'], np.array([results['labels']])
diff --git a/paddlevideo/loader/pipelines/__init__.py b/paddlevideo/loader/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfd808f060d94cca3e7ced2c81bd4653cf8c26b9
--- /dev/null
+++ b/paddlevideo/loader/pipelines/__init__.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .anet_pipeline import GetMatchMap, GetVideoLabel, LoadFeat
+from .augmentations import (CenterCrop, ColorJitter, GroupRandomFlip,
+ GroupResize, Image2Array, JitterScale, MultiCrop,
+ Normalization, PackOutput, RandomCrop, RandomFlip,
+ RandomResizedCrop, Scale, TenCrop, ToArray,
+ UniformCrop)
+from .augmentations_ava import *
+from .compose import Compose
+from .decode import FeatureDecoder, FrameDecoder, VideoDecoder
+from .decode_image import ImageDecoder
+from .decode_sampler import DecodeSampler
+from .mix import Cutmix, Mixup, VideoMix
+from .multimodal import FeaturePadding, RandomCap, RandomMask, Tokenize
+from .sample import Sampler, SamplerPkl
+from .sample_ava import *
+from .segmentation import MultiNorm, MultiRestrictSize
+from .skeleton_pipeline import AutoPadding, Iden, SkeletonNorm
+from .skeleton_pipeline import SketeonCropSample, SketeonModalityTransform, RandomRotation
+from .decode_sampler_MRI import SFMRI_DecodeSampler
+from .segmentation_pipline import SegmentationSampler
+
+__all__ = [
+ 'ImageDecoder', 'RandomMask', 'UniformCrop', 'SkeletonNorm', 'Tokenize',
+ 'Sampler', 'FeatureDecoder', 'DecodeSampler', 'TenCrop', 'Compose',
+ 'AutoPadding', 'Normalization', 'Mixup', 'Image2Array', 'Scale',
+ 'GroupResize', 'VideoDecoder', 'FrameDecoder', 'PackOutput',
+ 'GetVideoLabel', 'Cutmix', 'CenterCrop', 'RandomCrop', 'LoadFeat',
+ 'RandomCap', 'JitterScale', 'Iden', 'VideoMix', 'ColorJitter', 'RandomFlip',
+ 'ToArray', 'FeaturePadding', 'GetMatchMap', 'GroupRandomFlip', 'MultiCrop',
+ 'SFMRI_DecodeSampler', 'MultiRestrictSize', 'MultiNorm',
+ 'RandomResizedCrop', 'SamplerPkl', 'SegmentationSampler',
+ 'SketeonCropSample', 'SketeonModalityTransform', 'RandomRotation'
+]
diff --git a/paddlevideo/loader/pipelines/anet_pipeline.py b/paddlevideo/loader/pipelines/anet_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..210d733b7e91eb580e151de68b550d7b9b2ae5f6
--- /dev/null
+++ b/paddlevideo/loader/pipelines/anet_pipeline.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+from ..registry import PIPELINES
+"""pipeline ops for Activity Net.
+"""
+
+
+@PIPELINES.register()
+class LoadFeat(object):
+ def __init__(self, feat_path):
+ self.feat_path = feat_path
+
+ def __call__(self, results):
+ video_name = results['video_name']
+ file_name = video_name + ".npy"
+ file_path = os.path.join(self.feat_path, file_name)
+ #TODO: check path
+ video_feat = np.load(file_path)
+ video_feat = video_feat.T
+ video_feat = video_feat.astype("float32")
+ results['video_feat'] = video_feat
+ return results
+
+
+@PIPELINES.register()
+class GetMatchMap(object):
+ def __init__(self, tscale):
+ self.tscale = tscale
+ self.tgap = 1. / self.tscale
+
+ def __call__(self, results):
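+        # enumerate every candidate proposal window [xmin, xmax] in normalized time,
+        # over all start positions and durations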
+ match_map = []
+ for idx in range(self.tscale):
+ tmp_match_window = []
+ xmin = self.tgap * idx
+ for jdx in range(1, self.tscale + 1):
+ xmax = xmin + self.tgap * jdx
+ tmp_match_window.append([xmin, xmax])
+ match_map.append(tmp_match_window)
+ match_map = np.array(match_map)
+ match_map = np.transpose(match_map, [1, 0, 2])
+ match_map = np.reshape(match_map, [-1, 2])
+
+ anchor_xmin = [self.tgap * i for i in range(self.tscale)]
+ anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)]
+
+ results['match_map'] = match_map
+ results['anchor_xmin'] = anchor_xmin
+ results['anchor_xmax'] = anchor_xmax
+ return results
+
+
+@PIPELINES.register()
+class GetVideoLabel(object):
+ def __init__(self, tscale, dscale, datatype="float32"):
+ self.tscale = tscale
+ self.dscale = dscale
+ self.tgap = 1. / self.tscale
+ self.datatype = datatype
+
+ def iou_with_anchors(self, anchors_min, anchors_max, box_min, box_max):
+ """Compute jaccard score between a box and the anchors.
+ """
+ len_anchors = anchors_max - anchors_min
+ int_xmin = np.maximum(anchors_min, box_min)
+ int_xmax = np.minimum(anchors_max, box_max)
+ inter_len = np.maximum(int_xmax - int_xmin, 0.)
+ union_len = len_anchors - inter_len + box_max - box_min
+ jaccard = np.divide(inter_len, union_len)
+ return jaccard
+
+ def ioa_with_anchors(self, anchors_min, anchors_max, box_min, box_max):
+ """Compute intersection between score a box and the anchors.
+ """
+ len_anchors = anchors_max - anchors_min
+ int_xmin = np.maximum(anchors_min, box_min)
+ int_xmax = np.minimum(anchors_max, box_max)
+ inter_len = np.maximum(int_xmax - int_xmin, 0.)
+ scores = np.divide(inter_len, len_anchors)
+ return scores
+
+ def __call__(self, results):
+ video_info = results['video_info']
+ match_map = results['match_map']
+ anchor_xmin = results['anchor_xmin']
+ anchor_xmax = results['anchor_xmax']
+
+ video_second = video_info['duration_second']
+ video_labels = video_info['annotations']
+
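+        # normalize every annotated segment to [0, 1] and compute its IoU with each candidate window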
+ gt_bbox = []
+ gt_iou_map = []
+ for gt in video_labels:
+ tmp_start = max(min(1, gt["segment"][0] / video_second), 0)
+ tmp_end = max(min(1, gt["segment"][1] / video_second), 0)
+ gt_bbox.append([tmp_start, tmp_end])
+ tmp_gt_iou_map = self.iou_with_anchors(match_map[:, 0],
+ match_map[:, 1], tmp_start,
+ tmp_end)
+ tmp_gt_iou_map = np.reshape(tmp_gt_iou_map,
+ [self.dscale, self.tscale])
+ gt_iou_map.append(tmp_gt_iou_map)
+ gt_iou_map = np.array(gt_iou_map)
+ gt_iou_map = np.max(gt_iou_map, axis=0)
+
+ gt_bbox = np.array(gt_bbox)
+ gt_xmins = gt_bbox[:, 0]
+ gt_xmaxs = gt_bbox[:, 1]
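+        # expand each gt start/end point into a small region of width 3/tscale for boundary matching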
+ gt_len_small = 3 * self.tgap
+ gt_start_bboxs = np.stack(
+ (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
+ gt_end_bboxs = np.stack(
+ (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)
+
+ match_score_start = []
+ for jdx in range(len(anchor_xmin)):
+ match_score_start.append(
+ np.max(
+ self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
+ gt_start_bboxs[:, 0],
+ gt_start_bboxs[:, 1])))
+ match_score_end = []
+ for jdx in range(len(anchor_xmin)):
+ match_score_end.append(
+ np.max(
+ self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
+ gt_end_bboxs[:, 0], gt_end_bboxs[:,
+ 1])))
+
+ gt_start = np.array(match_score_start)
+ gt_end = np.array(match_score_end)
+
+ results['gt_iou_map'] = gt_iou_map.astype(self.datatype)
+ results['gt_start'] = gt_start.astype(self.datatype)
+ results['gt_end'] = gt_end.astype(self.datatype)
+ return results
diff --git a/paddlevideo/loader/pipelines/augmentations.py b/paddlevideo/loader/pipelines/augmentations.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a9b3cefbe3563c9cb5754da3aa8eac7b738f3dd
--- /dev/null
+++ b/paddlevideo/loader/pipelines/augmentations.py
@@ -0,0 +1,1026 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import random
+from collections.abc import Sequence
+
+import cv2
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from PIL import Image
+
+from ..registry import PIPELINES
+
+
+@PIPELINES.register()
+class Scale(object):
+ """
+ Scale images.
+ Args:
+        short_size(float | int): The short side of each image will be scaled to short_size.
+ fixed_ratio(bool): Set whether to zoom according to a fixed ratio. default: True
+ do_round(bool): Whether to round up when calculating the zoom ratio. default: False
+ backend(str): Choose pillow or cv2 as the graphics processing backend. default: 'pillow'
+ """
+ def __init__(self,
+ short_size,
+ fixed_ratio=True,
+ keep_ratio=None,
+ do_round=False,
+ backend='pillow'):
+ self.short_size = short_size
+ assert (fixed_ratio and not keep_ratio) or (not fixed_ratio), \
+ f"fixed_ratio and keep_ratio cannot be true at the same time"
+ self.fixed_ratio = fixed_ratio
+ self.keep_ratio = keep_ratio
+ self.do_round = do_round
+
+ assert backend in [
+ 'pillow', 'cv2'
+ ], f"Scale's backend must be pillow or cv2, but get {backend}"
+ self.backend = backend
+
+ def __call__(self, results):
+ """
+ Performs resize operations.
+ Args:
+ imgs (Sequence[PIL.Image]): List where each item is a PIL.Image.
+ For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
+ return:
+ resized_imgs: List where each item is a PIL.Image after scaling.
+ """
+ imgs = results['imgs']
+ resized_imgs = []
+ for i in range(len(imgs)):
+ img = imgs[i]
+ if isinstance(img, np.ndarray):
+ h, w, _ = img.shape
+ elif isinstance(img, Image.Image):
+ w, h = img.size
+ else:
+ raise NotImplementedError
+
+ if w <= h:
+ ow = self.short_size
+ if self.fixed_ratio:
+ oh = int(self.short_size * 4.0 / 3.0)
+ elif self.keep_ratio is False:
+ oh = self.short_size
+ else:
+ scale_factor = self.short_size / w
+ oh = int(h * float(scale_factor) +
+ 0.5) if self.do_round else int(h *
+ self.short_size / w)
+ ow = int(w * float(scale_factor) +
+ 0.5) if self.do_round else self.short_size
+ else:
+ oh = self.short_size
+ if self.fixed_ratio:
+ ow = int(self.short_size * 4.0 / 3.0)
+ elif self.keep_ratio is False:
+ ow = self.short_size
+ else:
+ scale_factor = self.short_size / h
+ oh = int(h * float(scale_factor) +
+ 0.5) if self.do_round else self.short_size
+ ow = int(w * float(scale_factor) +
+ 0.5) if self.do_round else int(w *
+ self.short_size / h)
+ if self.backend == 'pillow':
+ resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))
+ elif self.backend == 'cv2' and (self.keep_ratio is not None):
+ resized_imgs.append(
+ cv2.resize(img, (ow, oh), interpolation=cv2.INTER_LINEAR))
+ else:
+ resized_imgs.append(
+ Image.fromarray(
+ cv2.resize(np.asarray(img), (ow, oh),
+ interpolation=cv2.INTER_LINEAR)))
+ results['imgs'] = resized_imgs
+ return results
+
+
+@PIPELINES.register()
+class RandomCrop(object):
+ """
+ Random crop images.
+ Args:
+ target_size(int): Random crop a square with the target_size from an image.
+ """
+ def __init__(self, target_size):
+ self.target_size = target_size
+
+ def __call__(self, results):
+ """
+ Performs random crop operations.
+ Args:
+ imgs: List where each item is a PIL.Image.
+ For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
+ return:
+ crop_imgs: List where each item is a PIL.Image after random crop.
+ """
+ imgs = results['imgs']
+ if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w]
+ h, w = imgs.shape[2:]
+ else:
+ w, h = imgs[0].size
+ th, tw = self.target_size, self.target_size
+
+ assert (w >= self.target_size) and (h >= self.target_size), \
+ "image width({}) and height({}) should be larger than crop size".format(
+ w, h, self.target_size)
+
+ crop_images = []
+ if 'backend' in results and results['backend'] == 'pyav':
+ x1 = np.random.randint(0, w - tw)
+ y1 = np.random.randint(0, h - th)
+ crop_images = imgs[:, :, y1:y1 + th, x1:x1 + tw] # [C, T, th, tw]
+ else:
+ x1 = random.randint(0, w - tw)
+ y1 = random.randint(0, h - th)
+ for img in imgs:
+ if w == tw and h == th:
+ crop_images.append(img)
+ else:
+ crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))
+ results['imgs'] = crop_images
+ return results
+
+
+@PIPELINES.register()
+class RandomResizedCrop(RandomCrop):
+ def __init__(self,
+ area_range=(0.08, 1.0),
+ aspect_ratio_range=(3 / 4, 4 / 3),
+ target_size=224,
+ backend='cv2'):
+
+ self.area_range = area_range
+ self.aspect_ratio_range = aspect_ratio_range
+ self.target_size = target_size
+ self.backend = backend
+
+ @staticmethod
+ def get_crop_bbox(img_shape,
+ area_range,
+ aspect_ratio_range,
+ max_attempts=10):
+
+ assert 0 < area_range[0] <= area_range[1] <= 1
+ assert 0 < aspect_ratio_range[0] <= aspect_ratio_range[1]
+
+ img_h, img_w = img_shape
+ area = img_h * img_w
+
+ min_ar, max_ar = aspect_ratio_range
+ aspect_ratios = np.exp(
+ np.random.uniform(np.log(min_ar), np.log(max_ar),
+ size=max_attempts))
+ target_areas = np.random.uniform(*area_range, size=max_attempts) * area
+ candidate_crop_w = np.round(np.sqrt(target_areas *
+ aspect_ratios)).astype(np.int32)
+ candidate_crop_h = np.round(np.sqrt(target_areas /
+ aspect_ratios)).astype(np.int32)
+
+ for i in range(max_attempts):
+ crop_w = candidate_crop_w[i]
+ crop_h = candidate_crop_h[i]
+ if crop_h <= img_h and crop_w <= img_w:
+ x_offset = random.randint(0, img_w - crop_w)
+ y_offset = random.randint(0, img_h - crop_h)
+ return x_offset, y_offset, x_offset + crop_w, y_offset + crop_h
+
+ # Fallback
+ crop_size = min(img_h, img_w)
+ x_offset = (img_w - crop_size) // 2
+ y_offset = (img_h - crop_size) // 2
+ return x_offset, y_offset, x_offset + crop_size, y_offset + crop_size
+
+ def __call__(self, results):
+ imgs = results['imgs']
+ if self.backend == 'pillow':
+ img_w, img_h = imgs[0].size
+ elif self.backend == 'cv2':
+ img_h, img_w, _ = imgs[0].shape
+ elif self.backend == 'pyav':
+ img_h, img_w = imgs.shape[2:] # [cthw]
+ else:
+ raise NotImplementedError
+
+ left, top, right, bottom = self.get_crop_bbox(
+ (img_h, img_w), self.area_range, self.aspect_ratio_range)
+
+ if self.backend == 'pillow':
+ img_w, img_h = imgs[0].size
+            imgs = [img.crop((left, top, right, bottom)) for img in imgs]
+ elif self.backend == 'cv2':
+ img_h, img_w, _ = imgs[0].shape
+ imgs = [img[top:bottom, left:right] for img in imgs]
+ elif self.backend == 'pyav':
+ img_h, img_w = imgs.shape[2:] # [cthw]
+ imgs = imgs[:, :, top:bottom, left:right]
+ else:
+ raise NotImplementedError
+ results['imgs'] = imgs
+ return results
+
+
+@PIPELINES.register()
+class CenterCrop(object):
+ """
+ Center crop images.
+ Args:
+ target_size(int): Center crop a square with the target_size from an image.
+ do_round(bool): Whether to round up the coordinates of the upper left corner of the cropping area. default: True
+ """
+ def __init__(self, target_size, do_round=True, backend='pillow'):
+ self.target_size = target_size
+ self.do_round = do_round
+ self.backend = backend
+
+ def __call__(self, results):
+ """
+ Performs Center crop operations.
+ Args:
+ imgs: List where each item is a PIL.Image.
+ For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
+ return:
+ ccrop_imgs: List where each item is a PIL.Image after Center crop.
+ """
+ imgs = results['imgs']
+ ccrop_imgs = []
+ th, tw = self.target_size, self.target_size
+ if isinstance(imgs, paddle.Tensor):
+ h, w = imgs.shape[-2:]
+ x1 = int(round((w - tw) / 2.0)) if self.do_round else (w - tw) // 2
+ y1 = int(round((h - th) / 2.0)) if self.do_round else (h - th) // 2
+ ccrop_imgs = imgs[:, :, y1:y1 + th, x1:x1 + tw]
+ else:
+ for img in imgs:
+ if self.backend == 'pillow':
+ w, h = img.size
+ elif self.backend == 'cv2':
+ h, w, _ = img.shape
+ else:
+ raise NotImplementedError
+ assert (w >= self.target_size) and (h >= self.target_size), \
+ "image width({}) and height({}) should be larger than crop size".format(
+ w, h, self.target_size)
+ x1 = int(round(
+ (w - tw) / 2.0)) if self.do_round else (w - tw) // 2
+ y1 = int(round(
+ (h - th) / 2.0)) if self.do_round else (h - th) // 2
+ if self.backend == 'cv2':
+ ccrop_imgs.append(img[y1:y1 + th, x1:x1 + tw])
+ elif self.backend == 'pillow':
+ ccrop_imgs.append(img.crop((x1, y1, x1 + tw, y1 + th)))
+ results['imgs'] = ccrop_imgs
+ return results
+
+
+@PIPELINES.register()
+class MultiScaleCrop(object):
+ """
+    Randomly crop images with multi-scale sizes.
+    Args:
+        target_size(int): Random crop a square with the target_size from an image.
+        scales(list): List of candidate cropping scales.
+        max_distort(int): Maximum allowed distance between the scale indices chosen for crop width and height.
+        fix_crop(bool): Whether to pick the crop offset from a fixed set of candidate positions.
+        allow_duplication(bool): Whether to allow duplicate candidate crop starting points.
+        more_fix_crop(bool): Whether to allow more candidate crop starting points.
+ """
+ def __init__(
+ self,
+ target_size, # NOTE: named target size now, but still pass short size in it!
+ scales=None,
+ max_distort=1,
+ fix_crop=True,
+ allow_duplication=False,
+ more_fix_crop=True,
+ backend='pillow'):
+
+ self.target_size = target_size
+ self.scales = scales if scales else [1, .875, .75, .66]
+ self.max_distort = max_distort
+ self.fix_crop = fix_crop
+ self.allow_duplication = allow_duplication
+ self.more_fix_crop = more_fix_crop
+ assert backend in [
+ 'pillow', 'cv2'
+ ], f"MultiScaleCrop's backend must be pillow or cv2, but get {backend}"
+ self.backend = backend
+
+ def __call__(self, results):
+ """
+ Performs MultiScaleCrop operations.
+ Args:
+            imgs: List where each item is a PIL.Image.
+        return:
+            results: dict with the cropped and resized images stored in 'imgs'.
+
+ """
+ imgs = results['imgs']
+
+ input_size = [self.target_size, self.target_size]
+
+ im_size = imgs[0].size
+
+ # get random crop offset
+ def _sample_crop_size(im_size):
+ image_w, image_h = im_size[0], im_size[1]
+
+ base_size = min(image_w, image_h)
+ crop_sizes = [int(base_size * x) for x in self.scales]
+ crop_h = [
+ input_size[1] if abs(x - input_size[1]) < 3 else x
+ for x in crop_sizes
+ ]
+ crop_w = [
+ input_size[0] if abs(x - input_size[0]) < 3 else x
+ for x in crop_sizes
+ ]
+
+ pairs = []
+ for i, h in enumerate(crop_h):
+ for j, w in enumerate(crop_w):
+ if abs(i - j) <= self.max_distort:
+ pairs.append((w, h))
+ crop_pair = random.choice(pairs)
+ if not self.fix_crop:
+ w_offset = random.randint(0, image_w - crop_pair[0])
+ h_offset = random.randint(0, image_h - crop_pair[1])
+ else:
+ w_step = (image_w - crop_pair[0]) / 4
+ h_step = (image_h - crop_pair[1]) / 4
+
+ ret = list()
+ ret.append((0, 0)) # upper left
+ if self.allow_duplication or w_step != 0:
+ ret.append((4 * w_step, 0)) # upper right
+ if self.allow_duplication or h_step != 0:
+ ret.append((0, 4 * h_step)) # lower left
+ if self.allow_duplication or (h_step != 0 and w_step != 0):
+ ret.append((4 * w_step, 4 * h_step)) # lower right
+ if self.allow_duplication or (h_step != 0 or w_step != 0):
+ ret.append((2 * w_step, 2 * h_step)) # center
+
+ if self.more_fix_crop:
+ ret.append((0, 2 * h_step)) # center left
+ ret.append((4 * w_step, 2 * h_step)) # center right
+ ret.append((2 * w_step, 4 * h_step)) # lower center
+ ret.append((2 * w_step, 0 * h_step)) # upper center
+
+ ret.append((1 * w_step, 1 * h_step)) # upper left quarter
+ ret.append((3 * w_step, 1 * h_step)) # upper right quarter
+ ret.append((1 * w_step, 3 * h_step)) # lower left quarter
+                    ret.append((3 * w_step, 3 * h_step))  # lower right quarter
+
+ w_offset, h_offset = random.choice(ret)
+
+ return crop_pair[0], crop_pair[1], w_offset, h_offset
+
+ crop_w, crop_h, offset_w, offset_h = _sample_crop_size(im_size)
+ crop_img_group = [
+ img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h))
+ for img in imgs
+ ]
+ if self.backend == 'pillow':
+ ret_img_group = [
+ img.resize((input_size[0], input_size[1]), Image.BILINEAR)
+ for img in crop_img_group
+ ]
+ else:
+ ret_img_group = [
+ Image.fromarray(
+ cv2.resize(np.asarray(img),
+ dsize=(input_size[0], input_size[1]),
+ interpolation=cv2.INTER_LINEAR))
+ for img in crop_img_group
+ ]
+ results['imgs'] = ret_img_group
+ return results
+
+
+@PIPELINES.register()
+class RandomFlip(object):
+ """
+ Random Flip images.
+ Args:
+ p(float): Random flip images with the probability p.
+ """
+ def __init__(self, p=0.5):
+ self.p = p
+
+ def __call__(self, results):
+ """
+ Performs random flip operations.
+ Args:
+ imgs: List where each item is a PIL.Image.
+ For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
+ return:
+ flip_imgs: List where each item is a PIL.Image after random flip.
+ """
+ imgs = results['imgs']
+ v = random.random()
+ if v < self.p:
+ if isinstance(imgs, paddle.Tensor):
+ results['imgs'] = paddle.flip(imgs, axis=[3])
+ elif isinstance(imgs[0], np.ndarray):
+ results['imgs'] = [cv2.flip(img, 1, img) for img in imgs
+ ] # [[h,w,c], [h,w,c], ..., [h,w,c]]
+ else:
+ results['imgs'] = [
+ img.transpose(Image.FLIP_LEFT_RIGHT) for img in imgs
+ ]
+ else:
+ results['imgs'] = imgs
+ return results
+
+
+@PIPELINES.register()
+class Image2Array(object):
+ """
+    Transfer a list of PIL.Image to a numpy array and transpose the dimensions from 'thwc' to 'tchw' (or 'cthw').
+ Args:
+ transpose: whether to transpose or not, default True, False for slowfast.
+ """
+ def __init__(self, transpose=True, data_format='tchw'):
+ assert data_format in [
+ 'tchw', 'cthw'
+ ], f"Target format must in ['tchw', 'cthw'], but got {data_format}"
+ self.transpose = transpose
+ self.data_format = data_format
+
+ def __call__(self, results):
+ """
+ Performs Image to NumpyArray operations.
+ Args:
+ imgs: List where each item is a PIL.Image.
+ For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
+ return:
+ np_imgs: Numpy array.
+ """
+ imgs = results['imgs']
+ if 'backend' in results and results[
+ 'backend'] == 'pyav': # [T,H,W,C] in [0, 1]
+ if self.transpose:
+ if self.data_format == 'tchw':
+ t_imgs = imgs.transpose((0, 3, 1, 2)) # tchw
+ else:
+ t_imgs = imgs.transpose((3, 0, 1, 2)) # cthw
+ results['imgs'] = t_imgs
+ else:
+ t_imgs = np.stack(imgs).astype('float32')
+ if self.transpose:
+ if self.data_format == 'tchw':
+ t_imgs = t_imgs.transpose(0, 3, 1, 2) # tchw
+ else:
+ t_imgs = t_imgs.transpose(3, 0, 1, 2) # cthw
+ results['imgs'] = t_imgs
+ return results
+
+
+@PIPELINES.register()
+class Normalization(object):
+ """
+ Normalization.
+ Args:
+ mean(Sequence[float]): mean values of different channels.
+ std(Sequence[float]): std values of different channels.
+ tensor_shape(list): size of mean, default [3,1,1]. For slowfast, [1,1,1,3]
+ """
+ def __init__(self, mean, std, tensor_shape=[3, 1, 1], inplace=False):
+ if not isinstance(mean, Sequence):
+ raise TypeError(
+ f'Mean must be list, tuple or np.ndarray, but got {type(mean)}')
+ if not isinstance(std, Sequence):
+ raise TypeError(
+ f'Std must be list, tuple or np.ndarray, but got {type(std)}')
+
+ self.inplace = inplace
+ if not inplace:
+ self.mean = np.array(mean).reshape(tensor_shape).astype(np.float32)
+ self.std = np.array(std).reshape(tensor_shape).astype(np.float32)
+ else:
+ self.mean = np.array(mean, dtype=np.float32)
+ self.std = np.array(std, dtype=np.float32)
+
+ def __call__(self, results):
+ """
+ Performs normalization operations.
+ Args:
+ imgs: Numpy array.
+ return:
+ np_imgs: Numpy array after normalization.
+ """
+ if self.inplace:
+ n = len(results['imgs'])
+ h, w, c = results['imgs'][0].shape
+ norm_imgs = np.empty((n, h, w, c), dtype=np.float32)
+ for i, img in enumerate(results['imgs']):
+ norm_imgs[i] = img
+
+ for img in norm_imgs: # [n,h,w,c]
+ mean = np.float64(self.mean.reshape(1, -1)) # [1, 3]
+ stdinv = 1 / np.float64(self.std.reshape(1, -1)) # [1, 3]
+ cv2.subtract(img, mean, img)
+ cv2.multiply(img, stdinv, img)
+ else:
+ imgs = results['imgs']
+ norm_imgs = imgs / 255.0
+ norm_imgs -= self.mean
+ norm_imgs /= self.std
+ if 'backend' in results and results['backend'] == 'pyav':
+ norm_imgs = paddle.to_tensor(norm_imgs, dtype=paddle.float32)
+ results['imgs'] = norm_imgs
+ return results
+
+
+@PIPELINES.register()
+class JitterScale(object):
+ """
+    Scale images, where the target short size is randomly selected between min_size and max_size.
+ Args:
+ min_size: Lower bound for random sampler.
+        max_size: Upper bound for random sampler.
+ """
+ def __init__(self,
+ min_size,
+ max_size,
+ short_cycle_factors=[0.5, 0.7071],
+ default_min_size=256):
+ self.default_min_size = default_min_size
+ self.orig_min_size = self.min_size = min_size
+ self.max_size = max_size
+ self.short_cycle_factors = short_cycle_factors
+
+ def __call__(self, results):
+ """
+ Performs jitter resize operations.
+ Args:
+ imgs (Sequence[PIL.Image]): List where each item is a PIL.Image.
+ For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
+ return:
+ resized_imgs: List where each item is a PIL.Image after scaling.
+ """
+ short_cycle_idx = results.get('short_cycle_idx')
+ if short_cycle_idx in [0, 1]:
+ self.min_size = int(
+ round(self.short_cycle_factors[short_cycle_idx] *
+ self.default_min_size))
+ else:
+ self.min_size = self.orig_min_size
+
+ imgs = results['imgs']
+ size = int(round(np.random.uniform(self.min_size, self.max_size)))
+ assert (len(imgs) >= 1), \
+ "len(imgs):{} should be larger than 1".format(len(imgs))
+
+ if 'backend' in results and results['backend'] == 'pyav':
+ height, width = imgs.shape[2:]
+ else:
+ width, height = imgs[0].size
+ if (width <= height and width == size) or (height <= width
+ and height == size):
+ return results
+
+ new_width = size
+ new_height = size
+ if width < height:
+ new_height = int(math.floor((float(height) / width) * size))
+ else:
+ new_width = int(math.floor((float(width) / height) * size))
+
+ if 'backend' in results and results['backend'] == 'pyav':
+ frames_resize = F.interpolate(imgs,
+ size=(new_height, new_width),
+ mode="bilinear",
+ align_corners=False) # [c,t,h,w]
+ else:
+ frames_resize = []
+ for j in range(len(imgs)):
+ img = imgs[j]
+ scale_img = img.resize((new_width, new_height), Image.BILINEAR)
+ frames_resize.append(scale_img)
+
+ results['imgs'] = frames_resize
+ return results
+
+
+@PIPELINES.register()
+class MultiCrop(object):
+ """
+ Random crop image.
+ This operation can perform multi-crop during multi-clip test, as in slowfast model.
+ Args:
+ target_size(int): Random crop a square with the target_size from an image.
+ """
+ def __init__(self,
+ target_size,
+ default_crop_size=224,
+ short_cycle_factors=[0.5, 0.7071],
+ test_mode=False):
+ self.orig_target_size = self.target_size = target_size
+ self.short_cycle_factors = short_cycle_factors
+ self.default_crop_size = default_crop_size
+ self.test_mode = test_mode
+
+ def __call__(self, results):
+ """
+ Performs random crop operations.
+ Args:
+ imgs: List where each item is a PIL.Image.
+ For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
+ return:
+ crop_imgs: List where each item is a PIL.Image after random crop.
+ """
+ imgs = results['imgs']
+ spatial_sample_index = results['spatial_sample_index']
+ spatial_num_clips = results['spatial_num_clips']
+
+ short_cycle_idx = results.get('short_cycle_idx')
+ if short_cycle_idx in [0, 1]:
+ self.target_size = int(
+ round(self.short_cycle_factors[short_cycle_idx] *
+ self.default_crop_size))
+ else:
+ self.target_size = self.orig_target_size # use saved value before call
+
+ w, h = imgs[0].size
+ if w == self.target_size and h == self.target_size:
+ return results
+
+ assert (w >= self.target_size) and (h >= self.target_size), \
+ "image width({}) and height({}) should be larger than crop size({},{})".format(w, h, self.target_size, self.target_size)
+ frames_crop = []
+ if not self.test_mode:
+ x_offset = random.randint(0, w - self.target_size)
+ y_offset = random.randint(0, h - self.target_size)
+ else: # multi-crop
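+            # pick the crop indexed by spatial_sample_index from spatial_num_clips positions evenly spaced along the longer side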
+ x_gap = int(
+ math.ceil((w - self.target_size) / (spatial_num_clips - 1)))
+ y_gap = int(
+ math.ceil((h - self.target_size) / (spatial_num_clips - 1)))
+ if h > w:
+ x_offset = int(math.ceil((w - self.target_size) / 2))
+ if spatial_sample_index == 0:
+ y_offset = 0
+ elif spatial_sample_index == spatial_num_clips - 1:
+ y_offset = h - self.target_size
+ else:
+ y_offset = y_gap * spatial_sample_index
+ else:
+ y_offset = int(math.ceil((h - self.target_size) / 2))
+ if spatial_sample_index == 0:
+ x_offset = 0
+ elif spatial_sample_index == spatial_num_clips - 1:
+ x_offset = w - self.target_size
+ else:
+ x_offset = x_gap * spatial_sample_index
+
+ for img in imgs:
+ nimg = img.crop((x_offset, y_offset, x_offset + self.target_size,
+ y_offset + self.target_size))
+ frames_crop.append(nimg)
+ results['imgs'] = frames_crop
+ return results
+
+
+@PIPELINES.register()
+class PackOutput(object):
+ """
+ In slowfast model, we want to get slow pathway from fast pathway based on
+ alpha factor.
+ Args:
+        alpha(int): ratio of the temporal lengths of the fast and slow pathways.
+ """
+ def __init__(self, alpha):
+ self.alpha = alpha
+
+ def __call__(self, results):
+ fast_pathway = results['imgs']
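+        # the slow pathway subsamples 1/alpha of the fast pathway frames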
+
+ # sample num points between start and end
+ slow_idx_start = 0
+ slow_idx_end = fast_pathway.shape[0] - 1
+ slow_idx_num = fast_pathway.shape[0] // self.alpha
+ slow_idxs_select = np.linspace(slow_idx_start, slow_idx_end,
+ slow_idx_num).astype("int64")
+ slow_pathway = fast_pathway[slow_idxs_select]
+
+ # T H W C -> C T H W.
+ slow_pathway = slow_pathway.transpose(3, 0, 1, 2)
+ fast_pathway = fast_pathway.transpose(3, 0, 1, 2)
+
+ # slow + fast
+ frames_list = [slow_pathway, fast_pathway]
+ results['imgs'] = frames_list
+ return results
+
+
+@PIPELINES.register()
+class GroupFullResSample(object):
+ def __init__(self, crop_size, flip=False):
+ self.crop_size = crop_size if not isinstance(crop_size, int) else (
+ crop_size, crop_size)
+ self.flip = flip
+
+ def __call__(self, results):
+ img_group = results['imgs']
+
+ image_w, image_h = img_group[0].size
+ crop_w, crop_h = self.crop_size
+
+ w_step = (image_w - crop_w) // 4
+ h_step = (image_h - crop_h) // 4
+
+ offsets = list()
+ offsets.append((0 * w_step, 2 * h_step)) # left
+ offsets.append((4 * w_step, 2 * h_step)) # right
+ offsets.append((2 * w_step, 2 * h_step)) # center
+
+ oversample_group = list()
+ for o_w, o_h in offsets:
+ normal_group = list()
+ flip_group = list()
+ for i, img in enumerate(img_group):
+ crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
+ normal_group.append(crop)
+ if self.flip:
+ flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)
+ flip_group.append(flip_crop)
+
+ oversample_group.extend(normal_group)
+ if self.flip:
+ oversample_group.extend(flip_group)
+
+ results['imgs'] = oversample_group
+ return results
+
+
+@PIPELINES.register()
+class TenCrop:
+ """
+    Crop 5 regions (the 4 corners plus the center) from each image,
+    then flip each crop to obtain 10 cropped images, which makes the prediction more robust.
+ Args:
+ target_size(int | tuple[int]): (w, h) of target size for crop.
+ """
+ def __init__(self, target_size):
+ self.target_size = (target_size, target_size)
+
+ def __call__(self, results):
+ imgs = results['imgs']
+ img_w, img_h = imgs[0].size
+ crop_w, crop_h = self.target_size
+ w_step = (img_w - crop_w) // 4
+ h_step = (img_h - crop_h) // 4
+ offsets = [
+ (0, 0),
+ (4 * w_step, 0),
+ (0, 4 * h_step),
+ (4 * w_step, 4 * h_step),
+ (2 * w_step, 2 * h_step),
+ ]
+ img_crops = list()
+ for x_offset, y_offset in offsets:
+ crop = [
+ img.crop(
+ (x_offset, y_offset, x_offset + crop_w, y_offset + crop_h))
+ for img in imgs
+ ]
+ crop_fliped = [
+ timg.transpose(Image.FLIP_LEFT_RIGHT) for timg in crop
+ ]
+ img_crops.extend(crop)
+ img_crops.extend(crop_fliped)
+
+ results['imgs'] = img_crops
+ return results
+
+
+@PIPELINES.register()
+class UniformCrop:
+ """
+    Perform uniform spatial sampling on the images:
+    crop 3 regions along the long side, at its two ends and the middle (left/middle/right or top/middle/bottom).
+ Args:
+ target_size(int | tuple[int]): (w, h) of target size for crop.
+ """
+ def __init__(self, target_size, backend='cv2'):
+ if isinstance(target_size, tuple):
+ self.target_size = target_size
+ elif isinstance(target_size, int):
+ self.target_size = (target_size, target_size)
+ else:
+ raise TypeError(
+ f'target_size must be int or tuple[int], but got {type(target_size)}'
+ )
+ self.backend = backend
+
+ def __call__(self, results):
+
+ imgs = results['imgs']
+ if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w]
+ img_h, img_w = imgs.shape[2:]
+ elif self.backend == 'pillow':
+ img_w, img_h = imgs[0].size
+ else:
+ img_h, img_w = imgs[0].shape[:2]
+
+ crop_w, crop_h = self.target_size
+ if crop_h == img_h:
+ w_step = (img_w - crop_w) // 2
+ offsets = [
+ (0, 0),
+ (w_step * 2, 0),
+ (w_step, 0),
+ ]
+ elif crop_w == img_w:
+ h_step = (img_h - crop_h) // 2
+ offsets = [
+ (0, 0),
+ (0, h_step * 2),
+ (0, h_step),
+ ]
+ else:
+ raise ValueError(
+ f"img_w({img_w}) == crop_w({crop_w}) or img_h({img_h}) == crop_h({crop_h})"
+ )
+ img_crops = []
+ if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w]
+ for x_offset, y_offset in offsets:
+ crop = imgs[:, :, y_offset:y_offset + crop_h,
+ x_offset:x_offset + crop_w]
+ img_crops.append(crop)
+ img_crops = paddle.concat(img_crops, axis=1)
+ else:
+ if self.backend == 'pillow':
+ for x_offset, y_offset in offsets:
+ crop = [
+ img.crop((x_offset, y_offset, x_offset + crop_w,
+ y_offset + crop_h)) for img in imgs
+ ]
+ img_crops.extend(crop)
+ else:
+ for x_offset, y_offset in offsets:
+ crop = [
+ img[y_offset:y_offset + crop_h,
+ x_offset:x_offset + crop_w] for img in imgs
+ ]
+ img_crops.extend(crop)
+ results['imgs'] = img_crops
+ return results
+
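+# Worked example (illustrative only): UniformCrop expects the short side of the
+# frames to already equal the crop size. With 224x398 (HxW) frames and
+# target_size=224, crop_h == img_h, so
+#   w_step  = (398 - 224) // 2 = 87
+#   offsets = [(0, 0), (174, 0), (87, 0)]   # left, right and center crops
+# giving 3 spatial views of 224x224 per frame; shapes where neither side
+# matches the crop size raise ValueError.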
+
+@PIPELINES.register()
+class GroupResize(object):
+ def __init__(self, height, width, scale, K, mode='train'):
+ self.height = height
+ self.width = width
+ self.scale = scale
+ self.resize = {}
+ self.K = np.array(K, dtype=np.float32)
+ self.mode = mode
+ for i in range(self.scale):
+ s = 2**i
+ self.resize[i] = paddle.vision.transforms.Resize(
+ (self.height // s, self.width // s), interpolation='lanczos')
+
+ def __call__(self, results):
+ if self.mode == 'infer':
+ imgs = results['imgs']
+ for k in list(imgs): # ("color", 0, -1)
+ if "color" in k or "color_n" in k:
+ n, im, _ = k
+ for i in range(self.scale):
+ imgs[(n, im, i)] = self.resize[i](imgs[(n, im, i - 1)])
+ else:
+ imgs = results['imgs']
+ for scale in range(self.scale):
+ K = self.K.copy()
+
+ K[0, :] *= self.width // (2**scale)
+ K[1, :] *= self.height // (2**scale)
+
+ inv_K = np.linalg.pinv(K)
+ imgs[("K", scale)] = K
+ imgs[("inv_K", scale)] = inv_K
+
+ for k in list(imgs):
+ if "color" in k or "color_n" in k:
+ n, im, i = k
+ for i in range(self.scale):
+ imgs[(n, im, i)] = self.resize[i](imgs[(n, im, i - 1)])
+
+ results['imgs'] = imgs
+ return results
+
+
+@PIPELINES.register()
+class ColorJitter(object):
+ """Randomly change the brightness, contrast, saturation and hue of an image.
+ """
+ def __init__(self,
+ brightness=0,
+ contrast=0,
+ saturation=0,
+ hue=0,
+ mode='train',
+ p=0.5,
+ keys=None):
+ self.mode = mode
+ self.colorjitter = paddle.vision.transforms.ColorJitter(
+ brightness, contrast, saturation, hue)
+ self.p = p
+
+ def __call__(self, results):
+ """
+ Args:
+ results (PIL Image): Input image.
+
+ Returns:
+ PIL Image: Color jittered image.
+ """
+
+ do_color_aug = random.random() > self.p
+ imgs = results['imgs']
+ for k in list(imgs):
+ f = imgs[k]
+ if "color" in k or "color_n" in k:
+ n, im, i = k
+ imgs[(n, im, i)] = f
+ if do_color_aug:
+ imgs[(n + "_aug", im, i)] = self.colorjitter(f)
+ else:
+ imgs[(n + "_aug", im, i)] = f
+ if self.mode == "train":
+ for i in results['frame_idxs']:
+ del imgs[("color", i, -1)]
+ del imgs[("color_aug", i, -1)]
+ del imgs[("color_n", i, -1)]
+ del imgs[("color_n_aug", i, -1)]
+ else:
+ for i in results['frame_idxs']:
+ del imgs[("color", i, -1)]
+ del imgs[("color_aug", i, -1)]
+
+        results['imgs'] = imgs
+ return results
+
+
+@PIPELINES.register()
+class GroupRandomFlip(object):
+ def __init__(self, p=0.5):
+ self.p = p
+
+ def __call__(self, results):
+
+ imgs = results['imgs']
+ do_flip = random.random() > self.p
+ if do_flip:
+ for k in list(imgs):
+ if "color" in k or "color_n" in k:
+ n, im, i = k
+ imgs[(n, im,
+ i)] = imgs[(n, im,
+ i)].transpose(Image.FLIP_LEFT_RIGHT)
+ if "depth_gt" in imgs:
+ imgs['depth_gt'] = np.array(np.fliplr(imgs['depth_gt']))
+
+ results['imgs'] = imgs
+ return results
+
+
+@PIPELINES.register()
+class ToArray(object):
+ def __init__(self):
+ pass
+
+ def __call__(self, results):
+ imgs = results['imgs']
+ for k in list(imgs):
+ if "color" in k or "color_n" in k or "color_aug" in k or "color_n_aug" in k:
+ n, im, i = k
+ imgs[(n, im,
+ i)] = np.array(imgs[(n, im, i)]).astype('float32') / 255.0
+ imgs[(n, im, i)] = imgs[(n, im, i)].transpose((2, 0, 1))
+ if "depth_gt" in imgs:
+ imgs['depth_gt'] = np.array(imgs['depth_gt']).astype('float32')
+
+ results['imgs'] = imgs
+ return results
diff --git a/paddlevideo/loader/pipelines/augmentations_ava.py b/paddlevideo/loader/pipelines/augmentations_ava.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7cbe3c39a4463f9ace01d8dd611b8447e48a472
--- /dev/null
+++ b/paddlevideo/loader/pipelines/augmentations_ava.py
@@ -0,0 +1,730 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import numpy as np
+import math
+from PIL import Image
+from ..registry import PIPELINES
+from collections.abc import Sequence
+import cv2
+
+pillow_interp_codes = {
+ 'nearest': Image.NEAREST,
+ 'bilinear': Image.BILINEAR,
+ 'bicubic': Image.BICUBIC,
+ 'box': Image.BOX,
+ 'lanczos': Image.LANCZOS,
+ 'hamming': Image.HAMMING
+}
+
+cv2_interp_codes = {
+ 'nearest': cv2.INTER_NEAREST,
+ 'bilinear': cv2.INTER_LINEAR,
+ 'bicubic': cv2.INTER_CUBIC,
+ 'area': cv2.INTER_AREA,
+ 'lanczos': cv2.INTER_LANCZOS4
+}
+
+def _init_lazy_if_proper(results, lazy):
+ """Initialize lazy operation properly.
+
+ Make sure that a lazy operation is properly initialized,
+ and avoid a non-lazy operation accidentally getting mixed in.
+
+ Required keys in results are "imgs" if "img_shape" not in results,
+ otherwise, Required keys in results are "img_shape", add or modified keys
+ are "img_shape", "lazy".
+ Add or modified keys in "lazy" are "original_shape", "crop_bbox", "flip",
+ "flip_direction", "interpolation".
+
+ Args:
+ results (dict): A dict stores data pipeline result.
+ lazy (bool): Determine whether to apply lazy operation. Default: False.
+ """
+
+ if 'img_shape' not in results:
+ results['img_shape'] = results['imgs'][0].shape[:2]
+ if lazy:
+ if 'lazy' not in results:
+ img_h, img_w = results['img_shape']
+ lazyop = dict()
+ lazyop['original_shape'] = results['img_shape']
+ lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h],
+ dtype=np.float32)
+ lazyop['flip'] = False
+ lazyop['flip_direction'] = None
+ lazyop['interpolation'] = None
+ results['lazy'] = lazyop
+ else:
+ assert 'lazy' not in results, 'Use Fuse after lazy operations'
+
+def _scale_size(size, scale):
+ """Rescale a size by a ratio.
+
+ Args:
+ size (tuple[int]): (w, h).
+ scale (float): Scaling factor.
+
+ Returns:
+ tuple[int]: scaled size.
+ """
+ w, h = size
+ return int(w * float(scale) + 0.5), int(h * float(scale) + 0.5)
+
+def rescale_size(old_size, scale, return_scale=False):
+ """Calculate the new size to be rescaled to.
+
+ Args:
+ old_size (tuple[int]): The old size (w, h) of image.
+ scale (float | tuple[int]): The scaling factor or maximum size.
+ If it is a float number, then the image will be rescaled by this
+ factor, else if it is a tuple of 2 integers, then the image will
+ be rescaled as large as possible within the scale.
+ return_scale (bool): Whether to return the scaling factor besides the
+ rescaled image size.
+
+ Returns:
+ tuple[int]: The new rescaled image size.
+ """
+ w, h = old_size
+ if isinstance(scale, (float, int)):
+ if scale <= 0:
+ raise ValueError(f'Invalid scale {scale}, must be positive.')
+ scale_factor = scale
+ elif isinstance(scale, tuple):
+ max_long_edge = max(scale)
+ max_short_edge = min(scale)
+ scale_factor = min(max_long_edge / max(h, w),
+ max_short_edge / min(h, w))
+ else:
+ raise TypeError(
+ f'Scale must be a number or tuple of int, but got {type(scale)}')
+
+ new_size = _scale_size((w, h), scale_factor)
+
+ if return_scale:
+ return new_size, scale_factor
+ else:
+ return new_size
+
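+# Worked example (illustrative only): capping the short edge at 256 while
+# keeping the aspect ratio,
+#   >>> rescale_size((1280, 720), (np.inf, 256))
+#   (455, 256)
+# since scale_factor = min(inf / 1280, 256 / 720) ~= 0.356 and each side is
+# rounded via int(x * scale + 0.5).
+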
+def imresize(img,
+ size,
+ return_scale=False,
+ interpolation='bilinear',
+ out=None,
+ backend=None):
+ """Resize image to a given size. """
+ h, w = img.shape[:2]
+ if backend is None:
+ backend = 'cv2'
+ if backend not in ['cv2', 'pillow']:
+ raise ValueError(f'backend: {backend} is not supported for resize.'
+ f"Supported backends are 'cv2', 'pillow'")
+
+ if backend == 'pillow':
+ assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
+ pil_image = Image.fromarray(img)
+ pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])
+ resized_img = np.array(pil_image)
+ else:
+ resized_img = cv2.resize(
+ img, size, dst=out, interpolation=cv2_interp_codes[interpolation])
+ if not return_scale:
+ return resized_img
+ else:
+ w_scale = size[0] / w
+ h_scale = size[1] / h
+ return resized_img, w_scale, h_scale
+
+@PIPELINES.register()
+class EntityBoxRescale:
+ """Rescale the entity box and proposals according to the image shape.
+
+    Required keys are "proposals" and "gt_bboxes"; added or modified keys are
+    "gt_bboxes". If the original "proposals" is not None, "proposals" will
+    also be modified.
+
+ Args:
+ scale_factor (np.ndarray): The scale factor used entity_box rescaling.
+ """
+
+ def __init__(self, scale_factor):
+ self.scale_factor = scale_factor
+
+ def __call__(self, results):
+ scale_factor = np.concatenate([self.scale_factor, self.scale_factor])
+
+ if 'gt_bboxes' in results:
+ gt_bboxes = results['gt_bboxes']
+ results['gt_bboxes'] = gt_bboxes * scale_factor
+
+ if 'proposals' in results:
+ proposals = results['proposals']
+ if proposals is not None:
+ assert proposals.shape[1] == 4, (
+ 'proposals shape should be in '
+ f'(n, 4), but got {proposals.shape}')
+ results['proposals'] = proposals * scale_factor
+
+ return results
+
+ def __repr__(self):
+ return f'{self.__class__.__name__}(scale_factor={self.scale_factor})'
+
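+# Worked example (illustrative only): a scale_factor of np.array([0.5, 0.5]) is
+# tiled to [0.5, 0.5, 0.5, 0.5], so a box [x1, y1, x2, y2] = [20., 40., 120., 200.]
+# becomes [10., 20., 60., 100.], i.e. boxes and proposals follow the same (w, h)
+# scaling as the resized frames.
+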
+@PIPELINES.register()
+class EntityBoxCrop:
+ """Crop the entity boxes and proposals according to the cropped images.
+
+ Required keys are "proposals", "gt_bboxes", added or modified keys are
+ "gt_bboxes". If original "proposals" is not None, "proposals" will be
+ modified.
+
+ Args:
+ crop_bbox(np.ndarray | None): The bbox used to crop the original image.
+ """
+
+ def __init__(self, crop_bbox):
+ self.crop_bbox = crop_bbox
+
+ def __call__(self, results):
+ proposals = results['proposals']
+ gt_bboxes = results['gt_bboxes']
+
+ if self.crop_bbox is None:
+ return results
+
+ x1, y1, x2, y2 = self.crop_bbox
+ img_w, img_h = x2 - x1, y2 - y1
+
+ assert gt_bboxes.shape[-1] == 4
+ gt_bboxes_ = gt_bboxes.copy()
+ gt_bboxes_[..., 0::2] = np.clip(gt_bboxes[..., 0::2] - x1, 0, img_w - 1)
+ gt_bboxes_[..., 1::2] = np.clip(gt_bboxes[..., 1::2] - y1, 0, img_h - 1)
+ results['gt_bboxes'] = gt_bboxes_
+
+ if proposals is not None:
+ assert proposals.shape[-1] == 4
+ proposals_ = proposals.copy()
+ proposals_[..., 0::2] = np.clip(proposals[..., 0::2] - x1, 0, img_w - 1)
+ proposals_[..., 1::2] = np.clip(proposals[..., 1::2] - y1, 0, img_h - 1)
+ results['proposals'] = proposals_
+ return results
+
+ def __repr__(self):
+ return f'{self.__class__.__name__}(crop_bbox={self.crop_bbox})'
+
+@PIPELINES.register()
+class EntityBoxFlip:
+    """Flip the entity boxes and proposals horizontally.
+
+ Reverse the order of elements in the given bounding boxes and proposals
+ with a specific direction. The shape of them are preserved, but the
+ elements are reordered. Only the horizontal flip is supported (seems
+ vertical flipping makes no sense). Required keys are "proposals",
+ "gt_bboxes", added or modified keys are "gt_bboxes". If "proposals"
+ is not None, it will also be modified.
+
+ Args:
+ img_shape (tuple[int]): The img shape.
+ """
+
+ def __init__(self, img_shape):
+ self.img_shape = img_shape
+
+ def __call__(self, results):
+ proposals = results['proposals']
+ gt_bboxes = results['gt_bboxes']
+ img_h, img_w = self.img_shape
+
+ assert gt_bboxes.shape[-1] == 4
+ gt_bboxes_ = gt_bboxes.copy()
+ gt_bboxes_[..., 0::4] = img_w - gt_bboxes[..., 2::4] - 1
+ gt_bboxes_[..., 2::4] = img_w - gt_bboxes[..., 0::4] - 1
+ if proposals is not None:
+ assert proposals.shape[-1] == 4
+ proposals_ = proposals.copy()
+ proposals_[..., 0::4] = img_w - proposals[..., 2::4] - 1
+ proposals_[..., 2::4] = img_w - proposals[..., 0::4] - 1
+ else:
+ proposals_ = None
+
+ results['proposals'] = proposals_
+ results['gt_bboxes'] = gt_bboxes_
+
+ return results
+
+ def __repr__(self):
+ repr_str = f'{self.__class__.__name__}(img_shape={self.img_shape})'
+ return repr_str
+
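+# Worked example (illustrative only): with img_shape = (256, 340) (h, w), a box
+# [20., 40., 120., 200.] flips horizontally to
+#   [340 - 120 - 1, 40., 340 - 20 - 1, 200.] = [219., 40., 319., 200.]
+# while the y coordinates are left untouched, since only horizontal flipping is
+# supported.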
+
+@PIPELINES.register()
+class Resize:
+ """Resize images to a specific size.
+
+ Required keys are "imgs", "img_shape", "modality", added or modified
+ keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "lazy",
+ "resize_size". Required keys in "lazy" is None, added or modified key is
+ "interpolation".
+
+ Args:
+ scale (float | Tuple[int]): If keep_ratio is True, it serves as scaling
+ factor or maximum size:
+ If it is a float number, the image will be rescaled by this
+ factor, else if it is a tuple of 2 integers, the image will
+ be rescaled as large as possible within the scale.
+ Otherwise, it serves as (w, h) of output size.
+ keep_ratio (bool): If set to True, Images will be resized without
+ changing the aspect ratio. Otherwise, it will resize images to a
+ given size. Default: True.
+ interpolation (str): Algorithm used for interpolation:
+ "nearest" | "bilinear". Default: "bilinear".
+ lazy (bool): Determine whether to apply lazy operation. Default: False.
+ """
+
+ def __init__(self,
+ scale,
+ keep_ratio=True,
+ interpolation='bilinear',
+ lazy=False):
+ if isinstance(scale, float):
+ if scale <= 0:
+ raise ValueError(f'Invalid scale {scale}, must be positive.')
+ elif isinstance(scale, tuple):
+ max_long_edge = max(scale)
+ max_short_edge = min(scale)
+ if max_short_edge == -1:
+ # assign np.inf to long edge for rescaling short edge later.
+ scale = (np.inf, max_long_edge)
+ else:
+ raise TypeError(
+ f'Scale must be float or tuple of int, but got {type(scale)}')
+ self.scale = scale
+ self.keep_ratio = keep_ratio
+ self.interpolation = interpolation
+ self.lazy = lazy
+
+ def __call__(self, results):
+ """Performs the Resize augmentation.
+
+ Args:
+ results (dict): The resulting dict to be modified and passed
+ to the next transform in pipeline.
+ """
+
+ _init_lazy_if_proper(results, self.lazy)
+
+ if 'scale_factor' not in results:
+ results['scale_factor'] = np.array([1, 1], dtype=np.float32)
+ img_h, img_w = results['img_shape']
+
+ if self.keep_ratio:
+ new_w, new_h = rescale_size((img_w, img_h), self.scale)
+ else:
+ new_w, new_h = self.scale
+
+ self.scale_factor = np.array([new_w / img_w, new_h / img_h],
+ dtype=np.float32)
+ results['img_shape'] = (new_h, new_w)
+ results['keep_ratio'] = self.keep_ratio
+ results['scale_factor'] = results['scale_factor'] * self.scale_factor
+
+
+ if not self.lazy:
+ results['imgs'] = [
+ imresize(
+ img, (new_w, new_h), interpolation=self.interpolation)
+ for img in results['imgs']
+ ]
+ else:
+ lazyop = results['lazy']
+ if lazyop['flip']:
+ raise NotImplementedError('Put Flip at last for now')
+ lazyop['interpolation'] = self.interpolation
+
+ #if 'gt_bboxes' in results:
+ assert not self.lazy
+ entity_box_rescale = EntityBoxRescale(self.scale_factor)
+ results = entity_box_rescale(results)
+
+ return results
+
+ def __repr__(self):
+ repr_str = (f'{self.__class__.__name__}('
+ f'scale={self.scale}, keep_ratio={self.keep_ratio}, '
+ f'interpolation={self.interpolation}, '
+ f'lazy={self.lazy})')
+ return repr_str
+
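+# Usage sketch (illustrative only, assuming an AVA-style config): the short
+# edge is typically rescaled to a fixed size while keeping the aspect ratio:
+#   >>> resize = Resize(scale=(-1, 256), keep_ratio=True)   # -1 becomes np.inf internally
+#   >>> results = resize(results)   # 'results' must hold 'imgs' (and optionally 'img_shape')
+# With keep_ratio=False, scale is instead interpreted directly as the (w, h)
+# output size.
+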
+@PIPELINES.register()
+class RandomRescale:
+ """Randomly resize images so that the short_edge is resized to a specific
+ size in a given range. The scale ratio is unchanged after resizing.
+ """
+
+ def __init__(self, scale_range, interpolation='bilinear'):
+ scale_range = eval(scale_range)
+ self.scale_range = scale_range
+
+ assert len(scale_range) == 2
+ assert scale_range[0] < scale_range[1]
+ assert np.all([x > 0 for x in scale_range])
+
+ self.keep_ratio = True
+ self.interpolation = interpolation
+
+ def __call__(self, results):
+ """Performs the Resize augmentation.
+
+ Args:
+ results (dict): The resulting dict to be modified and passed
+ to the next transform in pipeline.
+ """
+ short_edge = np.random.randint(self.scale_range[0],
+ self.scale_range[1] + 1)
+ resize = Resize((-1, short_edge),
+ keep_ratio=True,
+ interpolation=self.interpolation,
+ lazy=False)
+ results = resize(results)
+
+ results['short_edge'] = short_edge
+ return results
+
+ def __repr__(self):
+ scale_range = self.scale_range
+ repr_str = (f'{self.__class__.__name__}('
+ f'scale_range=({scale_range[0]}, {scale_range[1]}), '
+ f'interpolation={self.interpolation})')
+ return repr_str
+
+@PIPELINES.register()
+class Rescale:
+    """Resize images so that the short edge is rescaled to a specific size.
+    The aspect ratio is unchanged after resizing.
+
+ Required keys are "imgs", "img_shape", "modality", added or modified
+ keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "resize_size",
+ "short_edge".
+
+ Args:
+ scale_range (tuple[int]): The range of short edge length. A closed
+ interval.
+ interpolation (str): Algorithm used for interpolation:
+ "nearest" | "bilinear". Default: "bilinear".
+ """
+
+ def __init__(self, scale_range, interpolation='bilinear'):
+ scale_range = eval(scale_range)
+ self.scale_range = scale_range
+
+ self.keep_ratio = True
+ self.interpolation = interpolation
+
+ def __call__(self, results):
+ """Performs the Resize augmentation.
+
+ Args:
+ results (dict): The resulting dict to be modified and passed
+ to the next transform in pipeline.
+ """
+ resize = Resize(self.scale_range,
+ keep_ratio=True,
+ interpolation=self.interpolation,
+ lazy=False)
+ results = resize(results)
+ return results
+
+ def __repr__(self):
+ scale_range = self.scale_range
+ repr_str = (f'{self.__class__.__name__}('
+ f'scale_range=({scale_range[0]}, {scale_range[1]}), '
+ f'interpolation={self.interpolation})')
+ return repr_str
+
+
+@PIPELINES.register()
+class RandomCrop_v2:
+    """Vanilla square random crop that specifies the output size.
+
+ Required keys in results are "imgs" and "img_shape", added or
+ modified keys are "imgs", "lazy"; Required keys in "lazy" are "flip",
+ "crop_bbox", added or modified key is "crop_bbox".
+
+ Args:
+ size (int): The output size of the images.
+ lazy (bool): Determine whether to apply lazy operation. Default: False.
+ """
+
+ def __init__(self, size, lazy=False):
+ if not isinstance(size, int):
+ raise TypeError(f'Size must be an int, but got {type(size)}')
+ self.size = size
+ self.lazy = lazy
+
+ def __call__(self, results):
+ """Performs the RandomCrop augmentation.
+
+ Args:
+ results (dict): The resulting dict to be modified and passed
+ to the next transform in pipeline.
+ """
+ _init_lazy_if_proper(results, self.lazy)
+
+ img_h, img_w = results['img_shape']
+ assert self.size <= img_h and self.size <= img_w
+
+ y_offset = 0
+ x_offset = 0
+ if img_h > self.size:
+ y_offset = int(np.random.randint(0, img_h - self.size))
+ if img_w > self.size:
+ x_offset = int(np.random.randint(0, img_w - self.size))
+ if 'crop_quadruple' not in results:
+ results['crop_quadruple'] = np.array(
+ [0, 0, 1, 1], # x, y, w, h
+ dtype=np.float32)
+
+ x_ratio, y_ratio = x_offset / img_w, y_offset / img_h
+ w_ratio, h_ratio = self.size / img_w, self.size / img_h
+
+ old_crop_quadruple = results['crop_quadruple']
+ old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1]
+ old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3]
+        new_crop_quadruple = [
+            old_x_ratio + x_ratio * old_w_ratio,
+            old_y_ratio + y_ratio * old_h_ratio,
+            w_ratio * old_w_ratio,
+            h_ratio * old_h_ratio  # compose the height ratio with the previous crop's height ratio
+        ]
+        results['crop_quadruple'] = np.array(new_crop_quadruple,
+                                             dtype=np.float32)
+
+        new_h, new_w = self.size, self.size
+
+        results['crop_bbox'] = np.array(
+            [x_offset, y_offset, x_offset + new_w, y_offset + new_h])
+ results['img_shape'] = (new_h, new_w)
+
+ if not self.lazy:
+ results['imgs'] = [
+ img[y_offset:y_offset + new_h, x_offset:x_offset + new_w]
+ for img in results['imgs']
+ ]
+ else:
+ lazyop = results['lazy']
+ if lazyop['flip']:
+ raise NotImplementedError('Put Flip at last for now')
+
+ # record crop_bbox in lazyop dict to ensure only crop once in Fuse
+ lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox']
+ left = x_offset * (lazy_right - lazy_left) / img_w
+ right = (x_offset + new_w) * (lazy_right - lazy_left) / img_w
+ top = y_offset * (lazy_bottom - lazy_top) / img_h
+ bottom = (y_offset + new_h) * (lazy_bottom - lazy_top) / img_h
+ lazyop['crop_bbox'] = np.array([(lazy_left + left),
+ (lazy_top + top),
+ (lazy_left + right),
+ (lazy_top + bottom)],
+ dtype=np.float32)
+
+ # Process entity boxes
+ if 'gt_bboxes' in results:
+ assert not self.lazy
+ entity_box_crop = EntityBoxCrop(results['crop_bbox'])
+ results = entity_box_crop(results)
+
+ return results
+
+ def __repr__(self):
+ repr_str = (f'{self.__class__.__name__}(size={self.size}, '
+ f'lazy={self.lazy})')
+ return repr_str
+
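+# Worked example (illustrative only) of the crop_quadruple bookkeeping in
+# RandomCrop_v2: starting from [0, 0, 1, 1] (x, y, w, h as ratios of the full
+# frame), a 224 crop at offset (32, 16) from a 340x256 (WxH) image yields
+#   x_ratio, y_ratio = 32 / 340, 16 / 256      # ~0.094, 0.0625
+#   w_ratio, h_ratio = 224 / 340, 224 / 256    # ~0.659, 0.875
+#   crop_quadruple   = [0.094, 0.0625, 0.659, 0.875]
+# and any later crop composes its ratios with these instead of the full image.
+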
+def imflip_(img, direction='horizontal'):
+ """Inplace flip an image horizontally or vertically.
+
+ Args:
+ img (ndarray): Image to be flipped.
+ direction (str): The flip direction, either "horizontal" or
+ "vertical" or "diagonal".
+
+ Returns:
+ ndarray: The flipped image (inplace).
+ """
+ assert direction in ['horizontal', 'vertical', 'diagonal']
+ if direction == 'horizontal':
+ return cv2.flip(img, 1, img)
+ elif direction == 'vertical':
+ return cv2.flip(img, 0, img)
+ else:
+ return cv2.flip(img, -1, img)
+
+def iminvert(img):
+ """Invert (negate) an image.
+
+ Args:
+ img (ndarray): Image to be inverted.
+
+ Returns:
+ ndarray: The inverted image.
+ """
+ return np.full_like(img, 255) - img
+
+@PIPELINES.register()
+class Flip:
+ """Flip the input images with a probability.
+
+ Reverse the order of elements in the given imgs with a specific direction.
+ The shape of the imgs is preserved, but the elements are reordered.
+ Required keys are "imgs", "img_shape", "modality", added or modified
+ keys are "imgs", "lazy" and "flip_direction". Required keys in "lazy" is
+ None, added or modified key are "flip" and "flip_direction". The Flip
+ augmentation should be placed after any cropping / reshaping augmentations,
+ to make sure crop_quadruple is calculated properly.
+
+ Args:
+ flip_ratio (float): Probability of implementing flip. Default: 0.5.
+ direction (str): Flip imgs horizontally or vertically. Options are
+ "horizontal" | "vertical". Default: "horizontal".
+ lazy (bool): Determine whether to apply lazy operation. Default: False.
+ """
+ _directions = ['horizontal', 'vertical']
+
+ def __init__(self, flip_ratio=0.5, direction='horizontal', lazy=False):
+ if direction not in self._directions:
+ raise ValueError(f'Direction {direction} is not supported. '
+ f'Currently support ones are {self._directions}')
+ self.flip_ratio = flip_ratio
+ self.direction = direction
+ self.lazy = lazy
+
+ def __call__(self, results):
+ """Performs the Flip augmentation.
+
+ Args:
+ results (dict): The resulting dict to be modified and passed
+ to the next transform in pipeline.
+ """
+ _init_lazy_if_proper(results, self.lazy)
+ flip = np.random.rand() < self.flip_ratio
+
+ results['flip'] = flip
+ results['flip_direction'] = self.direction
+
+ if not self.lazy:
+ if flip:
+ for i, img in enumerate(results['imgs']):
+ imflip_(img, self.direction)
+ lt = len(results['imgs'])
+ else:
+ results['imgs'] = list(results['imgs'])
+ else:
+ lazyop = results['lazy']
+ if lazyop['flip']:
+ raise NotImplementedError('Use one Flip please')
+ lazyop['flip'] = flip
+ lazyop['flip_direction'] = self.direction
+
+ if 'gt_bboxes' in results and flip:
+ assert not self.lazy and self.direction == 'horizontal'
+ entity_box_flip = EntityBoxFlip(results['img_shape'])
+ results = entity_box_flip(results)
+
+ return results
+
+ def __repr__(self):
+ repr_str = (
+ f'{self.__class__.__name__}('
+ f'flip_ratio={self.flip_ratio}, direction={self.direction}, '
+ f'lazy={self.lazy})')
+ return repr_str
+
+def imnormalize_(img, mean, std, to_rgb=True):
+ """Inplace normalize an image with mean and std.
+
+ Args:
+ img (ndarray): Image to be normalized.
+ mean (ndarray): The mean to be used for normalize.
+ std (ndarray): The std to be used for normalize.
+ to_rgb (bool): Whether to convert to rgb.
+
+ Returns:
+ ndarray: The normalized image.
+ """
+ # cv2 inplace normalization does not accept uint8
+ assert img.dtype != np.uint8
+ mean = np.float64(mean.reshape(1, -1))
+ stdinv = 1 / np.float64(std.reshape(1, -1))
+ if to_rgb:
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace
+ cv2.subtract(img, mean, img) # inplace
+ cv2.multiply(img, stdinv, img) # inplace
+ return img
+
+@PIPELINES.register()
+class Normalize:
+ """Normalize images with the given mean and std value.
+
+ Required keys are "imgs", "img_shape", "modality", added or modified
+    keys are "imgs" and "img_norm_cfg". If modality is 'Flow', the additional
+    key "scale_factor" is required.
+
+ Args:
+ mean (Sequence[float]): Mean values of different channels.
+ std (Sequence[float]): Std values of different channels.
+ to_bgr (bool): Whether to convert channels from RGB to BGR.
+ Default: False.
+ adjust_magnitude (bool): Indicate whether to adjust the flow magnitude
+ on 'scale_factor' when modality is 'Flow'. Default: False.
+ """
+
+ def __init__(self, mean, std, to_bgr=False, adjust_magnitude=False):
+ if not isinstance(mean, Sequence):
+ raise TypeError(
+ f'Mean must be list, tuple or np.ndarray, but got {type(mean)}'
+ )
+
+ if not isinstance(std, Sequence):
+ raise TypeError(
+ f'Std must be list, tuple or np.ndarray, but got {type(std)}')
+
+ self.mean = np.array(mean, dtype=np.float32)
+ self.std = np.array(std, dtype=np.float32)
+ self.to_bgr = to_bgr
+ self.adjust_magnitude = adjust_magnitude
+
+ def __call__(self, results):
+ n = len(results['imgs'])
+ h, w, c = results['imgs'][0].shape
+ imgs = np.empty((n, h, w, c), dtype=np.float32)
+ for i, img in enumerate(results['imgs']):
+ imgs[i] = img
+
+ for img in imgs:
+ imnormalize_(img, self.mean, self.std, self.to_bgr)
+
+ results['imgs'] = imgs
+ results['img_norm_cfg'] = dict(
+ mean=self.mean, std=self.std, to_bgr=self.to_bgr)
+
+ return results
+
+ def __repr__(self):
+ repr_str = (f'{self.__class__.__name__}('
+ f'mean={self.mean}, '
+ f'std={self.std}, '
+ f'to_bgr={self.to_bgr}, '
+ f'adjust_magnitude={self.adjust_magnitude})')
+ return repr_str
+
+
diff --git a/paddlevideo/loader/pipelines/compose.py b/paddlevideo/loader/pipelines/compose.py
new file mode 100644
index 0000000000000000000000000000000000000000..76eb4ed4d436f692a25081dbe8efe9a9b9a11102
--- /dev/null
+++ b/paddlevideo/loader/pipelines/compose.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections.abc import Sequence
+from ..registry import PIPELINES
+import traceback
+from ...utils import build
+from ...utils import get_logger
+
+
+@PIPELINES.register()
+class Compose(object):
+ """
+ Composes several pipelines(include decode func, sample func, and transforms) together.
+
+    Note: To deal with the ```list``` type cfg temporarily, like:
+
+ transform:
+ - Crop: # A list
+ attribute: 10
+ - Resize: # A list
+ attribute: 20
+
+    every key in the list is passed as the module name when building the module.
+ XXX: will be improved in the future.
+
+ Args:
+ pipelines (list): List of transforms to compose.
+ Returns:
+        A Compose object which is callable; calling it applies each of the
+        given :attr:`pipelines` sequentially.
+ """
+ def __init__(self, pipelines):
+ #assert isinstance(pipelines, Sequence)
+ self.pipelines = []
+ for p in pipelines.values():
+ if isinstance(p, dict):
+ p = build(p, PIPELINES)
+ self.pipelines.append(p)
+ elif isinstance(p, list):
+ for t in p:
+ #XXX: to deal with old format cfg, ugly code here!
+ temp_dict = dict(name=list(t.keys())[0])
+ for all_sub_t in t.values():
+ if all_sub_t is not None:
+ temp_dict.update(all_sub_t)
+
+ t = build(temp_dict, PIPELINES)
+ self.pipelines.append(t)
+ elif callable(p):
+ self.pipelines.append(p)
+ else:
+                raise TypeError(f'pipeline item must be callable, a dict or '
+                                f'a list, but got {type(p)}')
+
+ def __call__(self, data):
+ for p in self.pipelines:
+ try:
+ data = p(data)
+ except Exception as e:
+ stack_info = traceback.format_exc()
+ logger = get_logger("paddlevideo")
+                logger.info("Failed to perform transform [{}] with error: "
+                            "{} and stack:\n{}".format(p, e, str(stack_info)))
+ raise e
+ return data
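+
+# Usage sketch (illustrative only; the pipeline names below are assumptions):
+# a cfg passed to Compose is a dict of sections, each either a dict with a
+# 'name' key or the legacy list form handled above, e.g.
+#   >>> cfg = {'decode': {'name': 'VideoDecoder'},
+#   ...        'transform': [{'Scale': {'short_size': 256}}]}
+#   >>> pipeline = Compose(cfg)
+#   >>> sample = pipeline({'filename': 'example.mp4'})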
diff --git a/paddlevideo/loader/pipelines/decode.py b/paddlevideo/loader/pipelines/decode.py
new file mode 100644
index 0000000000000000000000000000000000000000..478ea04e513859c939ba16a3b93292f9895bf08a
--- /dev/null
+++ b/paddlevideo/loader/pipelines/decode.py
@@ -0,0 +1,285 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import av
+import cv2
+import pickle
+import decord as de
+import math
+import random
+from ..registry import PIPELINES
+
+
+def get_start_end_idx(video_size, clip_size, clip_idx, num_clips):
+ delta = max(video_size - clip_size, 0)
+ if clip_idx == -1: # here
+ # Random temporal sampling.
+ start_idx = random.uniform(0, delta)
+ else: # ignore
+ # Uniformly sample the clip with the given index.
+ start_idx = delta * clip_idx / num_clips
+ end_idx = start_idx + clip_size - 1
+ return start_idx, end_idx
+
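+# Worked example (illustrative only): for a 300-frame video with native fps 30,
+# sampling_rate=32, num_seg=8 and target_fps=30, the clip spans
+# 32 * 8 / 30 * 30 = 256 frames, so delta = 300 - 256 = 44; during training
+# (clip_idx == -1) start_idx is drawn uniformly from [0, 44], while at test time
+# it is placed deterministically at delta * clip_idx / num_clips.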
+
+@PIPELINES.register()
+class VideoDecoder(object):
+ """
+    Decode mp4 file to frames.
+    Args:
+        backend(str): decoding backend, one of 'cv2', 'decord' or 'pyav'.
+        mode, sampling_rate, num_seg, num_clips, target_fps: only used by the 'pyav' backend (TimeSformer).
+ """
+ def __init__(self,
+ backend='cv2',
+ mode='train',
+ sampling_rate=32,
+ num_seg=8,
+ num_clips=1,
+ target_fps=30):
+
+ self.backend = backend
+ # params below only for TimeSformer
+ self.mode = mode
+ self.sampling_rate = sampling_rate
+ self.num_seg = num_seg
+ self.num_clips = num_clips
+ self.target_fps = target_fps
+
+ def __call__(self, results):
+ """
+ Perform mp4 decode operations.
+ return:
+            List where each item is a numpy array after decoding.
+ """
+ file_path = results['filename']
+ results['format'] = 'video'
+ results['backend'] = self.backend
+
+ if self.backend == 'cv2':
+ cap = cv2.VideoCapture(file_path)
+ videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+ sampledFrames = []
+ for i in range(videolen):
+ ret, frame = cap.read()
+                # the frame may fail to decode (e.g. an empty first frame); skip it
+                if not ret:
+                    continue
+ img = frame[:, :, ::-1]
+ sampledFrames.append(img)
+ results['frames'] = sampledFrames
+ results['frames_len'] = len(sampledFrames)
+
+ elif self.backend == 'decord':
+ container = de.VideoReader(file_path)
+ frames_len = len(container)
+ results['frames'] = container
+ results['frames_len'] = frames_len
+
+ elif self.backend == 'pyav': # for TimeSformer
+ if self.mode in ["train", "valid"]:
+ clip_idx = -1
+ elif self.mode in ["test"]:
+ clip_idx = 0
+ else:
+ raise NotImplementedError
+
+ container = av.open(file_path)
+
+ num_clips = 1 # always be 1
+
+ # decode process
+ fps = float(container.streams.video[0].average_rate)
+
+ frames_length = container.streams.video[0].frames
+ duration = container.streams.video[0].duration
+
+ if duration is None:
+ # If failed to fetch the decoding information, decode the entire video.
+ decode_all_video = True
+ video_start_pts, video_end_pts = 0, math.inf
+ else:
+ decode_all_video = False
+ start_idx, end_idx = get_start_end_idx(
+ frames_length,
+ self.sampling_rate * self.num_seg / self.target_fps * fps,
+ clip_idx, num_clips)
+ timebase = duration / frames_length
+ video_start_pts = int(start_idx * timebase)
+ video_end_pts = int(end_idx * timebase)
+
+ frames = None
+ # If video stream was found, fetch video frames from the video.
+ if container.streams.video:
+ margin = 1024
+ seek_offset = max(video_start_pts - margin, 0)
+
+ container.seek(seek_offset,
+ any_frame=False,
+ backward=True,
+ stream=container.streams.video[0])
+ tmp_frames = {}
+ buffer_count = 0
+ max_pts = 0
+ for frame in container.decode(**{"video": 0}):
+ max_pts = max(max_pts, frame.pts)
+ if frame.pts < video_start_pts:
+ continue
+ if frame.pts <= video_end_pts:
+ tmp_frames[frame.pts] = frame
+ else:
+ buffer_count += 1
+ tmp_frames[frame.pts] = frame
+ if buffer_count >= 0:
+ break
+ video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)]
+
+ container.close()
+
+ frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
+ clip_sz = self.sampling_rate * self.num_seg / self.target_fps * fps
+
+ start_idx, end_idx = get_start_end_idx(
+ len(frames), # frame_len
+ clip_sz,
+ clip_idx if decode_all_video else
+                0,  # If decoding the whole video: clip_idx is -1 in train/valid and 0 in test;
+                # otherwise always 0, since clip-size frames were already selected during decoding.
+ 1)
+ results['frames'] = frames
+ results['frames_len'] = len(frames)
+ results['start_idx'] = start_idx
+ results['end_idx'] = end_idx
+ else:
+ raise NotImplementedError
+ return results
+
+
+@PIPELINES.register()
+class FrameDecoder(object):
+ """just parse results
+ """
+ def __init__(self):
+ pass
+
+ def __call__(self, results):
+ results['format'] = 'frame'
+ return results
+
+
+@PIPELINES.register()
+class MRIDecoder(object):
+ """just parse results
+ """
+ def __init__(self):
+ pass
+
+ def __call__(self, results):
+ results['format'] = 'MRI'
+ return results
+
+
+@PIPELINES.register()
+class FeatureDecoder(object):
+ """
+ Perform feature decode operations.e.g.youtube8m
+ """
+ def __init__(self, num_classes, max_len=512, has_label=True):
+ self.max_len = max_len
+ self.num_classes = num_classes
+ self.has_label = has_label
+
+ def __call__(self, results):
+ """
+ Perform feature decode operations.
+ return:
+ List where each item is a numpy array after decoder.
+ """
+ #1. load pkl
+ #2. parse to rgb/audio/
+ #3. padding
+
+ filepath = results['filename']
+ data = pickle.load(open(filepath, 'rb'), encoding='bytes')
+
+ record = data
+ nframes = record['nframes'] if 'nframes' in record else record[
+ b'nframes']
+ rgb = record['feature'].astype(
+ float) if 'feature' in record else record[b'feature'].astype(float)
+ audio = record['audio'].astype(
+ float) if 'audio' in record else record[b'audio'].astype(float)
+ if self.has_label:
+ label = record['label'] if 'label' in record else record[b'label']
+ one_hot_label = self.make_one_hot(label, self.num_classes)
+
+ rgb = rgb[0:nframes, :]
+ audio = audio[0:nframes, :]
+
+ rgb = self.dequantize(rgb,
+ max_quantized_value=2.,
+ min_quantized_value=-2.)
+ audio = self.dequantize(audio,
+ max_quantized_value=2,
+ min_quantized_value=-2)
+
+ if self.has_label:
+ results['labels'] = one_hot_label.astype("float32")
+
+ feat_pad_list = []
+ feat_len_list = []
+ mask_list = []
+ vitem = [rgb, audio]
+ for vi in range(2): #rgb and audio
+ if vi == 0:
+ prefix = "rgb_"
+ else:
+ prefix = "audio_"
+ feat = vitem[vi]
+ results[prefix + 'len'] = feat.shape[0]
+ #feat pad step 1. padding
+ feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),
+ dtype=np.float32)
+ feat_pad = np.concatenate((feat, feat_add), axis=0)
+ results[prefix + 'data'] = feat_pad.astype("float32")
+ #feat pad step 2. mask
+ feat_mask_origin = np.ones(feat.shape, dtype=np.float32)
+ feat_mask_add = feat_add
+ feat_mask = np.concatenate((feat_mask_origin, feat_mask_add),
+ axis=0)
+ results[prefix + 'mask'] = feat_mask.astype("float32")
+
+ return results
+
+ def dequantize(self,
+ feat_vector,
+ max_quantized_value=2.,
+ min_quantized_value=-2.):
+ """
+ Dequantize the feature from the byte format to the float format
+ """
+
+ assert max_quantized_value > min_quantized_value
+ quantized_range = max_quantized_value - min_quantized_value
+ scalar = quantized_range / 255.0
+ bias = (quantized_range / 512.0) + min_quantized_value
+
+ return feat_vector * scalar + bias
+
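+    # Worked example (illustrative only): with the default range [-2., 2.],
+    #   scalar = 4. / 255 ~= 0.0157 and bias = 4. / 512 - 2. = -1.9921875,
+    # so a quantized byte of 0 maps to ~-1.992 and 255 maps to ~2.008, placing
+    # each byte at the centre of its quantization bin.
+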
+ def make_one_hot(self, label, dim=3862):
+ one_hot_label = np.zeros(dim)
+ one_hot_label = one_hot_label.astype(float)
+ for ind in label:
+ one_hot_label[int(ind)] = 1
+ return one_hot_label
diff --git a/paddlevideo/loader/pipelines/decode_image.py b/paddlevideo/loader/pipelines/decode_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cc01e9039c24f5fd89419669c0ea5dc8375fa36
--- /dev/null
+++ b/paddlevideo/loader/pipelines/decode_image.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import numpy as np
+import PIL.Image as pil
+
+try:
+ import skimage.transform
+except ImportError as e:
+ print(
+        f"{e}, the [scikit-image] package and its dependencies are required for ADDS."
+ )
+from PIL import Image
+
+from ..registry import PIPELINES
+
+
+@PIPELINES.register()
+class ImageDecoder(object):
+ """Decode Image
+ """
+ def __init__(self,
+ dataset,
+ frame_idxs,
+ num_scales,
+ side_map,
+ full_res_shape,
+ img_ext,
+ backend='cv2'):
+ self.backend = backend
+ self.dataset = dataset
+ self.frame_idxs = frame_idxs
+ self.num_scales = num_scales
+ self.side_map = side_map
+ self.full_res_shape = full_res_shape
+ self.img_ext = img_ext
+
+ def _pil_loader(self, path):
+ with open(path, 'rb') as f:
+ with Image.open(f) as img:
+ return img.convert('RGB')
+
+ def get_color(self, folder, frame_index, side):
+ color = self._pil_loader(
+ self.get_image_path(self.dataset, folder, frame_index, side))
+ return color
+
+ def get_image_path(self, dataset, folder, frame_index, side):
+ if dataset == "kitti":
+ f_str = "{:010d}{}".format(frame_index, self.img_ext)
+ image_path = os.path.join(self.data_path, folder, f_str)
+ elif dataset == "kitti_odom":
+ f_str = "{:06d}{}".format(frame_index, self.img_ext)
+ image_path = os.path.join(self.data_path,
+ "sequences/{:02d}".format(int(folder)),
+ "image_{}".format(self.side_map[side]),
+ f_str)
+ elif dataset == "kitti_depth":
+ f_str = "{:010d}{}".format(frame_index, self.img_ext)
+ image_path = os.path.join(
+ self.data_path, folder,
+ "image_0{}/data".format(self.side_map[side]), f_str)
+
+ return image_path
+
+ def get_depth(self, dataset, folder, frame_index, side):
+        if dataset == "kitti_depth":
+ f_str = "{:010d}.png".format(frame_index)
+ depth_path = os.path.join(
+ self.data_path, folder,
+ "proj_depth/groundtruth/image_0{}".format(self.side_map[side]),
+ f_str)
+
+ depth_gt = pil.open(depth_path)
+ depth_gt = depth_gt.resize(self.full_res_shape, pil.NEAREST)
+ depth_gt = np.array(depth_gt).astype(np.float32) / 256
+
+ else:
+ f_str = "{:010d}{}".format(frame_index, self.img_ext)
+ depth_path = os.path.join(self.data_path, folder + '_gt', f_str)
+
+ img_file = Image.open(depth_path)
+ depth_png = np.array(img_file, dtype=int)
+ img_file.close()
+ # make sure we have a proper 16bit depth map here.. not 8bit!
+ assert np.max(depth_png) > 255, \
+ "np.max(depth_png)={}, path={}".format(np.max(depth_png), depth_path)
+
+            depth_gt = depth_png.astype(np.float32) / 256.
+
+ depth_gt = depth_gt[160:960 - 160, :]
+
+ depth_gt = skimage.transform.resize(depth_gt,
+ self.full_res_shape[::-1],
+ order=0,
+ preserve_range=True,
+ mode='constant')
+
+ return depth_gt
+
+ def __call__(self, results):
+ """
+        Perform image decode operations.
+        return:
+            dict of decoded images stored under tuple keys such as ("color", frame_idx, scale).
+ """
+ if results.get('mode', None) == 'infer':
+ imgs = {}
+ imgs[("color", 0,
+ -1)] = Image.open(results["filename"]).convert("RGB")
+ results['imgs'] = imgs
+ return results
+
+ self.data_path = results['data_path']
+ results['backend'] = self.backend
+
+ imgs = {}
+
+ results['frame_idxs'] = self.frame_idxs
+ results['num_scales'] = self.num_scales
+
+ file_name = results['filename']
+ folder = results['folder']
+ frame_index = results['frame_index']
+ line = file_name.split('/')
+ istrain = folder.split('_')[1]
+ if 'mode' not in results:
+ results['mode'] = istrain
+ results['day_or_night'] = folder.split('_')[0]
+
+ if istrain == "train":
+ if folder[0] == 'd':
+ folder2 = folder + '_fake_night'
+ flag = 0
+ else:
+ folder2 = folder + '_fake_day'
+ tmp = folder
+ folder = folder2
+ folder2 = tmp
+ flag = 1
+
+ if len(line) == 3:
+ side = line[2]
+ else:
+ side = None
+
+ results['side'] = side
+
+ for i in self.frame_idxs:
+
+ if i == "s":
+ other_side = {"r": "l", "l": "r"}[side]
+ imgs[("color", i,
+ -1)] = self.get_color(folder, frame_index, other_side)
+ imgs[("color_n", i,
+ -1)] = self.get_color(folder2, frame_index,
+ other_side)
+ else:
+ imgs[("color", i,
+ -1)] = self.get_color(folder, frame_index + i, side)
+ imgs[("color_n", i,
+ -1)] = self.get_color(folder2, frame_index + i, side)
+
+ istrain = folder.split('_')[1]
+ if istrain != 'train':
+ if flag:
+                    depth_gt = self.get_depth(self.dataset, folder2, frame_index, side)
+                else:
+                    depth_gt = self.get_depth(self.dataset, folder, frame_index, side)
+ imgs["depth_gt"] = np.expand_dims(depth_gt, 0)
+ elif istrain == 'val':
+ if len(line) == 3:
+ side = line[2]
+ else:
+ side = None
+
+ for i in self.frame_idxs:
+ if i == "s":
+ other_side = {"r": "l", "l": "r"}[side]
+ imgs[("color", i,
+ -1)] = self.get_color(folder, frame_index, other_side)
+ else:
+
+ imgs[("color", i,
+ -1)] = self.get_color(folder, frame_index + i, side)
+
+ # adjusting intrinsics to match each scale in the pyramid
+
+ depth_gt = self.get_depth(self.dataset, folder, frame_index, side)
+ imgs["depth_gt"] = np.expand_dims(depth_gt, 0)
+ results['imgs'] = imgs
+
+ return results
diff --git a/paddlevideo/loader/pipelines/decode_sampler.py b/paddlevideo/loader/pipelines/decode_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f8f8743d1a06213e37210a5703b4d0a6b501c75
--- /dev/null
+++ b/paddlevideo/loader/pipelines/decode_sampler.py
@@ -0,0 +1,93 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import numpy as np
+from PIL import Image
+import decord as de
+from ..registry import PIPELINES
+
+
+@PIPELINES.register()
+class DecodeSampler(object):
+ """
+    We use 'decord' for decoding and sampling, which is faster than OpenCV.
+    This is used in the SlowFast model.
+    Args:
+        num_frames(int): the number of frames to sample.
+        sampling_rate(int): sampling rate for video data.
+        target_fps(int): desired fps, default 30.
+        test_mode(bool): whether in test mode (otherwise train/valid). SlowFast uses multi-crop at test time.
+ """
+ def __init__(self,
+ num_frames,
+ sampling_rate,
+ default_sampling_rate=2,
+ target_fps=30,
+ test_mode=False):
+ self.num_frames = num_frames
+ self.orig_sampling_rate = self.sampling_rate = sampling_rate
+ self.default_sampling_rate = default_sampling_rate
+ self.target_fps = target_fps
+ self.test_mode = test_mode
+
+ def get_start_end_idx(self, video_size, clip_size, clip_idx,
+ temporal_num_clips):
+ delta = max(video_size - clip_size, 0)
+ if not self.test_mode:
+ # Random temporal sampling.
+ start_idx = random.uniform(0, delta)
+ else:
+ # Uniformly sample the clip with the given index.
+ start_idx = delta * clip_idx / temporal_num_clips
+ end_idx = start_idx + clip_size - 1
+ return start_idx, end_idx
+
+ def __call__(self, results):
+ """
+ Perform mp4 decode operations.
+ return:
+ List where each item is a numpy array after decoder.
+ """
+ short_cycle_idx = results.get('short_cycle_idx')
+ if short_cycle_idx:
+ self.sampling_rate = random.randint(self.default_sampling_rate,
+ self.orig_sampling_rate)
+
+ filepath = results['filename']
+ temporal_sample_index = results['temporal_sample_index']
+ temporal_num_clips = results['temporal_num_clips']
+
+ vr = de.VideoReader(filepath)
+ videolen = len(vr)
+
+ fps = vr.get_avg_fps()
+ clip_size = self.num_frames * self.sampling_rate * fps / self.target_fps
+
+ start_idx, end_idx = self.get_start_end_idx(videolen, clip_size,
+ temporal_sample_index,
+ temporal_num_clips)
+ index = np.linspace(start_idx, end_idx, self.num_frames).astype("int64")
+        index = np.clip(index, 0, videolen - 1)  # keep indices valid when the clip is longer than the video
+
+ frames_select = vr.get_batch(index) #1 for buffer
+
+ # dearray_to_img
+ np_frames = frames_select.asnumpy()
+ frames_select_list = []
+ for i in range(np_frames.shape[0]):
+ imgbuf = np_frames[i]
+ frames_select_list.append(Image.fromarray(imgbuf, mode='RGB'))
+ results['imgs'] = frames_select_list
+ return results
diff --git a/paddlevideo/loader/pipelines/decode_sampler_MRI.py b/paddlevideo/loader/pipelines/decode_sampler_MRI.py
new file mode 100644
index 0000000000000000000000000000000000000000..16f64514e559bb45cf0241e243acb21865845b18
--- /dev/null
+++ b/paddlevideo/loader/pipelines/decode_sampler_MRI.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+
+import numpy as np
+from PIL import Image
+try:
+ import SimpleITK as sitk
+except ImportError as e:
+ print(
+        f"{e}, the [SimpleITK] package and its dependencies are required for PP-Care."
+ )
+import cv2
+
+from ..registry import PIPELINES
+
+
+@PIPELINES.register()
+class SFMRI_DecodeSampler(object):
+ """
+    Sample frame indices for the slow and fast pathways.
+    NOTE: frames are read from the MRI volume with SimpleITK and resized with cv2.
+ Args:
+ num_seg(int): number of segments.
+ seg_len(int): number of sampled frames in each segment.
+ valid_mode(bool): True or False.
+ select_left: Whether to select the frame to the left in the middle when the sampling interval is even in the test mode.
+ Returns:
+        frames_idx: the indices of the sampled frames.
+ """
+ def __init__(self,
+ num_seg,
+ seg_len,
+ valid_mode=False,
+ select_left=False,
+ dense_sample=False,
+ linspace_sample=False):
+ self.num_seg = num_seg
+ self.seg_len = seg_len
+ self.valid_mode = valid_mode
+ self.select_left = select_left
+ self.dense_sample = dense_sample
+ self.linspace_sample = linspace_sample
+
+ def _get(self, frames_idx_s, frames_idx_f, results):
+
+ frame_dir = results['frame_dir']
+ imgs_s = []
+ imgs_f = []
+ MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir))
+ for idx in frames_idx_s:
+ item = MRI[idx]
+ item = cv2.resize(item, (224, 224))
+ imgs_s.append(item)
+
+ for idx in frames_idx_f:
+ item = MRI[idx]
+ item = cv2.resize(item, (224, 224))
+ imgs_f.append(item)
+
+ results['imgs'] = [imgs_s, imgs_f]
+ return results
+
+ def __call__(self, results):
+ """
+ Args:
+ frames_len: length of frames.
+ return:
+ sampling id.
+ """
+ frames_len = int(results['frames_len'])
+ average_dur1 = int(frames_len / self.num_seg[0])
+ average_dur2 = int(frames_len / self.num_seg[1])
+ frames_idx_s = []
+ frames_idx_f = []
+ if self.linspace_sample:
+ if 'start_idx' in results and 'end_idx' in results:
+ offsets_s = np.linspace(results['start_idx'],
+ results['end_idx'], self.num_seg[0])
+ offsets_f = np.linspace(results['start_idx'],
+ results['end_idx'], self.num_seg[1])
+ else:
+ offsets_s = np.linspace(0, frames_len - 1, self.num_seg[0])
+ offsets_f = np.linspace(0, frames_len - 1, self.num_seg[1])
+ offsets_s = np.clip(offsets_s, 0, frames_len - 1).astype(np.int64)
+ offsets_f = np.clip(offsets_f, 0, frames_len - 1).astype(np.int64)
+
+ frames_idx_s = list(offsets_s)
+ frames_idx_f = list(offsets_f)
+
+ return self._get(frames_idx_s, frames_idx_f, results)
+
+ if not self.select_left:
+ if self.dense_sample: # For ppTSM
+ if not self.valid_mode: # train
+ sample_pos = max(1, 1 + frames_len - 64)
+ t_stride1 = 64 // self.num_seg[0]
+ t_stride2 = 64 // self.num_seg[1]
+ start_idx = 0 if sample_pos == 1 else np.random.randint(
+ 0, sample_pos - 1)
+ offsets_s = [(idx * t_stride1 + start_idx) % frames_len + 1
+ for idx in range(self.num_seg[0])]
+ offsets_f = [(idx * t_stride2 + start_idx) % frames_len + 1
+ for idx in range(self.num_seg[1])]
+ frames_idx_s = offsets_s
+ frames_idx_f = offsets_f
+ else:
+ sample_pos = max(1, 1 + frames_len - 64)
+ t_stride1 = 64 // self.num_seg[0]
+ t_stride2 = 64 // self.num_seg[1]
+ start_list = np.linspace(0,
+ sample_pos - 1,
+ num=10,
+ dtype=int)
+ offsets_s = []
+ offsets_f = []
+ for start_idx in start_list.tolist():
+ offsets_s += [
+ (idx * t_stride1 + start_idx) % frames_len + 1
+ for idx in range(self.num_seg[0])
+ ]
+ for start_idx in start_list.tolist():
+ offsets_f += [
+ (idx * t_stride2 + start_idx) % frames_len + 1
+ for idx in range(self.num_seg[1])
+ ]
+ frames_idx_s = offsets_s
+ frames_idx_f = offsets_f
+ else:
+ for i in range(self.num_seg[0]):
+ idx = 0
+ if not self.valid_mode:
+ if average_dur1 >= self.seg_len:
+ idx = random.randint(0, average_dur1 - self.seg_len)
+ idx += i * average_dur1
+ elif average_dur1 >= 1:
+ idx += i * average_dur1
+ else:
+ idx = i
+ else:
+ if average_dur1 >= self.seg_len:
+ idx = (average_dur1 - 1) // 2
+ idx += i * average_dur1
+ elif average_dur1 >= 1:
+ idx += i * average_dur1
+ else:
+ idx = i
+ for jj in range(idx, idx + self.seg_len):
+ frames_idx_s.append(jj)
+
+ for i in range(self.num_seg[1]):
+ idx = 0
+ if not self.valid_mode:
+ if average_dur2 >= self.seg_len:
+ idx = random.randint(0, average_dur2 - self.seg_len)
+ idx += i * average_dur2
+ elif average_dur2 >= 1:
+ idx += i * average_dur2
+ else:
+ idx = i
+ else:
+ if average_dur2 >= self.seg_len:
+ idx = (average_dur2 - 1) // 2
+ idx += i * average_dur2
+ elif average_dur2 >= 1:
+ idx += i * average_dur2
+ else:
+ idx = i
+ for jj in range(idx, idx + self.seg_len):
+ frames_idx_f.append(jj)
+
+ return self._get(frames_idx_s, frames_idx_f, results)
+
+ else: # for TSM
+ if not self.valid_mode:
+ if average_dur2 > 0:
+ offsets_s = np.multiply(list(range(
+ self.num_seg[0])), average_dur1) + np.random.randint(
+ average_dur1, size=self.num_seg[0])
+
+ offsets_f = np.multiply(list(range(
+ self.num_seg[1])), average_dur2) + np.random.randint(
+ average_dur2, size=self.num_seg[1])
+ elif frames_len > self.num_seg[1]:
+ offsets_s = np.sort(
+ np.random.randint(frames_len, size=self.num_seg[0]))
+ offsets_f = np.sort(
+ np.random.randint(frames_len, size=self.num_seg[1]))
+ else:
+ offsets_s = np.zeros(shape=(self.num_seg[0], ))
+ offsets_f = np.zeros(shape=(self.num_seg[1], ))
+ else:
+ if frames_len > self.num_seg[1]:
+ average_dur_float_s = frames_len / self.num_seg[0]
+ offsets_s = np.array([
+ int(average_dur_float_s / 2.0 + average_dur_float_s * x)
+ for x in range(self.num_seg[0])
+ ])
+ average_dur_float_f = frames_len / self.num_seg[1]
+ offsets_f = np.array([
+ int(average_dur_float_f / 2.0 + average_dur_float_f * x)
+ for x in range(self.num_seg[1])
+ ])
+ else:
+ offsets_s = np.zeros(shape=(self.num_seg[0], ))
+ offsets_f = np.zeros(shape=(self.num_seg[1], ))
+
+ frames_idx_s = list(offsets_s)
+ frames_idx_f = list(offsets_f)
+
+ return self._get(frames_idx_s, frames_idx_f, results)
diff --git a/paddlevideo/loader/pipelines/mix.py b/paddlevideo/loader/pipelines/mix.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccc5f98cf0e6e3ead3848a47ea76c8ceb478f2f0
--- /dev/null
+++ b/paddlevideo/loader/pipelines/mix.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ..registry import PIPELINES
+
+
+@PIPELINES.register()
+class Mixup(object):
+ """
+ Mixup operator.
+ Args:
+ alpha(float): alpha value.
+ """
+ def __init__(self, alpha=0.2):
+ assert alpha > 0., \
+ 'parameter alpha[%f] should > 0.0' % (alpha)
+ self.alpha = alpha
+
+ def __call__(self, batch):
+ imgs, labels = list(zip(*batch))
+ imgs = np.array(imgs)
+ labels = np.array(labels)
+ bs = len(batch)
+ idx = np.random.permutation(bs)
+ lam = np.random.beta(self.alpha, self.alpha)
+ lams = np.array([lam] * bs, dtype=np.float32)
+ imgs = lam * imgs + (1 - lam) * imgs[idx]
+ return list(zip(imgs, labels, labels[idx], lams))
+
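+# Usage sketch (illustrative only): Mixup is intended as a batch-level collate
+# transform, e.g.
+#   >>> mixup = Mixup(alpha=0.2)
+#   >>> mixed = mixup(list(zip(imgs, labels)))   # 'imgs'/'labels' are assumed arrays
+# Each item becomes (lam * img_i + (1 - lam) * img_j, label_i, label_j, lam),
+# with a single lam drawn per batch from Beta(alpha, alpha).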
+
+@PIPELINES.register()
+class Cutmix(object):
+ """ Cutmix operator
+ Args:
+ alpha(float): alpha value.
+ """
+ def __init__(self, alpha=0.2):
+ assert alpha > 0., \
+ 'parameter alpha[%f] should > 0.0' % (alpha)
+ self.alpha = alpha
+
+ def rand_bbox(self, size, lam):
+ """ rand_bbox """
+ w = size[2]
+ h = size[3]
+ cut_rat = np.sqrt(1. - lam)
+        cut_w = int(w * cut_rat)
+        cut_h = int(h * cut_rat)
+
+ # uniform
+ cx = np.random.randint(w)
+ cy = np.random.randint(h)
+
+ bbx1 = np.clip(cx - cut_w // 2, 0, w)
+ bby1 = np.clip(cy - cut_h // 2, 0, h)
+ bbx2 = np.clip(cx + cut_w // 2, 0, w)
+ bby2 = np.clip(cy + cut_h // 2, 0, h)
+
+ return bbx1, bby1, bbx2, bby2
+
+ def __call__(self, batch):
+ imgs, labels = list(zip(*batch))
+ imgs = np.array(imgs)
+ labels = np.array(labels)
+
+ bs = len(batch)
+ idx = np.random.permutation(bs)
+ lam = np.random.beta(self.alpha, self.alpha)
+
+ bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.shape, lam)
+ imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2]
+ lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) /
+ (imgs.shape[-2] * imgs.shape[-1]))
+ lams = np.array([lam] * bs, dtype=np.float32)
+
+ return list(zip(imgs, labels, labels[idx], lams))
+
+
+@PIPELINES.register()
+class VideoMix(object):
+ """
+ VideoMix operator.
+ Args:
+ cutmix_prob(float): prob choose cutmix
+ mixup_alpha(float): alpha for mixup aug
+ cutmix_alpha(float): alpha for cutmix aug
+ """
+ def __init__(self, cutmix_prob=0.5, mixup_alpha=0.2, cutmix_alpha=1.0):
+ assert cutmix_prob > 0., \
+ 'parameter cutmix_prob[%f] should > 0.0' % (cutmix_prob)
+ assert mixup_alpha > 0., \
+ 'parameter mixup_alpha[%f] should > 0.0' % (mixup_alpha)
+ assert cutmix_alpha > 0., \
+ 'parameter cutmix_alpha[%f] should > 0.0' % (cutmix_alpha)
+ self.cutmix_prob = cutmix_prob
+ self.mixup = Mixup(mixup_alpha)
+ self.cutmix = Cutmix(cutmix_alpha)
+
+ def __call__(self, batch):
+ if np.random.random() < self.cutmix_prob:
+ return self.cutmix(batch)
+ else:
+ return self.mixup(batch)
diff --git a/paddlevideo/loader/pipelines/multimodal.py b/paddlevideo/loader/pipelines/multimodal.py
new file mode 100644
index 0000000000000000000000000000000000000000..f00f68bea297f5bfe49b4d2b5baaaae039dbf79f
--- /dev/null
+++ b/paddlevideo/loader/pipelines/multimodal.py
@@ -0,0 +1,380 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import numpy as np
+from PIL import Image
+import decord as de
+import copy
+import json
+from ..registry import PIPELINES
+
+try:
+ from paddlenlp.transformers import BertTokenizer
+except ImportError as e:
+ print(
+ f"{e}, [paddlenlp] package and it's dependencies is required for ActBERT."
+ )
+
+
+@PIPELINES.register()
+class FeaturePadding(object):
+ """
+ Padding feature to target shape.
+ """
+ def __init__(self, max_region_num=36, max_action_num=5):
+ self.max_region_num = max_region_num
+ self.max_action_num = max_action_num
+
+ def __call__(self, results):
+ """
+ Padding feature.
+ """
+ pack_feature = results['feature']
+ tokenizer = results['tokenizer']
+ image_feature_wp, image_target_wp, image_location_wp, \
+ num_boxes, image_h, image_w, image_id, caption, \
+ action_feature_wp, action_target_wp, num_actions = pack_feature
+
+ image_feature = np.zeros((self.max_region_num, 2048), dtype=np.float32)
+ image_target = np.zeros((self.max_region_num, 1601), dtype=np.float32)
+ image_location = np.zeros((self.max_region_num, 5), dtype=np.float32)
+
+ action_feature = np.zeros((self.max_action_num, 2048), dtype=np.float32)
+ action_target = np.zeros((self.max_action_num, ), dtype=np.int64)
+
+ num_boxes = int(num_boxes)
+ image_feature[:num_boxes] = image_feature_wp
+ image_target[:num_boxes] = image_target_wp
+ image_location[:num_boxes, :4] = image_location_wp
+
+ image_location[:, 4] = (image_location[:, 3] - image_location[:, 1]) * (
+ image_location[:, 2] - image_location[:, 0]) / (float(image_w) *
+ float(image_h))
+
+ image_location[:, 0] = image_location[:, 0] / float(image_w)
+ image_location[:, 1] = image_location[:, 1] / float(image_h)
+ image_location[:, 2] = image_location[:, 2] / float(image_w)
+ image_location[:, 3] = image_location[:, 3] / float(image_h)
+
+ image_feature = copy.deepcopy(image_feature)
+ image_target = copy.deepcopy(image_target)
+
+ num_actions = int(num_actions)
+ action_feature[:num_actions] = action_feature_wp
+ action_target[:num_actions] = action_target_wp
+ action_feature = copy.deepcopy(action_feature)
+ action_target = copy.deepcopy(action_target)
+
+ results = dict(image_feat=image_feature,
+ image_target=image_target,
+ caption=caption,
+ image_loc=image_location,
+ num_boxes=int(num_boxes),
+ action_feat=action_feature,
+ action_target=action_target,
+ num_actions=int(num_actions),
+ tokenizer=tokenizer)
+ return results
+
+
+@PIPELINES.register()
+class RandomCap(object):
+ def __init__(self, caption_path):
+ """
+ Random Caption for NSP task
+ """
+ self.caption_path = caption_path
+
+ def select_caption(self, caption):
+ captions = caption.split('!')
+ rind = random.randint(0, len(captions) - 1)
+ caption = captions[rind]
+ return caption
+
+ def get_random_caption(self, all_captions):
+ num_caps = len(all_captions)
+ rand_doc_idx = random.randint(0, num_caps - 1)
+ caption = all_captions[rand_doc_idx]
+ caption = self.select_caption(caption)
+ return caption
+
+ def random_cap(self, caption, all_captions):
+ if random.random() > 0.5:
+ label = 0
+ else:
+ caption = self.get_random_caption(all_captions)
+ label = 1
+ return caption, label
+
+ def __call__(self, results):
+ caption = results['caption']
+ all_captions = list(json.load(open(self.caption_path, 'r')))
+ caption = self.select_caption(caption)
+ caption, label = self.random_cap(caption, all_captions)
+ results['caption'] = caption
+ results['is_next'] = label
+ return results
+
+
+@PIPELINES.register()
+class Tokenize(object):
+ def __init__(self, ):
+ """
+ Tokenize caption
+ """
+ pass
+
+ def __call__(self, results):
+ caption = results['caption']
+ tokenizer = results['tokenizer']
+ tokens_caption = tokenizer.tokenize(caption)
+ results['caption'] = tokens_caption
+ return results
+
+
+@PIPELINES.register()
+class RandomMask(object):
+ def __init__(self,
+ max_seq_length=36,
+ max_action_length=5,
+ max_region_length=36):
+ self.max_seq_length = max_seq_length
+ self.max_action_length = max_action_length
+ self.max_region_length = max_region_length
+
+ def get_image_global_feature(self, image_feat, image_loc, image_mask):
+ g_image_feat = np.sum(image_feat, axis=0) / np.sum(
+ image_mask, axis=0, keepdims=True)
+ image_feat = np.concatenate(
+ [np.expand_dims(g_image_feat, axis=0), image_feat],
+ axis=0).astype("float32")
+
+ g_image_loc = np.array([0, 0, 1, 1, 1]).astype("float32")
+ image_loc = np.concatenate(
+ [np.expand_dims(g_image_loc, axis=0), image_loc], axis=0)
+
+ g_image_mask = np.array([1])
+ image_mask = np.concatenate([g_image_mask, image_mask], axis=0)
+
+ return image_feat, image_loc, image_mask
+
+ def _truncate_seq_pair(self, tokens_b, max_length):
+ """Truncates a sequence pair in place to the maximum length.
+ This is a simple heuristic which will always truncate the longer sequence
+ one token at a time. This makes more sense than truncating an equal percent
+ of tokens from each, since if one sequence is very short then each token
+ that's truncated likely contains more information than a longer sequence.
+ """
+ while True:
+ total_length = len(tokens_b)
+ if total_length <= max_length:
+ break
+ tokens_b.pop()
+
+ def random_word(self, tokens, tokenizer):
+ """
+ Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
+ Args:
+ tokens: list of str, tokenized sentence.
+            tokenizer: Tokenizer, object used for tokenization (we need its vocab here)
+ Return:
+ (list of str, list of int), masked tokens and related labels for LM prediction
+ """
+ output_label = []
+
+ for i, token in enumerate(tokens):
+ prob = random.random()
+ # mask token with 15% probability
+
+ if prob < 0.15:
+ prob /= 0.15
+
+ # 80% randomly change token to mask token
+ if prob < 0.8:
+ tokens[i] = "[MASK]"
+
+ # 10% randomly change token to random token
+ elif prob < 0.9:
+ #tok = random.choice(list(tokenizer.vocab.items()))[0]
+                    # random.randint is inclusive on both ends, so the upper
+                    # bound must be vocab_size - 1 to stay inside the vocab
+                    tok = tokenizer.vocab.idx_to_token[random.randint(
+                        0, tokenizer.vocab_size - 1)]
+ tokens[i] = tok
+
+ # rest 10% randomly keep current token
+ # append current token to output (we will predict these later)
+ try:
+ output_label.append(tokenizer.vocab[token])
+ except KeyError:
+ # For unknown words (should not occur with BPE vocab)
+ output_label.append(tokenizer.vocab["[UNK]"])
+ print(
+ "Cannot find token '{}' in vocab. Using [UNK] insetad".
+ format(token))
+ else:
+ # no masking token (will be ignored by loss function later)
+ output_label.append(-1)
+
+ return tokens, output_label
+
+ def random_region(self, image_feat, image_loc, num_boxes):
+ output_label = []
+
+ for i in range(num_boxes):
+ prob = random.random()
+ # mask token with 15% probability
+ if prob < 0.15:
+ prob /= 0.15
+
+                # 90% randomly zero out the region feature
+ if prob < 0.9:
+ image_feat[i] = 0
+
+                # rest 10% randomly keep the current feature
+ # append current token to output (we will predict these later)
+ output_label.append(1)
+ else:
+ # no masking token (will be ignored by loss function later)
+ output_label.append(-1)
+
+ return image_feat, image_loc, output_label
+
+ def random_action(self, action_feat, action_target, num_actions):
+ output_label = []
+
+ for i in range(num_actions):
+ prob = random.random()
+ # mask token with 15% probability
+ if prob < 0.15:
+ prob /= 0.15
+
+ # 90% randomly change token to mask token
+ if prob < 0.9:
+ action_feat[i] = 0
+
+ # rest 10% randomly keep current token
+ # append current token to output (we will predict these later)
+ output_label.append(action_target[i])
+ else:
+ # no masking token (will be ignored by loss function later)
+ output_label.append(-1)
+
+ return action_feat, output_label
+
+ def __call__(self, results):
+ caption = results['caption']
+ tokenizer = results['tokenizer']
+ image_feat = results['image_feat']
+ image_loc = results['image_loc']
+ num_boxes = results['num_boxes']
+ action_feat = results['action_feat']
+ action_target = results['action_target']
+ num_actions = results['num_actions']
+ is_next = results['is_next']
+ image_target = results['image_target']
+
+ self._truncate_seq_pair(caption, self.max_seq_length - 2)
+ caption, caption_label = self.random_word(caption, tokenizer)
+
+ image_feat, image_loc, image_label = self.random_region(
+ image_feat, image_loc, num_boxes)
+ action_feat, action_label = self.random_action(action_feat,
+ action_target,
+ num_actions)
+
+        # concatenate lm labels and account for the [CLS] and [SEP] tokens
+ lm_label_ids = [-1] + caption_label + [-1]
+
+ # The convention in BERT is:
+ # (a) For sequence pairs:
+ # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+ # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+ # (b) For single sequences:
+ # tokens: [CLS] the dog is hairy . [SEP]
+ # type_ids: 0 0 0 0 0 0 0
+ #
+ # Where "type_ids" are used to indicate whether this is the first
+ # sequence or the second sequence. The embedding vectors for `type=0` and
+ # `type=1` were learned during pre-training and are added to the wordpiece
+ # embedding vector (and position vector). This is not *strictly* necessary
+        # since the [SEP] token unambiguously separates the sequences, but it makes
+ # it easier for the model to learn the concept of sequences.
+ #
+ # For classification tasks, the first vector (corresponding to [CLS]) is
+        # used as the "sentence vector". Note that this only makes sense because
+ # the entire model is fine-tuned.
+
+ tokens = []
+ segment_ids = []
+
+ tokens.append("[CLS]")
+ segment_ids.append(0)
+
+ for token in caption:
+ tokens.append(token)
+ segment_ids.append(0)
+ tokens.append("[SEP]")
+ segment_ids.append(0)
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
+ input_mask = [1] * (len(input_ids))
+ image_mask = [1] * (num_boxes)
+ action_mask = [1] * (num_actions)
+
+ # Zero-pad up to the visual sequence length.
+ while len(image_mask) < self.max_region_length:
+ image_mask.append(0)
+ image_label.append(-1)
+ while len(action_mask) < self.max_action_length:
+ action_mask.append(0)
+ action_label.append(-1)
+
+ # Zero-pad up to the sequence length.
+ while len(input_ids) < self.max_seq_length:
+ input_ids.append(0)
+ input_mask.append(0)
+ segment_ids.append(0)
+ lm_label_ids.append(-1)
+
+ assert len(input_ids) == self.max_seq_length
+ assert len(input_mask) == self.max_seq_length
+ assert len(segment_ids) == self.max_seq_length
+ assert len(lm_label_ids) == self.max_seq_length
+ assert len(image_mask) == self.max_region_length
+ assert len(image_label) == self.max_region_length
+ assert len(action_mask) == self.max_action_length
+ assert len(action_label) == self.max_action_length
+
+ image_feat, image_loc, image_mask = self.get_image_global_feature(
+ image_feat, image_loc, np.array(image_mask))
+ features = [
+ np.array(input_ids),
+ action_feat,
+ image_feat,
+ image_loc,
+ np.array(segment_ids),
+ np.array(input_mask),
+ image_mask,
+ np.array(action_mask),
+ np.array(lm_label_ids),
+ np.array(action_label),
+ np.array(is_next),
+ np.array(image_label),
+ image_target,
+ ]
+ results['features'] = features
+ return results
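+
+
+# NOTE(editor): not part of the original patch. A hedged sketch of how these
+# ops compose, inferred from the dict keys each one reads and writes:
+# FeaturePadding pads the packed region/action features, RandomCap picks a
+# (possibly mismatched) caption and sets 'is_next', Tokenize converts the
+# caption to word pieces, and RandomMask builds the final 'features' list.
+# The caption path below is a placeholder.
+#
+#     ops = [FeaturePadding(), RandomCap('path/to/all_captions.json'),
+#            Tokenize(), RandomMask()]
+#     for op in ops:
+#         results = op(results)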
diff --git a/paddlevideo/loader/pipelines/sample.py b/paddlevideo/loader/pipelines/sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..9780ae333760d16830c7a8675826d32a6baaef02
--- /dev/null
+++ b/paddlevideo/loader/pipelines/sample.py
@@ -0,0 +1,380 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+
+import numpy as np
+from PIL import Image
+try:
+ import SimpleITK as sitk
+except ImportError as e:
+ print(
+ f"{e}, [SimpleITK] package and it's dependencies is required for PP-Care."
+ )
+import cv2
+
+from ..registry import PIPELINES
+
+try:
+ import cPickle as pickle
+ from cStringIO import StringIO
+except ImportError:
+ import pickle
+ from io import BytesIO
+
+
+@PIPELINES.register()
+class Sampler(object):
+ """
+    Sample frame indices.
+    NOTE: PIL is used to read images here, which may differ slightly from cv2.
+    Args:
+        num_seg(int): number of segments.
+        seg_len(int): number of sampled frames in each segment.
+        valid_mode(bool): True for validation/test mode, False for training.
+        select_left(bool): whether to select the left-of-middle frame when the sampling interval is even in test mode.
+    Returns:
+        frames_idx: the indices of the sampled frames.
+ """
+ def __init__(self,
+ num_seg,
+ seg_len,
+ frame_interval=None,
+ valid_mode=False,
+ select_left=False,
+ dense_sample=False,
+ linspace_sample=False,
+ use_pil=True):
+ self.num_seg = num_seg
+ self.seg_len = seg_len
+ self.frame_interval = frame_interval
+ self.valid_mode = valid_mode
+ self.select_left = select_left
+ self.dense_sample = dense_sample
+ self.linspace_sample = linspace_sample
+ self.use_pil = use_pil
+
+ def _get(self, frames_idx, results):
+ data_format = results['format']
+
+ if data_format == "frame":
+ frame_dir = results['frame_dir']
+ imgs = []
+ for idx in frames_idx:
+ img = Image.open(
+ os.path.join(frame_dir,
+ results['suffix'].format(idx))).convert('RGB')
+ imgs.append(img)
+
+ elif data_format == "MRI":
+ frame_dir = results['frame_dir']
+ imgs = []
+ MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir))
+ for idx in frames_idx:
+ item = MRI[idx]
+ item = cv2.resize(item, (224, 224))
+ imgs.append(item)
+
+ elif data_format == "video":
+ if results['backend'] == 'cv2':
+ frames = np.array(results['frames'])
+ imgs = []
+ for idx in frames_idx:
+ imgbuf = frames[idx]
+ img = Image.fromarray(imgbuf, mode='RGB')
+ imgs.append(img)
+ elif results['backend'] == 'decord':
+ container = results['frames']
+ if self.use_pil:
+ frames_select = container.get_batch(frames_idx)
+ # dearray_to_img
+ np_frames = frames_select.asnumpy()
+ imgs = []
+ for i in range(np_frames.shape[0]):
+ imgbuf = np_frames[i]
+ imgs.append(Image.fromarray(imgbuf, mode='RGB'))
+ else:
+ if frames_idx.ndim != 1:
+ frames_idx = np.squeeze(frames_idx)
+ frame_dict = {
+ idx: container[idx].asnumpy()
+ for idx in np.unique(frames_idx)
+ }
+ imgs = [frame_dict[idx] for idx in frames_idx]
+ elif results['backend'] == 'pyav':
+ imgs = []
+ frames = np.array(results['frames'])
+ for idx in frames_idx:
+ imgbuf = frames[idx]
+ imgs.append(imgbuf)
+ imgs = np.stack(imgs) # thwc
+ else:
+ raise NotImplementedError
+ else:
+ raise NotImplementedError
+ results['imgs'] = imgs
+ return results
+
+ def _get_train_clips(self, num_frames):
+ ori_seg_len = self.seg_len * self.frame_interval
+ avg_interval = (num_frames - ori_seg_len + 1) // self.num_seg
+
+ if avg_interval > 0:
+ base_offsets = np.arange(self.num_seg) * avg_interval
+ clip_offsets = base_offsets + np.random.randint(avg_interval,
+ size=self.num_seg)
+ elif num_frames > max(self.num_seg, ori_seg_len):
+ clip_offsets = np.sort(
+ np.random.randint(num_frames - ori_seg_len + 1,
+ size=self.num_seg))
+ elif avg_interval == 0:
+ ratio = (num_frames - ori_seg_len + 1.0) / self.num_seg
+ clip_offsets = np.around(np.arange(self.num_seg) * ratio)
+ else:
+            clip_offsets = np.zeros((self.num_seg, ), dtype=np.int64)
+ return clip_offsets
+
+ def _get_test_clips(self, num_frames):
+ ori_seg_len = self.seg_len * self.frame_interval
+ avg_interval = (num_frames - ori_seg_len + 1) / float(self.num_seg)
+ if num_frames > ori_seg_len - 1:
+ base_offsets = np.arange(self.num_seg) * avg_interval
+            clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int64)
+        else:
+            clip_offsets = np.zeros((self.num_seg, ), dtype=np.int64)
+ return clip_offsets
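+
+    # NOTE(editor): illustrative example, not part of the original patch: with
+    # num_frames=30, num_seg=8, seg_len=1 and frame_interval=2, avg_interval is
+    # 29 / 8 = 3.625 and the evenly spaced test offsets come out as
+    # [1, 5, 9, 12, 16, 19, 23, 27].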
+
+ def __call__(self, results):
+ """
+ Args:
+ frames_len: length of frames.
+ return:
+ sampling id.
+ """
+ frames_len = int(results['frames_len'])
+ frames_idx = []
+ if self.frame_interval is not None:
+ assert isinstance(self.frame_interval, int)
+ if not self.valid_mode:
+ offsets = self._get_train_clips(frames_len)
+ else:
+ offsets = self._get_test_clips(frames_len)
+
+ offsets = offsets[:, None] + np.arange(
+ self.seg_len)[None, :] * self.frame_interval
+ offsets = np.concatenate(offsets)
+
+ offsets = offsets.reshape((-1, self.seg_len))
+ offsets = np.mod(offsets, frames_len)
+ offsets = np.concatenate(offsets)
+
+ if results['format'] == 'video':
+ frames_idx = offsets
+ elif results['format'] == 'frame':
+ frames_idx = list(offsets + 1)
+ else:
+ raise NotImplementedError
+
+ return self._get(frames_idx, results)
+
+ if self.linspace_sample:
+ if 'start_idx' in results and 'end_idx' in results:
+ offsets = np.linspace(results['start_idx'], results['end_idx'],
+ self.num_seg)
+ else:
+ offsets = np.linspace(0, frames_len - 1, self.num_seg)
+ offsets = np.clip(offsets, 0, frames_len - 1).astype(np.int64)
+ if results['format'] == 'video':
+ frames_idx = list(offsets)
+ frames_idx = [x % frames_len for x in frames_idx]
+ elif results['format'] == 'frame':
+ frames_idx = list(offsets + 1)
+
+ elif results['format'] == 'MRI':
+ frames_idx = list(offsets)
+
+ else:
+ raise NotImplementedError
+ return self._get(frames_idx, results)
+
+ average_dur = int(frames_len / self.num_seg)
+ if not self.select_left:
+ if self.dense_sample: # For ppTSM
+ if not self.valid_mode: # train
+ sample_pos = max(1, 1 + frames_len - 64)
+ t_stride = 64 // self.num_seg
+ start_idx = 0 if sample_pos == 1 else np.random.randint(
+ 0, sample_pos - 1)
+ offsets = [(idx * t_stride + start_idx) % frames_len + 1
+ for idx in range(self.num_seg)]
+ frames_idx = offsets
+ else:
+ sample_pos = max(1, 1 + frames_len - 64)
+ t_stride = 64 // self.num_seg
+ start_list = np.linspace(0,
+ sample_pos - 1,
+ num=10,
+ dtype=int)
+ offsets = []
+ for start_idx in start_list.tolist():
+ offsets += [
+ (idx * t_stride + start_idx) % frames_len + 1
+ for idx in range(self.num_seg)
+ ]
+ frames_idx = offsets
+ else:
+ for i in range(self.num_seg):
+ idx = 0
+ if not self.valid_mode:
+ if average_dur >= self.seg_len:
+ idx = random.randint(0, average_dur - self.seg_len)
+ idx += i * average_dur
+ elif average_dur >= 1:
+ idx += i * average_dur
+ else:
+ idx = i
+ else:
+ if average_dur >= self.seg_len:
+ idx = (average_dur - 1) // 2
+ idx += i * average_dur
+ elif average_dur >= 1:
+ idx += i * average_dur
+ else:
+ idx = i
+ for jj in range(idx, idx + self.seg_len):
+ if results['format'] == 'video':
+ frames_idx.append(int(jj % frames_len))
+ elif results['format'] == 'frame':
+ frames_idx.append(jj + 1)
+
+ elif results['format'] == 'MRI':
+ frames_idx.append(jj)
+ else:
+ raise NotImplementedError
+ return self._get(frames_idx, results)
+
+ else: # for TSM
+ if not self.valid_mode:
+ if average_dur > 0:
+ offsets = np.multiply(list(range(self.num_seg)),
+ average_dur) + np.random.randint(
+ average_dur, size=self.num_seg)
+ elif frames_len > self.num_seg:
+ offsets = np.sort(
+ np.random.randint(frames_len, size=self.num_seg))
+ else:
+ offsets = np.zeros(shape=(self.num_seg, ))
+ else:
+ if frames_len > self.num_seg:
+ average_dur_float = frames_len / self.num_seg
+ offsets = np.array([
+ int(average_dur_float / 2.0 + average_dur_float * x)
+ for x in range(self.num_seg)
+ ])
+ else:
+ offsets = np.zeros(shape=(self.num_seg, ))
+
+ if results['format'] == 'video':
+ frames_idx = list(offsets)
+ frames_idx = [x % frames_len for x in frames_idx]
+ elif results['format'] == 'frame':
+ frames_idx = list(offsets + 1)
+
+ elif results['format'] == 'MRI':
+ frames_idx = list(offsets)
+
+ else:
+ raise NotImplementedError
+
+ return self._get(frames_idx, results)
+
+
+@PIPELINES.register()
+class SamplerPkl(object):
+ """
+    Sample frames from pickled video data.
+    NOTE: PIL is used to read images here, which may differ slightly from cv2.
+    Args:
+        num_seg(int): number of segments.
+        seg_len(int): number of sampled frames in each segment.
+        backend(str): 'pillow' keeps PIL images, any other value returns numpy arrays.
+        valid_mode(bool): True for validation/test mode, False for training.
+    Returns:
+        results: dict holding the sampled 'imgs' and the parsed 'labels'.
+ """
+ def __init__(self, num_seg, seg_len, backend='pillow', valid_mode=False):
+ self.num_seg = num_seg
+ self.seg_len = seg_len
+ self.valid_mode = valid_mode
+ self.backend = backend
+
+ def _get(self, buf):
+ if isinstance(buf, str):
+ img = Image.open(StringIO(buf))
+ else:
+ img = Image.open(BytesIO(buf))
+ img = img.convert('RGB')
+ if self.backend != 'pillow':
+ img = np.array(img)
+ return img
+
+ def __call__(self, results):
+ """
+ Args:
+ frames_len: length of frames.
+ return:
+ sampling id.
+ """
+ filename = results['frame_dir']
+ data_loaded = pickle.load(open(filename, 'rb'), encoding='bytes')
+ video_name, label, frames = data_loaded
+ if isinstance(label, dict):
+            label = label['动作类型']  # annotation key meaning "action type"
+ results['labels'] = label
+ elif len(label) == 1:
+ results['labels'] = int(label[0])
+ else:
+ results['labels'] = int(label[0]) if random.random() < 0.5 else int(
+ label[1])
+ results['frames_len'] = len(frames)
+ frames_len = results['frames_len']
+ average_dur = int(int(frames_len) / self.num_seg)
+ imgs = []
+ for i in range(self.num_seg):
+ idx = 0
+ if not self.valid_mode:
+ if average_dur >= self.seg_len:
+ idx = random.randint(0, average_dur - self.seg_len)
+ idx += i * average_dur
+ elif average_dur >= 1:
+ idx += i * average_dur
+ else:
+ idx = i
+ else:
+ if average_dur >= self.seg_len:
+ idx = (average_dur - 1) // 2
+ idx += i * average_dur
+ elif average_dur >= 1:
+ idx += i * average_dur
+ else:
+ idx = i
+
+ for jj in range(idx, idx + self.seg_len):
+ imgbuf = frames[int(jj % results['frames_len'])]
+ img = self._get(imgbuf)
+ imgs.append(img)
+ results['backend'] = self.backend
+ results['imgs'] = imgs
+
+ return results
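+
+
+# NOTE(editor): not part of the original patch. A minimal sketch of the results
+# dict Sampler expects for the 'frame' format, inferred from the keys read in
+# _get(); the path and filename pattern are placeholders:
+#
+#     results = {'format': 'frame',
+#                'frame_dir': '/path/to/extracted_frames',
+#                'suffix': 'img_{:05}.jpg',
+#                'frames_len': 300}
+#     results = Sampler(num_seg=8, seg_len=1)(results)
+#     # results['imgs'] now holds 8 PIL images sampled across the video.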
diff --git a/paddlevideo/loader/pipelines/sample_ava.py b/paddlevideo/loader/pipelines/sample_ava.py
new file mode 100644
index 0000000000000000000000000000000000000000..39e90a2166531a6db2a62c5bb33bd3d4ab1b3914
--- /dev/null
+++ b/paddlevideo/loader/pipelines/sample_ava.py
@@ -0,0 +1,375 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import random
+from PIL import Image
+from ..registry import PIPELINES
+import os
+import numpy as np
+import io
+import os.path as osp
+from abc import ABCMeta, abstractmethod
+import cv2
+from cv2 import IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_UNCHANGED
+import inspect
+
+imread_backend = 'cv2'
+imread_flags = {
+ 'color': IMREAD_COLOR,
+ 'grayscale': IMREAD_GRAYSCALE,
+ 'unchanged': IMREAD_UNCHANGED
+}
+
+
+@PIPELINES.register()
+class SampleFrames:
+ """Sample frames from the video. """
+
+ def __init__(self,
+ clip_len,
+ frame_interval=1,
+ num_clips=1,
+ temporal_jitter=False,
+ twice_sample=False,
+ out_of_bound_opt='loop',
+ test_mode=False):
+ self.clip_len = clip_len
+ self.frame_interval = frame_interval
+ self.num_clips = num_clips
+ self.temporal_jitter = temporal_jitter
+ self.twice_sample = twice_sample
+ self.out_of_bound_opt = out_of_bound_opt
+ self.test_mode = test_mode
+ assert self.out_of_bound_opt in ['loop', 'repeat_last']
+
+ def _get_train_clips(self, num_frames):
+ """Get clip offsets in train mode. """
+ ori_clip_len = self.clip_len * self.frame_interval
+ avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips
+ if avg_interval > 0:
+ base_offsets = np.arange(self.num_clips) * avg_interval
+ clip_offsets = base_offsets + np.random.randint(
+ avg_interval, size=self.num_clips)
+ elif num_frames > max(self.num_clips, ori_clip_len):
+ clip_offsets = np.sort(
+ np.random.randint(
+ num_frames - ori_clip_len + 1, size=self.num_clips))
+ elif avg_interval == 0:
+ ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips
+ clip_offsets = np.around(np.arange(self.num_clips) * ratio)
+ else:
+            clip_offsets = np.zeros((self.num_clips, ), dtype=np.int64)
+ return clip_offsets
+
+ def _get_test_clips(self, num_frames):
+ """Get clip offsets in test mode. """
+ ori_clip_len = self.clip_len * self.frame_interval
+ avg_interval = (num_frames - ori_clip_len + 1) / float(self.num_clips)
+ if num_frames > ori_clip_len - 1:
+ base_offsets = np.arange(self.num_clips) * avg_interval
+            clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int64)
+ if self.twice_sample:
+ clip_offsets = np.concatenate([clip_offsets, base_offsets])
+ else:
+            clip_offsets = np.zeros((self.num_clips, ), dtype=np.int64)
+ return clip_offsets
+
+ def _sample_clips(self, num_frames):
+ """Choose clip offsets for the video in a given mode. """
+ if self.test_mode:
+ clip_offsets = self._get_test_clips(num_frames)
+ else:
+ clip_offsets = self._get_train_clips(num_frames)
+ return clip_offsets
+
+ def __call__(self, results):
+ """Perform the SampleFrames loading. """
+ total_frames = results['total_frames']
+ clip_offsets = self._sample_clips(total_frames)
+ frame_inds = clip_offsets[:, None] + np.arange(
+ self.clip_len)[None, :] * self.frame_interval
+ frame_inds = np.concatenate(frame_inds)
+ if self.temporal_jitter:
+ perframe_offsets = np.random.randint(
+ self.frame_interval, size=len(frame_inds))
+ frame_inds += perframe_offsets
+ frame_inds = frame_inds.reshape((-1, self.clip_len))
+ if self.out_of_bound_opt == 'loop':
+ frame_inds = np.mod(frame_inds, total_frames)
+ elif self.out_of_bound_opt == 'repeat_last':
+ safe_inds = frame_inds < total_frames
+ unsafe_inds = 1 - safe_inds
+ last_ind = np.max(safe_inds * frame_inds, axis=1)
+ new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T)
+ frame_inds = new_inds
+ else:
+ raise ValueError('Illegal out_of_bound option.')
+ start_index = results['start_index']
+ frame_inds = np.concatenate(frame_inds) + start_index
+        results['frame_inds'] = frame_inds.astype(np.int64)
+ results['clip_len'] = self.clip_len
+ results['frame_interval'] = self.frame_interval
+ results['num_clips'] = self.num_clips
+ return results
+
+ def __repr__(self):
+ repr_str = (f'{self.__class__.__name__}('
+ f'clip_len={self.clip_len}, '
+ f'frame_interval={self.frame_interval}, '
+ f'num_clips={self.num_clips}, '
+ f'temporal_jitter={self.temporal_jitter}, '
+ f'twice_sample={self.twice_sample}, '
+ f'out_of_bound_opt={self.out_of_bound_opt}, '
+ f'test_mode={self.test_mode})')
+ return repr_str
+
+class BaseStorageBackend(metaclass=ABCMeta):
+ """Abstract class of storage backends. """
+
+ @abstractmethod
+ def get(self, filepath):
+ pass
+
+ @abstractmethod
+ def get_text(self, filepath):
+ pass
+
+class HardDiskBackend(BaseStorageBackend):
+ """Raw hard disks storage backend."""
+
+ def get(self, filepath):
+ filepath = str(filepath)
+ with open(filepath, 'rb') as f:
+ value_buf = f.read()
+ return value_buf
+
+ def get_text(self, filepath):
+ filepath = str(filepath)
+ with open(filepath, 'r') as f:
+ value_buf = f.read()
+ return value_buf
+
+class FileClient:
+ """A general file client to access files in different backend. """
+
+ _backends = {
+ 'disk': HardDiskBackend,
+ }
+
+ def __init__(self, backend='disk', **kwargs):
+ if backend not in self._backends:
+ raise ValueError(
+ f'Backend {backend} is not supported. Currently supported ones'
+ f' are {list(self._backends.keys())}')
+ self.backend = backend
+ self.client = self._backends[backend](**kwargs)
+
+ @classmethod
+ def _register_backend(cls, name, backend, force=False):
+ if not isinstance(name, str):
+ raise TypeError('the backend name should be a string, '
+ f'but got {type(name)}')
+ if not inspect.isclass(backend):
+ raise TypeError(
+ f'backend should be a class but got {type(backend)}')
+ if not issubclass(backend, BaseStorageBackend):
+ raise TypeError(
+ f'backend {backend} is not a subclass of BaseStorageBackend')
+ if not force and name in cls._backends:
+ raise KeyError(
+ f'{name} is already registered as a storage backend, '
+ 'add "force=True" if you want to override it')
+
+ cls._backends[name] = backend
+
+ @classmethod
+ def register_backend(cls, name, backend=None, force=False):
+ """Register a backend to FileClient. """
+
+ if backend is not None:
+ cls._register_backend(name, backend, force=force)
+ return
+
+ def _register(backend_cls):
+ cls._register_backend(name, backend_cls, force=force)
+ return backend_cls
+
+ return _register
+
+ def get(self, filepath):
+ return self.client.get(filepath)
+
+ def get_text(self, filepath):
+ return self.client.get_text(filepath)
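+
+# NOTE(editor): not part of the original patch. Minimal usage sketch of
+# FileClient with the default 'disk' backend ('frame_0001.jpg' is a
+# placeholder path):
+#
+#     client = FileClient('disk')
+#     img_bytes = client.get('frame_0001.jpg')  # raw bytes, decoded later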
+
+@PIPELINES.register()
+class RawFrameDecode:
+ """Load and decode frames with given indices. """
+
+ def __init__(self, io_backend='disk', decoding_backend='cv2', **kwargs):
+ self.io_backend = io_backend
+ self.decoding_backend = decoding_backend
+ self.kwargs = kwargs
+ self.file_client = None
+
+    def _pillow2array(self, img, flag='color', channel_order='bgr'):
+ """Convert a pillow image to numpy array. """
+
+ channel_order = channel_order.lower()
+ if channel_order not in ['rgb', 'bgr']:
+ raise ValueError('channel order must be either "rgb" or "bgr"')
+
+ if flag == 'unchanged':
+ array = np.array(img)
+ if array.ndim >= 3 and array.shape[2] >= 3: # color image
+ array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR
+ else:
+ # If the image mode is not 'RGB', convert it to 'RGB' first.
+ if img.mode != 'RGB':
+ if img.mode != 'LA':
+ # Most formats except 'LA' can be directly converted to RGB
+ img = img.convert('RGB')
+ else:
+ # When the mode is 'LA', the default conversion will fill in
+ # the canvas with black, which sometimes shadows black objects
+ # in the foreground.
+ #
+ # Therefore, a random color (124, 117, 104) is used for canvas
+ img_rgba = img.convert('RGBA')
+ img = Image.new('RGB', img_rgba.size, (124, 117, 104))
+ img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha
+ if flag == 'color':
+ array = np.array(img)
+ if channel_order != 'rgb':
+ array = array[:, :, ::-1] # RGB to BGR
+ elif flag == 'grayscale':
+ img = img.convert('L')
+ array = np.array(img)
+ else:
+ raise ValueError(
+ 'flag must be "color", "grayscale" or "unchanged", '
+ f'but got {flag}')
+ return array
+
+    def _imfrombytes(self, content, flag='color', channel_order='bgr'):
+ """Read an image from bytes. """
+
+ img_np = np.frombuffer(content, np.uint8)
+ flag = imread_flags[flag] if isinstance(flag, str) else flag
+ img = cv2.imdecode(img_np, flag)
+ if flag == IMREAD_COLOR and channel_order == 'rgb':
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
+ return img
+
+ def __call__(self, results):
+ """Perform the ``RawFrameDecode`` to pick frames given indices.
+
+ Args:
+ results (dict): The resulting dict to be modified and passed
+ to the next transform in pipeline.
+ """
+ # mmcv.use_backend(self.decoding_backend)
+
+ directory = results['frame_dir']
+ suffix = results['suffix']
+ #modality = results['modality']
+
+ if self.file_client is None:
+ self.file_client = FileClient(self.io_backend, **self.kwargs)
+
+ imgs = list()
+
+ if results['frame_inds'].ndim != 1:
+ results['frame_inds'] = np.squeeze(results['frame_inds'])
+
+ offset = results.get('offset', 0)
+
+ for frame_idx in results['frame_inds']:
+ frame_idx += offset
+ filepath = osp.join(directory, suffix.format(frame_idx))
+            img_bytes = self.file_client.get(filepath)  # read the image file as raw bytes
+ # Get frame with channel order RGB directly.
+
+ cur_frame = self._imfrombytes(img_bytes, channel_order='rgb')
+ imgs.append(cur_frame)
+
+ results['imgs'] = imgs
+ results['original_shape'] = imgs[0].shape[:2]
+ results['img_shape'] = imgs[0].shape[:2]
+
+ # we resize the gt_bboxes and proposals to their real scale
+ h, w = results['img_shape']
+ scale_factor = np.array([w, h, w, h])
+ if 'gt_bboxes' in results:
+ gt_bboxes = results['gt_bboxes']
+ gt_bboxes_new = (gt_bboxes * scale_factor).astype(np.float32)
+ results['gt_bboxes'] = gt_bboxes_new
+ if 'proposals' in results and results['proposals'] is not None:
+ proposals = results['proposals']
+ proposals = (proposals * scale_factor).astype(np.float32)
+ results['proposals'] = proposals
+ return results
+
+ def __repr__(self):
+ repr_str = (f'{self.__class__.__name__}('
+ f'io_backend={self.io_backend}, '
+ f'decoding_backend={self.decoding_backend})')
+ return repr_str
+
+@PIPELINES.register()
+class SampleAVAFrames(SampleFrames):
+
+ def __init__(self, clip_len, frame_interval=2, test_mode=False):
+
+ super().__init__(clip_len, frame_interval, test_mode=test_mode)
+
+ def _get_clips(self, center_index, skip_offsets, shot_info):
+ start = center_index - (self.clip_len // 2) * self.frame_interval
+ end = center_index + ((self.clip_len + 1) // 2) * self.frame_interval
+ frame_inds = list(range(start, end, self.frame_interval))
+ frame_inds = frame_inds + skip_offsets
+ frame_inds = np.clip(frame_inds, shot_info[0], shot_info[1] - 1)
+
+ return frame_inds
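+
+    # NOTE(editor): illustrative example, not part of the original patch: with
+    # the defaults clip_len=32 and frame_interval=2, _get_clips covers the range
+    # [center_index - 32, center_index + 32) with stride 2, i.e. 32 indices,
+    # adds a small random per-frame jitter and clips them to the shot boundaries.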
+
+ def __call__(self, results):
+ fps = results['fps']
+ timestamp = results['timestamp']
+ timestamp_start = results['timestamp_start']
+ shot_info = results['shot_info']
+
+        # delta = (timestamp - timestamp_start) is this frame's offset in seconds
+        # from the start of the 15-minute video, so center_index = fps * delta is
+        # its offset in frames; the +1 keeps later sampling indices non-negative.
+        # Frames are then sampled in a clip centered at center_index.
+        center_index = fps * (timestamp - timestamp_start) + 1
+
+ skip_offsets = np.random.randint(
+ -self.frame_interval // 2, (self.frame_interval + 1) // 2,
+ size=self.clip_len)
+ frame_inds = self._get_clips(center_index, skip_offsets, shot_info)
+
+        results['frame_inds'] = np.array(frame_inds, dtype=np.int64)
+ results['clip_len'] = self.clip_len
+ results['frame_interval'] = self.frame_interval
+ results['num_clips'] = 1
+ results['crop_quadruple'] = np.array([0, 0, 1, 1], dtype=np.float32)
+ return results
+
+ def __repr__(self):
+ repr_str = (f'{self.__class__.__name__}('
+ f'clip_len={self.clip_len}, '
+ f'frame_interval={self.frame_interval}, '
+ f'test_mode={self.test_mode})')
+ return repr_str
+
diff --git a/paddlevideo/loader/pipelines/segmentation.py b/paddlevideo/loader/pipelines/segmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..247144267cc3a246754d8f4568bea346b485f116
--- /dev/null
+++ b/paddlevideo/loader/pipelines/segmentation.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from PIL import Image
+import copy
+import cv2
+from ..registry import PIPELINES
+
+
+@PIPELINES.register()
+class MultiRestrictSize(object):
+ def __init__(self,
+ min_size=None,
+ max_size=800,
+ flip=False,
+ multi_scale=[1.3]):
+ self.min_size = min_size
+ self.max_size = max_size
+ self.multi_scale = multi_scale
+ self.flip = flip
+        assert (min_size is None) or (max_size is None)
+
+ def __call__(self, sample):
+ samples = []
+ image = sample['current_img']
+ h, w = image.shape[:2]
+ for scale in self.multi_scale:
+ # Fixed range of scales
+ sc = None
+ # Align short edge
+ if not (self.min_size is None):
+ if h > w:
+ short_edge = w
+ else:
+ short_edge = h
+ if short_edge > self.min_size:
+ sc = float(self.min_size) / short_edge
+ else:
+ if h > w:
+ long_edge = h
+ else:
+ long_edge = w
+ if long_edge > self.max_size:
+ sc = float(self.max_size) / long_edge
+
+ if sc is None:
+ new_h = h
+ new_w = w
+ else:
+ new_h = sc * h
+ new_w = sc * w
+ new_h = int(new_h * scale)
+ new_w = int(new_w * scale)
+
+ if (new_h - 1) % 16 != 0:
+ new_h = int(np.around((new_h - 1) / 16.) * 16 + 1)
+ if (new_w - 1) % 16 != 0:
+ new_w = int(np.around((new_w - 1) / 16.) * 16 + 1)
+
+ if new_h == h and new_w == w:
+ samples.append(sample)
+ else:
+ new_sample = {}
+ for elem in sample.keys():
+ if 'meta' in elem:
+ new_sample[elem] = sample[elem]
+ continue
+ tmp = sample[elem]
+ if 'label' in elem:
+ new_sample[elem] = sample[elem]
+ continue
+ else:
+ flagval = cv2.INTER_CUBIC
+ tmp = cv2.resize(tmp,
+ dsize=(new_w, new_h),
+ interpolation=flagval)
+ new_sample[elem] = tmp
+ samples.append(new_sample)
+
+ if self.flip:
+ now_sample = samples[-1]
+ new_sample = {}
+ for elem in now_sample.keys():
+ if 'meta' in elem:
+ new_sample[elem] = now_sample[elem].copy()
+ new_sample[elem]['flip'] = True
+ continue
+ tmp = now_sample[elem]
+ tmp = tmp[:, ::-1].copy()
+ new_sample[elem] = tmp
+ samples.append(new_sample)
+
+ return samples
+
+
+@PIPELINES.register()
+class MultiNorm(object):
+ def __call__(self, samples):
+ for idx in range(len(samples)):
+ sample = samples[idx]
+ for elem in sample.keys():
+ if 'meta' in elem:
+ continue
+ tmp = sample[elem]
+ if tmp is None:
+ continue
+
+ if tmp.ndim == 2:
+ tmp = tmp[:, :, np.newaxis]
+ else:
+ tmp = tmp / 255.
+ tmp -= (0.485, 0.456, 0.406)
+ tmp /= (0.229, 0.224, 0.225)
+
+ tmp = tmp.transpose((2, 0, 1))
+ samples[idx][elem] = tmp
+
+ return samples
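+
+
+# NOTE(editor): not part of the original patch. A hedged usage sketch: the two
+# ops above are applied back to back at inference time. MultiRestrictSize turns
+# a single sample into a list of multi-scale (optionally flipped) samples, and
+# MultiNorm scales 3-channel entries to [0, 1], normalizes them with ImageNet
+# mean/std and transposes them to CHW. 'frame_bgr' is a placeholder HxWx3 array.
+#
+#     sample = {'current_img': frame_bgr}
+#     samples = MultiRestrictSize(max_size=800, flip=True)(sample)
+#     samples = MultiNorm()(samples)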
diff --git a/paddlevideo/loader/pipelines/segmentation_pipline.py b/paddlevideo/loader/pipelines/segmentation_pipline.py
new file mode 100644
index 0000000000000000000000000000000000000000..dda6deec4043e8caa117ddc757c90808f329e5e2
--- /dev/null
+++ b/paddlevideo/loader/pipelines/segmentation_pipline.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+
+import os
+import numpy as np
+import random
+import paddle
+from ..registry import PIPELINES
+"""
+pipeline ops for Action Segmentation Dataset.
+"""
+
+
+@PIPELINES.register()
+class SegmentationSampler(object):
+
+ def __init__(self, sample_rate):
+ self.sample_rate = sample_rate
+
+ def __call__(self, results):
+ for key, data in results.items():
+ if len(data.shape) == 1:
+ data = data[::self.sample_rate]
+ results[key] = copy.deepcopy(data)
+ else:
+ data = data[:, ::self.sample_rate]
+ results[key] = copy.deepcopy(data)
+ return results
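+
+
+# NOTE(editor): the block below is not part of the original patch; it is a
+# minimal runnable sketch of the temporal down-sampling done by
+# SegmentationSampler (the dict keys and shapes are illustrative assumptions).
+if __name__ == "__main__":
+    feats = np.zeros((2048, 1000), dtype='float32')  # (C, T) frame features
+    labels = np.zeros((1000, ), dtype='int64')  # per-frame labels
+    out = SegmentationSampler(sample_rate=4)({'video_feat': feats,
+                                              'video_gt': labels})
+    print(out['video_feat'].shape, out['video_gt'].shape)  # (2048, 250) (250,)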
diff --git a/paddlevideo/loader/pipelines/skeleton_pipeline.py b/paddlevideo/loader/pipelines/skeleton_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..105a3eef4db45e373b6b5fd9dabbe9dddc48256e
--- /dev/null
+++ b/paddlevideo/loader/pipelines/skeleton_pipeline.py
@@ -0,0 +1,280 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+import paddle.nn.functional as F
+import random
+import paddle
+from ..registry import PIPELINES
+"""pipeline ops for Activity Net.
+"""
+
+
+@PIPELINES.register()
+class AutoPadding(object):
+ """
+ Sample or Padding frame skeleton feature.
+ Args:
+ window_size: int, temporal size of skeleton feature.
+ random_pad: bool, whether do random padding when frame length < window size. Default: False.
+ """
+
+ def __init__(self, window_size, random_pad=False):
+ self.window_size = window_size
+ self.random_pad = random_pad
+
+ def get_frame_num(self, data):
+ C, T, V, M = data.shape
+ for i in range(T - 1, -1, -1):
+ tmp = np.sum(data[:, i, :, :])
+ if tmp > 0:
+ T = i + 1
+ break
+ return T
+
+ def __call__(self, results):
+ data = results['data']
+
+ C, T, V, M = data.shape
+ T = self.get_frame_num(data)
+ if T == self.window_size:
+ data_pad = data[:, :self.window_size, :, :]
+ elif T < self.window_size:
+ begin = random.randint(0, self.window_size -
+ T) if self.random_pad else 0
+ data_pad = np.zeros((C, self.window_size, V, M))
+ data_pad[:, begin:begin + T, :, :] = data[:, :T, :, :]
+ else:
+ if self.random_pad:
+ index = np.random.choice(T, self.window_size,
+ replace=False).astype('int64')
+ else:
+ index = np.linspace(0, T, self.window_size).astype("int64")
+ data_pad = data[:, index, :, :]
+
+ results['data'] = data_pad
+ return results
+
+
+@PIPELINES.register()
+class SkeletonNorm(object):
+ """
+ Normalize skeleton feature.
+ Args:
+        axis: dimensions of the vertex coordinates. 2 for (x,y), 3 for (x,y,z). Default: 2.
+ """
+
+ def __init__(self, axis=2, squeeze=False):
+ self.axis = axis
+ self.squeeze = squeeze
+
+ def __call__(self, results):
+ data = results['data']
+
+ # Centralization
+ data = data - data[:, :, 8:9, :]
+ data = data[:self.axis, :, :, :] # get (x,y) from (x,y, acc)
+ C, T, V, M = data.shape
+ if self.squeeze:
+ data = data.reshape((C, T, V)) # M = 1
+
+ results['data'] = data.astype('float32')
+ if 'label' in results:
+ label = results['label']
+ results['label'] = np.expand_dims(label, 0).astype('int64')
+ return results
+
+
+@PIPELINES.register()
+class Iden(object):
+ """
+    Identity pipeline: casts data to float32 and optionally expands the label dims.
+ """
+
+ def __init__(self, label_expand=True):
+ self.label_expand = label_expand
+
+ def __call__(self, results):
+ data = results['data']
+ results['data'] = data.astype('float32')
+
+ if 'label' in results and self.label_expand:
+ label = results['label']
+ results['label'] = np.expand_dims(label, 0).astype('int64')
+ return results
+
+
+@PIPELINES.register()
+class RandomRotation(object):
+ """
+    Randomly rotate the skeleton.
+    Args:
+        argument: bool, whether to apply the rotation augmentation.
+        theta: float, maximum rotation angle (in radians) sampled per axis.
+ """
+
+ def __init__(self, argument, theta=0.3):
+ self.theta = theta
+ self.argument = argument
+
+ def _rot(self, rot):
+ """
+ rot: T,3
+ """
+ cos_r, sin_r = np.cos(rot), np.sin(rot) # T,3
+ zeros = np.zeros((rot.shape[0], 1)) # T,1
+ ones = np.ones((rot.shape[0], 1)) # T,1
+
+ r1 = np.stack((ones, zeros, zeros), axis=-1) # T,1,3
+ rx2 = np.stack((zeros, cos_r[:, 0:1], sin_r[:, 0:1]), axis=-1) # T,1,3
+ rx3 = np.stack((zeros, -sin_r[:, 0:1], cos_r[:, 0:1]), axis=-1) # T,1,3
+ rx = np.concatenate((r1, rx2, rx3), axis=1) # T,3,3
+
+ ry1 = np.stack((cos_r[:, 1:2], zeros, -sin_r[:, 1:2]), axis=-1)
+ r2 = np.stack((zeros, ones, zeros), axis=-1)
+ ry3 = np.stack((sin_r[:, 1:2], zeros, cos_r[:, 1:2]), axis=-1)
+ ry = np.concatenate((ry1, r2, ry3), axis=1)
+
+ rz1 = np.stack((cos_r[:, 2:3], sin_r[:, 2:3], zeros), axis=-1)
+ r3 = np.stack((zeros, zeros, ones), axis=-1)
+ rz2 = np.stack((-sin_r[:, 2:3], cos_r[:, 2:3], zeros), axis=-1)
+ rz = np.concatenate((rz1, rz2, r3), axis=1)
+
+ rot = np.matmul(np.matmul(rz, ry), rx)
+ return rot
+
+ def __call__(self, results):
+ # C,T,V,M
+ data = results['data']
+ if self.argument:
+ C, T, V, M = data.shape
+ data_numpy = np.transpose(data, (1, 0, 2, 3)).conjugate().reshape(
+ T, C, V * M) # T,3,V*M
+ rot = np.random.uniform(-self.theta, self.theta, 3)
+ rot = np.stack([
+ rot,
+ ] * T, axis=0)
+ rot = self._rot(rot) # T,3,3
+ data_numpy = np.matmul(rot, data_numpy)
+ data_numpy = data_numpy.reshape(T, C, V, M)
+ data_numpy = np.transpose(data_numpy, (1, 0, 2, 3))
+ data = data_numpy
+ results['data'] = data.astype(np.float32)
+ return results
+
+
+@PIPELINES.register()
+class SketeonCropSample(object):
+ """
+    Skeleton crop sampler.
+    Args:
+        crop_model: str, crop mode, supported: ['center'].
+        p_interval: list, interval of valid-frame ratios kept when cropping.
+        window_size: int, temporal size of the output after resizing.
+ """
+
+ def __init__(self, window_size, crop_model='center', p_interval=1):
+ assert crop_model in ['center'], "Don't support :" + crop_model
+
+ self.crop_model = crop_model
+ self.window_size = window_size
+ self.p_interval = p_interval
+
+ def __call__(self, results):
+ if self.crop_model == 'center':
+ # input: C,T,V,M
+ data = results['data']
+ valid_frame_num = np.sum(data.sum(0).sum(-1).sum(-1) != 0)
+
+ C, T, V, M = data.shape
+ begin = 0
+ end = valid_frame_num
+ valid_size = end - begin
+
+ #crop
+ if len(self.p_interval) == 1:
+ p = self.p_interval[0]
+ bias = int((1 - p) * valid_size / 2)
+ data = data[:, begin + bias:end - bias, :, :] # center_crop
+ cropped_length = data.shape[1]
+ else:
+ p = np.random.rand(1) * (self.p_interval[1] - self.p_interval[0]
+ ) + self.p_interval[0]
+ # constraint cropped_length lower bound as 64
+ cropped_length = np.minimum(
+ np.maximum(int(np.floor(valid_size * p)), 64), valid_size)
+ bias = np.random.randint(0, valid_size - cropped_length + 1)
+ data = data[:, begin + bias:begin + bias + cropped_length, :, :]
+
+ # resize
+ data = np.transpose(data, (0, 2, 3, 1)).conjugate().reshape(
+ C * V * M, cropped_length)
+ data = data[None, None, :, :]
+ # could perform both up sample and down sample
+ data_tensor = paddle.to_tensor(data)
+ data_tensor = F.interpolate(data_tensor,
+ size=(C * V * M, self.window_size),
+ mode='bilinear',
+ align_corners=False).squeeze()
+ data = paddle.transpose(
+ paddle.reshape(data_tensor, (C, V, M, self.window_size)),
+ (0, 3, 1, 2)).numpy()
+ else:
+ raise NotImplementedError
+ results['data'] = data
+ return results
+
+
+@PIPELINES.register()
+class SketeonModalityTransform(object):
+ """
+    Skeleton modality transform: converts joint data to bone and/or motion modalities.
+    Args:
+        bone: bool, whether to convert joint coordinates into bone vectors.
+        motion: bool, whether to convert features into frame-wise temporal differences.
+        joint: bool, if True the joint data is returned unchanged. Default: True.
+        graph: str, skeleton graph layout; only 'ntu_rgb_d' is supported.
+ """
+
+ def __init__(self, bone, motion, joint=True, graph='ntu_rgb_d'):
+
+ self.joint = joint
+ self.bone = bone
+ self.motion = motion
+ self.graph = graph
+ if self.graph == "ntu_rgb_d":
+ self.bone_pairs = ((1, 2), (2, 21), (3, 21), (4, 3), (5, 21),
+ (6, 5), (7, 6), (8, 7), (9, 21), (10, 9),
+ (11, 10), (12, 11), (13, 1), (14, 13), (15, 14),
+ (16, 15), (17, 1), (18, 17), (19, 18), (20, 19),
+ (22, 23), (21, 21), (23, 8), (24, 25), (25, 12))
+ else:
+ raise NotImplementedError
+
+ def __call__(self, results):
+ if self.joint:
+ return results
+ data_numpy = results['data']
+ if self.bone:
+ bone_data_numpy = np.zeros_like(data_numpy)
+ for v1, v2 in self.bone_pairs:
+ bone_data_numpy[:, :, v1 -
+ 1] = data_numpy[:, :, v1 -
+ 1] - data_numpy[:, :, v2 - 1]
+ data_numpy = bone_data_numpy
+ if self.motion:
+ data_numpy[:, :-1] = data_numpy[:, 1:] - data_numpy[:, :-1]
+ data_numpy[:, -1] = 0
+ results['data'] = data_numpy
+ return results
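+
+
+# NOTE(editor): the block below is not part of the original patch; it is a
+# minimal runnable sketch of the skeleton pipeline on random data. The shapes
+# (C=3, T=150, V=25 joints, M=1 body) and window_size=350 are illustrative
+# assumptions; SkeletonNorm centers the data on joint index 8.
+if __name__ == "__main__":
+    fake = {'data': np.random.rand(3, 150, 25, 1).astype('float32'), 'label': 5}
+    fake = AutoPadding(window_size=350)(fake)  # pad 150 frames up to 350
+    fake = SkeletonNorm(axis=2, squeeze=True)(fake)  # keep (x, y), drop M dim
+    print(fake['data'].shape, fake['label'])  # (2, 350, 25) [5]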
diff --git a/paddlevideo/loader/registry.py b/paddlevideo/loader/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..add663104d66b0f9ceadc2195f654d880b57f998
--- /dev/null
+++ b/paddlevideo/loader/registry.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..utils import Registry
+
+PIPELINES = Registry("pipeline")
+DATASETS = Registry("datasets")
diff --git a/paddlevideo/metrics/ActivityNet/__init__.py b/paddlevideo/metrics/ActivityNet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eefabbd72b3541cd3d6daeab01811065a4c9bd82
--- /dev/null
+++ b/paddlevideo/metrics/ActivityNet/__init__.py
@@ -0,0 +1,3 @@
+from .anet_prop import ANETproposal
+
+__all__ = ['ANETproposal']
diff --git a/paddlevideo/metrics/ActivityNet/anet_prop.py b/paddlevideo/metrics/ActivityNet/anet_prop.py
new file mode 100644
index 0000000000000000000000000000000000000000..411b164f980655dbcf713915b0df320426db9be7
--- /dev/null
+++ b/paddlevideo/metrics/ActivityNet/anet_prop.py
@@ -0,0 +1,359 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import json
+import numpy as np
+import pandas as pd
+import urllib.request as urllib2
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+class ANETproposal(object):
+ """
+    This class is used for calculating AR@N and AUC.
+    Code adapted from the ActivityNet GitHub repository (https://github.com/activitynet/ActivityNet.git).
+ """
+ GROUND_TRUTH_FIELDS = ['database', 'taxonomy', 'version']
+ PROPOSAL_FIELDS = ['results', 'version', 'external_data']
+ API = 'http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/challenge19/api.py'
+
+ def __init__(self,
+ ground_truth_filename=None,
+ proposal_filename=None,
+ ground_truth_fields=GROUND_TRUTH_FIELDS,
+ proposal_fields=PROPOSAL_FIELDS,
+ tiou_thresholds=np.linspace(0.5, 0.95, 10),
+ max_avg_nr_proposals=None,
+ subset='validation',
+ verbose=False,
+ check_status=True):
+ if not ground_truth_filename:
+ raise IOError('Please input a valid ground truth file.')
+ if not proposal_filename:
+ raise IOError('Please input a valid proposal file.')
+ self.subset = subset
+ self.tiou_thresholds = tiou_thresholds
+ self.max_avg_nr_proposals = max_avg_nr_proposals
+ self.verbose = verbose
+ self.gt_fields = ground_truth_fields
+ self.pred_fields = proposal_fields
+ self.recall = None
+ self.avg_recall = None
+ self.proposals_per_video = None
+ self.check_status = check_status
+ # Retrieve blocked videos from server.
+ if self.check_status:
+ self.blocked_videos = self.get_blocked_videos()
+ else:
+ self.blocked_videos = list()
+ # Import ground truth and proposals.
+ self.ground_truth, self.activity_index = self._import_ground_truth(
+ ground_truth_filename)
+ self.proposal = self._import_proposal(proposal_filename)
+
+ if self.verbose:
+ print('[INIT] Loaded annotations from {} subset.'.format(subset))
+ nr_gt = len(self.ground_truth)
+ print('\tNumber of ground truth instances: {}'.format(nr_gt))
+ nr_pred = len(self.proposal)
+ print('\tNumber of proposals: {}'.format(nr_pred))
+ print('\tFixed threshold for tiou score: {}'.format(
+ self.tiou_thresholds))
+
+ def _import_ground_truth(self, ground_truth_filename):
+ """
+ Reads ground truth file, checks if it is well formatted, and returns
+ the ground truth instances and the activity classes.
+
+ Parameters:
+ ground_truth_filename (str): full path to the ground truth json file.
+ Returns:
+ ground_truth (df): Data frame containing the ground truth instances.
+ activity_index (dict): Dictionary containing class index.
+ """
+ with open(ground_truth_filename, 'r') as fobj:
+ data = json.load(fobj)
+ # Checking format
+ if not all([field in data.keys() for field in self.gt_fields]):
+ raise IOError('Please input a valid ground truth file.')
+
+ # Read ground truth data.
+ activity_index, cidx = {}, 0
+ video_lst, t_start_lst, t_end_lst, label_lst = [], [], [], []
+ for videoid, v in data['database'].items():
+ if self.subset != v['subset']:
+ continue
+ if videoid in self.blocked_videos:
+ continue
+ for ann in v['annotations']:
+ if ann['label'] not in activity_index:
+ activity_index[ann['label']] = cidx
+ cidx += 1
+ video_lst.append(videoid)
+ t_start_lst.append(float(ann['segment'][0]))
+ t_end_lst.append(float(ann['segment'][1]))
+ label_lst.append(activity_index[ann['label']])
+
+ ground_truth = pd.DataFrame({
+ 'video-id': video_lst,
+ 't-start': t_start_lst,
+ 't-end': t_end_lst,
+ 'label': label_lst
+ })
+ return ground_truth, activity_index
+
+ def _import_proposal(self, proposal_filename):
+ """
+ Reads proposal file, checks if it is well formatted, and returns
+ the proposal instances.
+
+ Parameters:
+ proposal_filename (str): Full path to the proposal json file.
+ Returns:
+ proposal (df): Data frame containing the proposal instances.
+ """
+ with open(proposal_filename, 'r') as fobj:
+ data = json.load(fobj)
+ # Checking format...
+ if not all([field in data.keys() for field in self.pred_fields]):
+ raise IOError('Please input a valid proposal file.')
+
+ # Read predictions.
+ video_lst, t_start_lst, t_end_lst = [], [], []
+ score_lst = []
+ for videoid, v in data['results'].items():
+ if videoid in self.blocked_videos:
+ continue
+ for result in v:
+ video_lst.append(videoid)
+ t_start_lst.append(float(result['segment'][0]))
+ t_end_lst.append(float(result['segment'][1]))
+ score_lst.append(result['score'])
+ proposal = pd.DataFrame({
+ 'video-id': video_lst,
+ 't-start': t_start_lst,
+ 't-end': t_end_lst,
+ 'score': score_lst
+ })
+ return proposal
+
+ def evaluate(self):
+ """
+ Evaluates a proposal file. To measure the performance of a
+        method on the proposal task, we compute the area under the
+        average recall vs. average number of proposals per video curve.
+ """
+ recall, avg_recall, proposals_per_video = self.average_recall_vs_avg_nr_proposals(
+ self.ground_truth,
+ self.proposal,
+ max_avg_nr_proposals=self.max_avg_nr_proposals,
+ tiou_thresholds=self.tiou_thresholds)
+
+ area_under_curve = np.trapz(avg_recall, proposals_per_video)
+
+ if self.verbose:
+ print('[RESULTS] Performance on ActivityNet proposal task.')
+ with open("data/bmn/BMN_Test_results/auc_result.txt",
+ "a") as text_file:
+ text_file.write(
+ '\tArea Under the AR vs AN curve: {}% \n'.format(
+ 100. * float(area_under_curve) /
+ proposals_per_video[-1]))
+ print('\tArea Under the AR vs AN curve: {}%'.format(
+ 100. * float(area_under_curve) / proposals_per_video[-1]))
+
+ self.recall = recall
+ self.avg_recall = avg_recall
+ self.proposals_per_video = proposals_per_video
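+
+    # NOTE(editor): not part of the original patch. A hedged usage sketch with
+    # placeholder file names; check_status=False skips the blocked-video API call.
+    #
+    #     anet = ANETproposal('ground_truth.json', 'proposals.json',
+    #                         max_avg_nr_proposals=100, subset='validation',
+    #                         check_status=False)
+    #     anet.evaluate()
+    #     auc = np.trapz(anet.avg_recall, anet.proposals_per_video)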
+
+ def average_recall_vs_avg_nr_proposals(self,
+ ground_truth,
+ proposals,
+ max_avg_nr_proposals=None,
+ tiou_thresholds=np.linspace(
+ 0.5, 0.95, 10)):
+ """
+ Computes the average recall given an average number of
+ proposals per video.
+
+ Parameters:
+ ground_truth(df): Data frame containing the ground truth instances.
+ Required fields: ['video-id', 't-start', 't-end']
+            proposals(df): Data frame containing the proposal instances.
+                Required fields: ['video-id', 't-start', 't-end', 'score']
+ tiou_thresholds(1d-array | optional): array with tiou thresholds.
+
+ Returns:
+            recall(2d-array): recall[i,j] is the recall at the ith tiou threshold for the jth
+                average number of proposals per video.
+ average_recall(1d-array): recall averaged over a list of tiou threshold.
+ This is equivalent to recall.mean(axis=0).
+ proposals_per_video(1d-array): average number of proposals per video.
+ """
+
+ # Get list of videos.
+ video_lst = ground_truth['video-id'].unique()
+
+ if not max_avg_nr_proposals:
+ max_avg_nr_proposals = float(
+ proposals.shape[0]) / video_lst.shape[0]
+
+ ratio = max_avg_nr_proposals * float(
+ video_lst.shape[0]) / proposals.shape[0]
+
+ # Adaptation to query faster
+ ground_truth_gbvn = ground_truth.groupby('video-id')
+ proposals_gbvn = proposals.groupby('video-id')
+
+ # For each video, computes tiou scores among the retrieved proposals.
+ score_lst = []
+ total_nr_proposals = 0
+ for videoid in video_lst:
+ # Get ground-truth instances associated to this video.
+ ground_truth_videoid = ground_truth_gbvn.get_group(videoid)
+ this_video_ground_truth = ground_truth_videoid.loc[:, ['t-start', 't-end']].values
+
+ # Get proposals for this video.
+ try:
+ proposals_videoid = proposals_gbvn.get_group(videoid)
+ except KeyError:
+ n = this_video_ground_truth.shape[0]
+ score_lst.append(np.zeros((n, 1)))
+ continue
+
+ this_video_proposals = proposals_videoid.loc[:, ['t-start', 't-end']].values
+
+ if this_video_proposals.shape[0] == 0:
+ n = this_video_ground_truth.shape[0]
+ score_lst.append(np.zeros((n, 1)))
+ continue
+
+ # Sort proposals by score.
+ sort_idx = proposals_videoid['score'].argsort()[::-1]
+ this_video_proposals = this_video_proposals[sort_idx, :]
+
+ if this_video_proposals.ndim != 2:
+ this_video_proposals = np.expand_dims(this_video_proposals,
+ axis=0)
+ if this_video_ground_truth.ndim != 2:
+ this_video_ground_truth = np.expand_dims(
+ this_video_ground_truth, axis=0)
+
+ nr_proposals = np.minimum(
+ int(this_video_proposals.shape[0] * ratio),
+ this_video_proposals.shape[0])
+ total_nr_proposals += nr_proposals
+ this_video_proposals = this_video_proposals[:nr_proposals, :]
+
+ # Compute tiou scores.
+ tiou = self.wrapper_segment_iou(this_video_proposals,
+ this_video_ground_truth)
+ score_lst.append(tiou)
+
+ # Given that the length of the videos is really varied, we
+ # compute the number of proposals in terms of a ratio of the total
+ # proposals retrieved, i.e. average recall at a percentage of proposals
+ # retrieved per video.
+
+ # Computes average recall.
+ pcn_lst = np.arange(1, 101) / 100.0 * (max_avg_nr_proposals * float(
+ video_lst.shape[0]) / total_nr_proposals)
+ matches = np.empty((video_lst.shape[0], pcn_lst.shape[0]))
+ positives = np.empty(video_lst.shape[0])
+ recall = np.empty((tiou_thresholds.shape[0], pcn_lst.shape[0]))
+ # Iterates over each tiou threshold.
+ for ridx, tiou in enumerate(tiou_thresholds):
+
+ # Inspect positives retrieved per video at different
+ # number of proposals (percentage of the total retrieved).
+ for i, score in enumerate(score_lst):
+ # Total positives per video.
+ positives[i] = score.shape[0]
+ # Find proposals that satisfy the minimum tiou threshold.
+ true_positives_tiou = score >= tiou
+ # Get number of proposals as a percentage of total retrieved.
+ pcn_proposals = np.minimum(
+ (score.shape[1] * pcn_lst).astype(int), score.shape[1])
+
+ for j, nr_proposals in enumerate(pcn_proposals):
+ # Compute the number of matches for each percentage of the proposals
+ matches[i, j] = np.count_nonzero(
+ (true_positives_tiou[:, :nr_proposals]).sum(axis=1))
+
+ # Computes recall given the set of matches per video.
+ recall[ridx, :] = matches.sum(axis=0) / positives.sum()
+
+ # Recall is averaged.
+ avg_recall = recall.mean(axis=0)
+
+ # Get the average number of proposals per video.
+ proposals_per_video = pcn_lst * (float(total_nr_proposals) /
+ video_lst.shape[0])
+
+ return recall, avg_recall, proposals_per_video
+
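+ # Shape note (illustrative, not part of the original code): with the default
+ # tiou_thresholds (10 values) and pcn_lst (100 values), recall has shape
+ # (10, 100), while avg_recall and proposals_per_video have shape (100,).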
+ def get_blocked_videos(self, api=API):
+ # Local import keeps this Python 3 compatible (urllib2 was Python 2 only).
+ import urllib.request
+ api_url = '{}?action=get_blocked'.format(api)
+ req = urllib.request.Request(api_url)
+ response = urllib.request.urlopen(req)
+ return json.loads(response.read())
+
+ def wrapper_segment_iou(self, target_segments, candidate_segments):
+ """
+ Compute intersection over union between segments.
+ Parameters:
+ target_segments(nd-array): 2-dim array in format [m x 2:=[init, end]]
+ candidate_segments(nd-array): 2-dim array in format [n x 2:=[init, end]]
+ Returns:
+ tiou(nd-array): 2-dim array [n x m] with IOU ratio.
+ Note: It assumes that candidate-segments are more scarce than target-segments
+ """
+ if candidate_segments.ndim != 2 or target_segments.ndim != 2:
+ raise ValueError('Dimension of arguments is incorrect')
+
+ n, m = candidate_segments.shape[0], target_segments.shape[0]
+ tiou = np.empty((n, m))
+ for i in range(m):
+ tiou[:, i] = self.segment_iou(target_segments[i, :],
+ candidate_segments)
+
+ return tiou
+
+ def segment_iou(self, target_segment, candidate_segments):
+ """
+ Compute the temporal intersection over union between a
+ target segment and all the test segments.
+
+ Parameters:
+ target_segment(1d-array): Temporal target segment containing [starting, ending] times.
+ candidate_segments(2d-array): Temporal candidate segments containing N x [starting, ending] times.
+
+ Returns:
+ tiou(1d-array): Temporal intersection over union score of the N's candidate segments.
+ """
+ tt1 = np.maximum(target_segment[0], candidate_segments[:, 0])
+ tt2 = np.minimum(target_segment[1], candidate_segments[:, 1])
+ # Non-negative intersection (overlap) between target and candidate segments.
+ segments_intersection = (tt2 - tt1).clip(0)
+ # Segment union.
+ segments_union = (candidate_segments[:, 1] - candidate_segments[:, 0]) \
+ + (target_segment[1] - target_segment[0]) - segments_intersection
+ # Compute overlap as the ratio of the intersection
+ # over union of two segments.
+ tIoU = segments_intersection.astype(float) / segments_union
+ return tIoU
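+
+ # Worked example (illustrative, not part of the original module): for a target
+ # segment [1.0, 3.0] and a candidate segment [2.0, 5.0], the intersection is
+ # 1.0s and the union is 4.0s, so segment_iou returns 0.25.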
diff --git a/paddlevideo/metrics/__init__.py b/paddlevideo/metrics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f979f27fe1e6977ce35193824fbc48cf4268bf08
--- /dev/null
+++ b/paddlevideo/metrics/__init__.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .bmn_metric import BMNMetric
+from .build import build_metric
+from .center_crop_metric import CenterCropMetric
+from .depth_metric import DepthMetric
+from .msrvtt_metric import MSRVTTMetric
+from .multi_crop_metric import MultiCropMetric
+from .registry import METRIC
+from .skeleton_metric import SkeletonMetric
+from .transnetv2_metric import TransNetV2Metric
+from .youtube8m.eval_util import HitOneMetric
+from .segmentation_metric import SegmentationMetric
+from .ava_metric import AVAMetric
+from .vos_metric import VOSMetric
+from .center_crop_metric_MRI import CenterCropMetric_MRI
+
+__all__ = [
+ 'METRIC', 'build_metric', 'MultiCropMetric', 'BMNMetric',
+ 'CenterCropMetric', 'SkeletonMetric', 'HitOneMetric', 'TransNetV2Metric',
+ 'DepthMetric', 'MSRVTTMetric', 'VOSMetric', 'CenterCropMetric_MRI',
+ 'AVAMetric', 'SegmentationMetric'
+]
diff --git a/paddlevideo/metrics/ava_evaluation/README.md b/paddlevideo/metrics/ava_evaluation/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7414d0fbbd32d24d1e1b745d1df6a3fd2a2c2a43
--- /dev/null
+++ b/paddlevideo/metrics/ava_evaluation/README.md
@@ -0,0 +1,2 @@
+The code under this folder is from the official [ActivityNet repo](https://github.com/activitynet/ActivityNet).
+Some unused code has been removed to minimize the amount of code added.
diff --git a/paddlevideo/metrics/ava_evaluation/__init__.py b/paddlevideo/metrics/ava_evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/paddlevideo/metrics/ava_evaluation/metrics.py b/paddlevideo/metrics/ava_evaluation/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..13eb034697be05a4c8030e4c1f93ece73a5bab1e
--- /dev/null
+++ b/paddlevideo/metrics/ava_evaluation/metrics.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Functions for computing metrics like precision, recall, CorLoc and etc."""
+
+import numpy as np
+
+
+def compute_precision_recall(scores, labels, num_gt):
+ """Compute precision and recall.
+
+ Args:
+ scores: A float numpy array representing detection score
+ labels: A boolean numpy array representing true/false positive labels
+ num_gt: Number of ground truth instances
+
+ Raises:
+ ValueError: if the input is not of the correct format
+
+ Returns:
+ precision: Fraction of positive instances over detected ones. This
+ value is None if no ground truth labels are present.
+ recall: Fraction of detected positive instance over all positive
+ instances. This value is None if no ground truth labels are
+ present.
+ """
+ if (not isinstance(labels, np.ndarray) or labels.dtype != bool
+ or len(labels.shape) != 1):
+ raise ValueError('labels must be single dimension bool numpy array')
+
+ if not isinstance(scores, np.ndarray) or len(scores.shape) != 1:
+ raise ValueError('scores must be single dimension numpy array')
+
+ if num_gt < np.sum(labels):
+ raise ValueError(
+ 'Number of true positives must not exceed num_gt.')
+
+ if len(scores) != len(labels):
+ raise ValueError('scores and labels must be of the same size.')
+
+ if num_gt == 0:
+ return None, None
+
+ sorted_indices = np.argsort(scores)
+ sorted_indices = sorted_indices[::-1]
+ labels = labels.astype(int)
+ true_positive_labels = labels[sorted_indices]
+ false_positive_labels = 1 - true_positive_labels
+ cum_true_positives = np.cumsum(true_positive_labels)
+ cum_false_positives = np.cumsum(false_positive_labels)
+ precision = cum_true_positives.astype(float) / (
+ cum_true_positives + cum_false_positives)
+ recall = cum_true_positives.astype(float) / num_gt
+ return precision, recall
+
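+# Worked example (illustrative, not part of the original module):
+# compute_precision_recall(np.array([0.9, 0.8, 0.7]),
+#                          np.array([True, False, True]), num_gt=3)
+# returns precision [1.0, 0.5, 2/3] and recall [1/3, 1/3, 2/3].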
+
+def compute_average_precision(precision, recall):
+ """Compute Average Precision according to the definition in VOCdevkit.
+
+ Precision is modified to ensure that it does not decrease as recall
+ decreases.
+
+ Args:
+ precision: A float [N, 1] numpy array of precisions
+ recall: A float [N, 1] numpy array of recalls
+
+ Raises:
+ ValueError: if the input is not of the correct format
+
+ Returns:
+ average_precision: The area under the precision recall curve. NaN if
+ precision and recall are None.
+ """
+ if precision is None:
+ if recall is not None:
+ raise ValueError('If precision is None, recall must also be None')
+ return np.nan
+
+ if not isinstance(precision, np.ndarray) or not isinstance(
+ recall, np.ndarray):
+ raise ValueError('precision and recall must be numpy array')
+ if precision.dtype != np.float64 or recall.dtype != np.float64:
+ raise ValueError('input must be float numpy array.')
+ if len(precision) != len(recall):
+ raise ValueError('precision and recall must be of the same size.')
+ if not precision.size:
+ return 0.0
+ if np.amin(precision) < 0 or np.amax(precision) > 1:
+ raise ValueError('Precision must be in the range of [0, 1].')
+ if np.amin(recall) < 0 or np.amax(recall) > 1:
+ raise ValueError('recall must be in the range of [0, 1].')
+ if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)):
+ raise ValueError('recall must be a non-decreasing array')
+
+ recall = np.concatenate([[0], recall, [1]])
+ precision = np.concatenate([[0], precision, [0]])
+
+ # Preprocess precision to be a non-decreasing array
+ for i in range(len(precision) - 2, -1, -1):
+ precision[i] = np.maximum(precision[i], precision[i + 1])
+
+ indices = np.where(recall[1:] != recall[:-1])[0] + 1
+ average_precision = np.sum(
+ (recall[indices] - recall[indices - 1]) * precision[indices])
+ return average_precision
+
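+# Worked example (illustrative, not part of the original module): with
+# precision = np.array([1.0, 0.5]) and recall = np.array([0.5, 1.0]), the
+# interpolated precision envelope is [1.0, 0.5] and the returned AP is
+# 0.5 * 1.0 + 0.5 * 0.5 = 0.75.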
+
+def compute_cor_loc(num_gt_imgs_per_class,
+ num_images_correctly_detected_per_class):
+ """Compute CorLoc according to the definition in the following paper.
+
+ https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf
+
+ Returns nans if there are no ground truth images for a class.
+
+ Args:
+ num_gt_imgs_per_class: 1D array, representing number of images
+ containing at least one object instance of a particular class
+ num_images_correctly_detected_per_class: 1D array, representing the
+ number of images in which at least one object instance of a
+ particular class is correctly detected
+
+ Returns:
+ corloc_per_class: A float numpy array representing the CorLoc score of
+ each class
+ """
+ # Divide by zero expected for classes with no gt examples.
+ with np.errstate(divide='ignore', invalid='ignore'):
+ return np.where(
+ num_gt_imgs_per_class == 0, np.nan,
+ num_images_correctly_detected_per_class / num_gt_imgs_per_class)
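+
+# Illustrative example (not part of the original module):
+# compute_cor_loc(np.array([2, 0]), np.array([1, 0])) returns
+# array([0.5, nan]); classes with no ground truth images yield NaN.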
diff --git a/paddlevideo/metrics/ava_evaluation/np_box_list.py b/paddlevideo/metrics/ava_evaluation/np_box_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9b101e6f5d5dffdbf37c0986a93f1a3c71e0c48
--- /dev/null
+++ b/paddlevideo/metrics/ava_evaluation/np_box_list.py
@@ -0,0 +1,138 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Numpy BoxList classes and functions."""
+
+import numpy as np
+
+
+class BoxList:
+ """Box collection.
+
+ BoxList represents a list of bounding boxes as numpy array, where each
+ bounding box is represented as a row of 4 numbers,
+ [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within
+ a given list correspond to a single image.
+
+ Optionally, users can add additional related fields (such as
+ objectness/classification scores).
+ """
+
+ def __init__(self, data):
+ """Constructs box collection.
+
+ Args:
+ data: a numpy array of shape [N, 4] representing box coordinates
+
+ Raises:
+ ValueError: if bbox data is not a numpy array
+ ValueError: if invalid dimensions for bbox data
+ """
+ if not isinstance(data, np.ndarray):
+ raise ValueError('data must be a numpy array.')
+ if len(data.shape) != 2 or data.shape[1] != 4:
+ raise ValueError('Invalid dimensions for box data.')
+ if data.dtype != np.float32 and data.dtype != np.float64:
+ raise ValueError(
+ 'Invalid data type for box data: float is required.')
+ if not self._is_valid_boxes(data):
+ raise ValueError('Invalid box data. data must be a numpy array of '
+ 'N*[y_min, x_min, y_max, x_max]')
+ self.data = {'boxes': data}
+
+ def num_boxes(self):
+ """Return number of boxes held in collections."""
+ return self.data['boxes'].shape[0]
+
+ def get_extra_fields(self):
+ """Return all non-box fields."""
+ return [k for k in self.data if k != 'boxes']
+
+ def has_field(self, field):
+ return field in self.data
+
+ def add_field(self, field, field_data):
+ """Add data to a specified field.
+
+ Args:
+ field: a string parameter used to specify a related field to be
+ accessed.
+ field_data: a numpy array of [N, ...] representing the data
+ associated with the field.
+ Raises:
+ ValueError: if the field already exists or the dimensions of the
+ field data do not match the number of boxes.
+ """
+ if self.has_field(field):
+ raise ValueError('Field ' + field + ' already exists')
+ if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes():
+ raise ValueError('Invalid dimensions for field data')
+ self.data[field] = field_data
+
+ def get(self):
+ """Convenience function for accesssing box coordinates.
+
+ Returns:
+ a numpy array of shape [N, 4] representing box corners
+ """
+ return self.get_field('boxes')
+
+ def get_field(self, field):
+ """Accesses data associated with the specified field in the box
+ collection.
+
+ Args:
+ field: a string parameter used to specify a related field to be
+ accessed.
+
+ Returns:
+ a numpy 1-d array representing data of an associated field
+
+ Raises:
+ ValueError: if invalid field
+ """
+ if not self.has_field(field):
+ raise ValueError(f'field {field} does not exist')
+ return self.data[field]
+
+ def get_coordinates(self):
+ """Get corner coordinates of boxes.
+
+ Returns:
+ a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max]
+ """
+ box_coordinates = self.get()
+ y_min = box_coordinates[:, 0]
+ x_min = box_coordinates[:, 1]
+ y_max = box_coordinates[:, 2]
+ x_max = box_coordinates[:, 3]
+ return [y_min, x_min, y_max, x_max]
+
+ def _is_valid_boxes(self, data):
+ """Check whether data fullfills the format of N*[ymin, xmin, ymax,
+ xmin].
+
+ Args:
+ data: a numpy array of shape [N, 4] representing box coordinates
+
+ Returns:
+ a boolean indicating whether all ymax of boxes are equal or greater
+ than ymin, and all xmax of boxes are equal or greater than xmin.
+ """
+ if len(data):
+ for v in data:
+ if v[0] > v[2] or v[1] > v[3]:
+ return False
+ return True
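+
+# Minimal usage sketch (illustrative, not part of the original module); boxes
+# are float rows of [y_min, x_min, y_max, x_max]:
+#   boxlist = BoxList(np.array([[0.1, 0.1, 0.5, 0.5]], dtype=np.float32))
+#   boxlist.add_field('scores', np.array([0.9], dtype=np.float32))
+#   boxlist.num_boxes()          # -> 1
+#   boxlist.get_field('scores')  # -> array([0.9], dtype=float32)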
diff --git a/paddlevideo/metrics/ava_evaluation/np_box_ops.py b/paddlevideo/metrics/ava_evaluation/np_box_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..94e7d300c80195f8a0299fbf33000dba9719bb0d
--- /dev/null
+++ b/paddlevideo/metrics/ava_evaluation/np_box_ops.py
@@ -0,0 +1,98 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operations for [N, 4] numpy arrays representing bounding boxes.
+
+Example box operations that are supported:
+ * Areas: compute bounding box areas
+ * IOU: pairwise intersection-over-union scores
+"""
+
+import numpy as np
+
+
+def area(boxes):
+ """Computes area of boxes.
+
+ Args:
+ boxes: Numpy array with shape [N, 4] holding N boxes
+
+ Returns:
+ a numpy array with shape [N*1] representing box areas
+ """
+ return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+
+def intersection(boxes1, boxes2):
+ """Compute pairwise intersection areas between boxes.
+
+ Args:
+ boxes1: a numpy array with shape [N, 4] holding N boxes
+ boxes2: a numpy array with shape [M, 4] holding M boxes
+
+ Returns:
+ a numpy array with shape [N*M] representing pairwise intersection area
+ """
+ [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1)
+ [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1)
+
+ all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2))
+ all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2))
+ intersect_heights = np.maximum(
+ np.zeros(all_pairs_max_ymin.shape),
+ all_pairs_min_ymax - all_pairs_max_ymin)
+ all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2))
+ all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2))
+ intersect_widths = np.maximum(
+ np.zeros(all_pairs_max_xmin.shape),
+ all_pairs_min_xmax - all_pairs_max_xmin)
+ return intersect_heights * intersect_widths
+
+
+def iou(boxes1, boxes2):
+ """Computes pairwise intersection-over-union between box collections.
+
+ Args:
+ boxes1: a numpy array with shape [N, 4] holding N boxes.
+ boxes2: a numpy array with shape [M, 4] holding M boxes.
+
+ Returns:
+ a numpy array with shape [N, M] representing pairwise iou scores.
+ """
+ intersect = intersection(boxes1, boxes2)
+ area1 = area(boxes1)
+ area2 = area(boxes2)
+ union = (
+ np.expand_dims(area1, axis=1) + np.expand_dims(area2, axis=0) -
+ intersect)
+ return intersect / union
+
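+# Worked example (illustrative, not part of the original module): for
+# boxes1 = [[0., 0., 2., 2.]] and boxes2 = [[1., 1., 3., 3.]], the intersection
+# area is 1, both box areas are 4, so iou(...) returns [[1 / 7]].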
+
+def ioa(boxes1, boxes2):
+ """Computes pairwise intersection-over-area between box collections.
+
+ Intersection-over-area (ioa) between two boxes box1 and box2 is defined as
+ their intersection area over box2's area. Note that ioa is not symmetric,
+ that is, IOA(box1, box2) != IOA(box2, box1).
+
+ Args:
+ boxes1: a numpy array with shape [N, 4] holding N boxes.
+ boxes2: a numpy array with shape [M, 4] holding M boxes.
+
+ Returns:
+ a numpy array with shape [N, M] representing pairwise ioa scores.
+ """
+ intersect = intersection(boxes1, boxes2)
+ areas = np.expand_dims(area(boxes2), axis=0)
+ return intersect / areas
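+
+# Worked example (illustrative, not part of the original module): with the same
+# boxes as the iou example above, ioa(...) returns [[1 / 4]], since the
+# intersection area (1) is divided by the area of boxes2 (4).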
diff --git a/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py b/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9f00540f936c7e1993c41c0977527dcb125369e
--- /dev/null
+++ b/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py
@@ -0,0 +1,658 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""object_detection_evaluation module.
+
+ObjectDetectionEvaluation is a class which manages ground truth information of
+an object detection dataset, and computes frequently used detection metrics such
+as Precision, Recall, CorLoc of the provided detection results.
+It supports the following operations:
+1) Add ground truth information of images sequentially.
+2) Add detection result of images sequentially.
+3) Evaluate detection metrics on already inserted detection results.
+4) Write evaluation result into a pickle file for future processing or
+ visualization.
+
+Note: This module operates on numpy boxes and box lists.
+"""
+import collections
+import logging
+from abc import ABCMeta, abstractmethod
+
+import numpy as np
+
+from . import metrics, per_image_evaluation, standard_fields
+
+
+class DetectionEvaluator:
+ """Interface for object detection evalution classes.
+
+ Example usage of the Evaluator:
+ ------------------------------
+ evaluator = DetectionEvaluator(categories)
+
+ # Detections and groundtruth for image 1.
+ evaluator.add_single_groundtruth_image_info(...)
+ evaluator.add_single_detected_image_info(...)
+
+ # Detections and groundtruth for image 2.
+ evaluator.add_single_groundtruth_image_info(...)
+ evaluator.add_single_detected_image_info(...)
+
+ metrics_dict = evaluator.evaluate()
+ """
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self, categories):
+ """Constructor.
+
+ Args:
+ categories: A list of dicts, each of which has the following keys -
+ 'id': (required) an integer id uniquely identifying this
+ category.
+ 'name': (required) string representing category name e.g.,
+ 'cat', 'dog'.
+ """
+ self._categories = categories
+
+ @abstractmethod
+ def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
+ """Adds groundtruth for a single image to be used for evaluation.
+
+ Args:
+ image_id: A unique string/integer identifier for the image.
+ groundtruth_dict: A dictionary of groundtruth numpy arrays required
+ for evaluations.
+ """
+
+ @abstractmethod
+ def add_single_detected_image_info(self, image_id, detections_dict):
+ """Adds detections for a single image to be used for evaluation.
+
+ Args:
+ image_id: A unique string/integer identifier for the image.
+ detections_dict: A dictionary of detection numpy arrays required
+ for evaluation.
+ """
+
+ @abstractmethod
+ def evaluate(self):
+ """Evaluates detections and returns a dictionary of metrics."""
+
+ @abstractmethod
+ def clear(self):
+ """Clears the state to prepare for a fresh evaluation."""
+
+
+class ObjectDetectionEvaluator(DetectionEvaluator):
+ """A class to evaluate detections."""
+
+ def __init__(
+ self,
+ categories,
+ matching_iou_threshold=0.5,
+ evaluate_corlocs=False,
+ metric_prefix=None,
+ use_weighted_mean_ap=False,
+ evaluate_masks=False,
+ ):
+ """Constructor.
+
+ Args:
+ categories: A list of dicts, each of which has the following keys -
+ 'id': (required) an integer id uniquely identifying this
+ category.
+ 'name': (required) string representing category name e.g.,
+ 'cat', 'dog'.
+ matching_iou_threshold: IOU threshold to use for matching
+ groundtruth boxes to detection boxes.
+ evaluate_corlocs: (optional) boolean which determines if corloc
+ scores are to be returned or not.
+ metric_prefix: (optional) string prefix for metric name; if None,
+ no prefix is used.
+ use_weighted_mean_ap: (optional) boolean which determines if the
+ mean average precision is computed directly from the scores and
+ tp_fp_labels of all classes.
+ evaluate_masks: If False, evaluation will be performed based on
+ boxes. If True, mask evaluation will be performed instead.
+
+ Raises:
+ ValueError: If the category ids are not 1-indexed.
+ """
+ super(ObjectDetectionEvaluator, self).__init__(categories)
+ self._num_classes = max([cat['id'] for cat in categories])
+ if min(cat['id'] for cat in categories) < 1:
+ raise ValueError('Classes should be 1-indexed.')
+ self._matching_iou_threshold = matching_iou_threshold
+ self._use_weighted_mean_ap = use_weighted_mean_ap
+ self._label_id_offset = 1
+ self._evaluate_masks = evaluate_masks
+ self._evaluation = ObjectDetectionEvaluation(
+ num_groundtruth_classes=self._num_classes,
+ matching_iou_threshold=self._matching_iou_threshold,
+ use_weighted_mean_ap=self._use_weighted_mean_ap,
+ label_id_offset=self._label_id_offset,
+ )
+ self._image_ids = set([])
+ self._evaluate_corlocs = evaluate_corlocs
+ self._metric_prefix = (metric_prefix + '_') if metric_prefix else ''
+
+ def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
+ """Adds groundtruth for a single image to be used for evaluation.
+
+ Args:
+ image_id: A unique string/integer identifier for the image.
+ groundtruth_dict: A dictionary containing -
+ standard_fields.InputDataFields.groundtruth_boxes: float32
+ numpy array of shape [num_boxes, 4] containing `num_boxes`
+ groundtruth boxes of the format [ymin, xmin, ymax, xmax] in
+ absolute image coordinates.
+ standard_fields.InputDataFields.groundtruth_classes: integer
+ numpy array of shape [num_boxes] containing 1-indexed
+ groundtruth classes for the boxes.
+ standard_fields.InputDataFields.groundtruth_difficult: Optional
+ length M numpy boolean array denoting whether a ground
+ truth box is a difficult instance or not. This field is
+ optional to support the case that no boxes are difficult.
+ standard_fields.InputDataFields.groundtruth_instance_masks:
+ Optional numpy array of shape [num_boxes, height, width]
+ with values in {0, 1}.
+
+ Raises:
+ ValueError: On adding groundtruth for an image more than once. Will
+ also raise error if instance masks are not in groundtruth
+ dictionary.
+ """
+ if image_id in self._image_ids:
+ raise ValueError(
+ 'Image with id {} already added.'.format(image_id))
+
+ groundtruth_classes = (
+ groundtruth_dict[
+ standard_fields.InputDataFields.groundtruth_classes] -
+ self._label_id_offset)
+ # Use the difficult flags from groundtruth_dict when they are present
+ # and non-empty (or when the image has no annotations at all);
+ # otherwise fall back to None.
+ if (standard_fields.InputDataFields.groundtruth_difficult
+ in groundtruth_dict.keys()) and (groundtruth_dict[
+ standard_fields.InputDataFields.groundtruth_difficult].size
+ or
+ not groundtruth_classes.size):
+ groundtruth_difficult = groundtruth_dict[
+ standard_fields.InputDataFields.groundtruth_difficult]
+ else:
+ groundtruth_difficult = None
+ if not len(self._image_ids) % 1000:
+ logging.warning(('image %s does not have groundtruth difficult '
+ 'flag specified'), image_id)
+ groundtruth_masks = None
+ if self._evaluate_masks:
+ if (standard_fields.InputDataFields.groundtruth_instance_masks
+ not in groundtruth_dict):
+ raise ValueError(
+ 'Instance masks not in groundtruth dictionary.')
+ groundtruth_masks = groundtruth_dict[
+ standard_fields.InputDataFields.groundtruth_instance_masks]
+ self._evaluation.add_single_ground_truth_image_info(
+ image_key=image_id,
+ groundtruth_boxes=groundtruth_dict[
+ standard_fields.InputDataFields.groundtruth_boxes],
+ groundtruth_class_labels=groundtruth_classes,
+ groundtruth_is_difficult_list=groundtruth_difficult,
+ groundtruth_masks=groundtruth_masks,
+ )
+ self._image_ids.update([image_id])
+
+ def add_single_detected_image_info(self, image_id, detections_dict):
+ """Adds detections for a single image to be used for evaluation.
+
+ Args:
+ image_id: A unique string/integer identifier for the image.
+ detections_dict: A dictionary containing -
+ standard_fields.DetectionResultFields.detection_boxes: float32
+ numpy array of shape [num_boxes, 4] containing `num_boxes`
+ detection boxes of the format [ymin, xmin, ymax, xmax] in
+ absolute image coordinates.
+ standard_fields.DetectionResultFields.detection_scores: float32
+ numpy array of shape [num_boxes] containing detection
+ scores for the boxes.
+ standard_fields.DetectionResultFields.detection_classes:
+ integer numpy array of shape [num_boxes] containing
+ 1-indexed detection classes for the boxes.
+ standard_fields.DetectionResultFields.detection_masks: uint8
+ numpy array of shape [num_boxes, height, width] containing
+ `num_boxes` masks of values ranging between 0 and 1.
+
+ Raises:
+ ValueError: If detection masks are not in detections dictionary.
+ """
+ detection_classes = (
+ detections_dict[
+ standard_fields.DetectionResultFields.detection_classes] -
+ self._label_id_offset)
+ detection_masks = None
+ if self._evaluate_masks:
+ if (standard_fields.DetectionResultFields.detection_masks
+ not in detections_dict):
+ raise ValueError(
+ 'Detection masks not in detections dictionary.')
+ detection_masks = detections_dict[
+ standard_fields.DetectionResultFields.detection_masks]
+ self._evaluation.add_single_detected_image_info(
+ image_key=image_id,
+ detected_boxes=detections_dict[
+ standard_fields.DetectionResultFields.detection_boxes],
+ detected_scores=detections_dict[
+ standard_fields.DetectionResultFields.detection_scores],
+ detected_class_labels=detection_classes,
+ detected_masks=detection_masks,
+ )
+
+ def create_category_index(self, categories):
+ """Creates dictionary of COCO compatible categories keyed by category
+ id.
+
+ Args:
+ categories: a list of dicts, each of which has the following keys:
+ 'id': (required) an integer id uniquely identifying this
+ category.
+ 'name': (required) string representing category name
+ e.g., 'cat', 'dog', 'pizza'.
+
+ Returns:
+ category_index: a dict containing the same entries as categories,
+ but keyed by the 'id' field of each category.
+ """
+ category_index = {}
+ for cat in categories:
+ category_index[cat['id']] = cat
+ return category_index
+
+ def evaluate(self):
+ """Compute evaluation result.
+
+ Returns:
+ A dictionary of metrics with the following fields -
+
+ 1. summary_metrics:
+ 'Precision/mAP@IOU': mean average
+ precision at the specified IOU threshold
+
+ 2. per_category_ap: category specific results with keys of the form
+ 'PerformanceByCategory/mAP@IOU/category'
+ """
+ (
+ per_class_ap,
+ mean_ap,
+ _,
+ _,
+ per_class_corloc,
+ mean_corloc,
+ ) = self._evaluation.evaluate()
+
+ metric = f'mAP@{self._matching_iou_threshold}IOU'
+ pascal_metrics = {self._metric_prefix + metric: mean_ap}
+ if self._evaluate_corlocs:
+ pascal_metrics[self._metric_prefix +
+ 'Precision/meanCorLoc@{}IOU'.format(
+ self._matching_iou_threshold)] = mean_corloc
+ category_index = self.create_category_index(self._categories)
+ for idx in range(per_class_ap.size):
+ if idx + self._label_id_offset in category_index:
+ display_name = (
+ self._metric_prefix +
+ 'PerformanceByCategory/AP@{}IOU/{}'.format(
+ self._matching_iou_threshold,
+ category_index[idx + self._label_id_offset]['name'],
+ ))
+ pascal_metrics[display_name] = per_class_ap[idx]
+
+ # Optionally add CorLoc metrics (disabled by default).
+ if self._evaluate_corlocs:
+ display_name = (
+ self._metric_prefix +
+ 'PerformanceByCategory/CorLoc@{}IOU/{}'.format(
+ self._matching_iou_threshold,
+ category_index[idx +
+ self._label_id_offset]['name'],
+ ))
+ pascal_metrics[display_name] = per_class_corloc[idx]
+
+ return pascal_metrics
+
+ def clear(self):
+ """Clears the state to prepare for a fresh evaluation."""
+ self._evaluation = ObjectDetectionEvaluation(
+ num_groundtruth_classes=self._num_classes,
+ matching_iou_threshold=self._matching_iou_threshold,
+ use_weighted_mean_ap=self._use_weighted_mean_ap,
+ label_id_offset=self._label_id_offset,
+ )
+ self._image_ids.clear()
+
+
+class PascalDetectionEvaluator(ObjectDetectionEvaluator):
+ """A class to evaluate detections using PASCAL metrics."""
+
+ def __init__(self, categories, matching_iou_threshold=0.5):
+ super(PascalDetectionEvaluator, self).__init__(
+ categories,
+ matching_iou_threshold=matching_iou_threshold,
+ evaluate_corlocs=False,
+ use_weighted_mean_ap=False,
+ )
+
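+# Minimal usage sketch (illustrative, not part of the original module);
+# gt_boxes, gt_classes, det_boxes, det_scores and det_classes are hypothetical
+# numpy arrays shaped as described in the docstrings above:
+#   categories = [{'id': 1, 'name': 'person'}]
+#   evaluator = PascalDetectionEvaluator(categories)
+#   evaluator.add_single_ground_truth_image_info('img0', {
+#       standard_fields.InputDataFields.groundtruth_boxes: gt_boxes,
+#       standard_fields.InputDataFields.groundtruth_classes: gt_classes})
+#   evaluator.add_single_detected_image_info('img0', {
+#       standard_fields.DetectionResultFields.detection_boxes: det_boxes,
+#       standard_fields.DetectionResultFields.detection_scores: det_scores,
+#       standard_fields.DetectionResultFields.detection_classes: det_classes})
+#   evaluator.evaluate()  # e.g. {'mAP@0.5IOU': ...}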
+
+ObjectDetectionEvalMetrics = collections.namedtuple(
+ 'ObjectDetectionEvalMetrics',
+ [
+ 'average_precisions',
+ 'mean_ap',
+ 'precisions',
+ 'recalls',
+ 'corlocs',
+ 'mean_corloc',
+ ],
+)
+
+
+class ObjectDetectionEvaluation:
+ """Internal implementation of Pascal object detection metrics."""
+
+ def __init__(
+ self,
+ num_groundtruth_classes,
+ matching_iou_threshold=0.5,
+ nms_iou_threshold=1.0,
+ nms_max_output_boxes=10000,
+ use_weighted_mean_ap=False,
+ label_id_offset=0,
+ ):
+ if num_groundtruth_classes < 1:
+ raise ValueError(
+ 'Need at least 1 groundtruth class for evaluation.')
+
+ self.per_image_eval = per_image_evaluation.PerImageEvaluation(
+ num_groundtruth_classes=num_groundtruth_classes,
+ matching_iou_threshold=matching_iou_threshold,
+ )
+ self.num_class = num_groundtruth_classes
+ self.use_weighted_mean_ap = use_weighted_mean_ap
+ self.label_id_offset = label_id_offset
+
+ self.groundtruth_boxes = {}
+ self.groundtruth_class_labels = {}
+ self.groundtruth_masks = {}
+ self.groundtruth_is_difficult_list = {}
+ self.groundtruth_is_group_of_list = {}
+ self.num_gt_instances_per_class = np.zeros(self.num_class, dtype=int)
+ self.num_gt_imgs_per_class = np.zeros(self.num_class, dtype=int)
+
+ self._initialize_detections()
+
+ def _initialize_detections(self):
+ self.detection_keys = set()
+ self.scores_per_class = [[] for _ in range(self.num_class)]
+ self.tp_fp_labels_per_class = [[] for _ in range(self.num_class)]
+ self.num_images_correctly_detected_per_class = np.zeros(self.num_class)
+ self.average_precision_per_class = np.empty(
+ self.num_class, dtype=float)
+ self.average_precision_per_class.fill(np.nan)
+ self.precisions_per_class = []
+ self.recalls_per_class = []
+ self.corloc_per_class = np.ones(self.num_class, dtype=float)
+
+ def clear_detections(self):
+ self._initialize_detections()
+
+ def add_single_ground_truth_image_info(
+ self,
+ image_key,
+ groundtruth_boxes,
+ groundtruth_class_labels,
+ groundtruth_is_difficult_list=None,
+ groundtruth_is_group_of_list=None,
+ groundtruth_masks=None,
+ ):
+ """Adds groundtruth for a single image to be used for evaluation.
+
+ Args:
+ image_key: A unique string/integer identifier for the image.
+ groundtruth_boxes: float32 numpy array of shape [num_boxes, 4]
+ containing `num_boxes` groundtruth boxes of the format
+ [ymin, xmin, ymax, xmax] in absolute image coordinates.
+ groundtruth_class_labels: integer numpy array of shape [num_boxes]
+ containing 0-indexed groundtruth classes for the boxes.
+ groundtruth_is_difficult_list: A length M numpy boolean array
+ denoting whether a ground truth box is a difficult instance or
+ not. To support the case that no boxes are difficult, it is by
+ default set as None.
+ groundtruth_is_group_of_list: A length M numpy boolean array
+ denoting whether a ground truth box is a group-of box or not.
+ To support the case that no boxes are groups-of, it is by
+ default set as None.
+ groundtruth_masks: uint8 numpy array of shape
+ [num_boxes, height, width] containing `num_boxes` groundtruth
+ masks. The mask values range from 0 to 1.
+ """
+ if image_key in self.groundtruth_boxes:
+ logging.warning(('image %s has already been added to the ground '
+ 'truth database.'), image_key)
+ return
+
+ self.groundtruth_boxes[image_key] = groundtruth_boxes
+ self.groundtruth_class_labels[image_key] = groundtruth_class_labels
+ self.groundtruth_masks[image_key] = groundtruth_masks
+ if groundtruth_is_difficult_list is None:
+ num_boxes = groundtruth_boxes.shape[0]
+ groundtruth_is_difficult_list = np.zeros(num_boxes, dtype=bool)
+ self.groundtruth_is_difficult_list[
+ image_key] = groundtruth_is_difficult_list.astype(dtype=bool)
+ if groundtruth_is_group_of_list is None:
+ num_boxes = groundtruth_boxes.shape[0]
+ groundtruth_is_group_of_list = np.zeros(num_boxes, dtype=bool)
+ self.groundtruth_is_group_of_list[
+ image_key] = groundtruth_is_group_of_list.astype(dtype=bool)
+
+ self._update_ground_truth_statistics(
+ groundtruth_class_labels,
+ groundtruth_is_difficult_list.astype(dtype=bool),
+ groundtruth_is_group_of_list.astype(dtype=bool),
+ )
+
+ def add_single_detected_image_info(
+ self,
+ image_key,
+ detected_boxes,
+ detected_scores,
+ detected_class_labels,
+ detected_masks=None,
+ ):
+ """Adds detections for a single image to be used for evaluation.
+
+ Args:
+ image_key: A unique string/integer identifier for the image.
+ detected_boxes: float32 numpy array of shape [num_boxes, 4]
+ containing `num_boxes` detection boxes of the format
+ [ymin, xmin, ymax, xmax] in absolute image coordinates.
+ detected_scores: float32 numpy array of shape [num_boxes]
+ containing detection scores for the boxes.
+ detected_class_labels: integer numpy array of shape [num_boxes]
+ containing 0-indexed detection classes for the boxes.
+ detected_masks: np.uint8 numpy array of shape
+ [num_boxes, height, width] containing `num_boxes` detection
+ masks with values ranging between 0 and 1.
+
+ Raises:
+ ValueError: if the number of boxes, scores and class labels differ
+ in length.
+ """
+ if len(detected_boxes) != len(detected_scores) or len(
+ detected_boxes) != len(detected_class_labels):
+ raise ValueError(
+ 'detected_boxes, detected_scores and '
+ 'detected_class_labels should all have same lengths. Got '
+ '[%d, %d, %d]' % (len(detected_boxes), len(detected_scores),
+ len(detected_class_labels)))
+
+ if image_key in self.detection_keys:
+ logging.warning(('image %s has already been added to the '
+ 'detection result database.'), image_key)
+ return
+
+ self.detection_keys.add(image_key)
+ if image_key in self.groundtruth_boxes:
+ groundtruth_boxes = self.groundtruth_boxes[image_key]
+ groundtruth_class_labels = self.groundtruth_class_labels[image_key]
+ # Masks are popped instead of look up. The reason is that we do not
+ # want to keep all masks in memory which can cause memory overflow.
+ groundtruth_masks = self.groundtruth_masks.pop(image_key)
+ groundtruth_is_difficult_list = self.groundtruth_is_difficult_list[
+ image_key]
+ groundtruth_is_group_of_list = self.groundtruth_is_group_of_list[
+ image_key]
+ else:
+ groundtruth_boxes = np.empty(shape=[0, 4], dtype=float)
+ groundtruth_class_labels = np.array([], dtype=int)
+ if detected_masks is None:
+ groundtruth_masks = None
+ else:
+ groundtruth_masks = np.empty(shape=[0, 1, 1], dtype=float)
+ groundtruth_is_difficult_list = np.array([], dtype=bool)
+ groundtruth_is_group_of_list = np.array([], dtype=bool)
+ (
+ scores,
+ tp_fp_labels,
+ ) = self.per_image_eval.compute_object_detection_metrics(
+ detected_boxes=detected_boxes,
+ detected_scores=detected_scores,
+ detected_class_labels=detected_class_labels,
+ groundtruth_boxes=groundtruth_boxes,
+ groundtruth_class_labels=groundtruth_class_labels,
+ groundtruth_is_difficult_list=groundtruth_is_difficult_list,
+ groundtruth_is_group_of_list=groundtruth_is_group_of_list,
+ detected_masks=detected_masks,
+ groundtruth_masks=groundtruth_masks,
+ )
+
+ for i in range(self.num_class):
+ if scores[i].shape[0] > 0:
+ self.scores_per_class[i].append(scores[i])
+ self.tp_fp_labels_per_class[i].append(tp_fp_labels[i])
+
+ def _update_ground_truth_statistics(
+ self,
+ groundtruth_class_labels,
+ groundtruth_is_difficult_list,
+ groundtruth_is_group_of_list,
+ ):
+ """Update grouth truth statitistics.
+
+ 1. Difficult boxes are ignored when counting the number of ground truth
+ instances as done in Pascal VOC devkit.
+ 2. Difficult boxes are treated as normal boxes when computing CorLoc
+ related statistics.
+
+ Args:
+ groundtruth_class_labels: An integer numpy array of length M,
+ representing M class labels of object instances in ground truth
+ groundtruth_is_difficult_list: A boolean numpy array of length M
+ denoting whether a ground truth box is a difficult instance or
+ not
+ groundtruth_is_group_of_list: A boolean numpy array of length M
+ denoting whether a ground truth box is a group-of box or not
+ """
+ for class_index in range(self.num_class):
+ num_gt_instances = np.sum(groundtruth_class_labels[
+ ~groundtruth_is_difficult_list
+ & ~groundtruth_is_group_of_list] == class_index)
+ self.num_gt_instances_per_class[class_index] += num_gt_instances
+ if np.any(groundtruth_class_labels == class_index):
+ self.num_gt_imgs_per_class[class_index] += 1
+
+ def evaluate(self):
+ """Compute evaluation result.
+
+ Returns:
+ A named tuple with the following fields -
+ average_precision: float numpy array of average precision for
+ each class.
+ mean_ap: mean average precision of all classes, float scalar
+ precisions: List of precisions, each precision is a float numpy
+ array
+ recalls: List of recalls, each recall is a float numpy array
+ corloc: numpy float array
+ mean_corloc: Mean CorLoc score for each class, float scalar
+ """
+ if (self.num_gt_instances_per_class == 0).any():
+ print(
+ 'The following classes have no ground truth examples:',
+ np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) +
+ self.label_id_offset, 'self.detection_keys:',
+ self.detection_keys)
+
+ if self.use_weighted_mean_ap:
+ all_scores = np.array([], dtype=float)
+ all_tp_fp_labels = np.array([], dtype=bool)
+
+ for class_index in range(self.num_class):
+ if self.num_gt_instances_per_class[class_index] == 0:
+ continue
+
+ if not self.scores_per_class[class_index]:
+ scores = np.array([], dtype=float)
+ tp_fp_labels = np.array([], dtype=bool)
+ else:
+ scores = np.concatenate(self.scores_per_class[class_index])
+ tp_fp_labels = np.concatenate(
+ self.tp_fp_labels_per_class[class_index])
+ if self.use_weighted_mean_ap:
+ all_scores = np.append(all_scores, scores)
+ all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels)
+ precision, recall = metrics.compute_precision_recall(
+ scores,
+ tp_fp_labels,
+ self.num_gt_instances_per_class[class_index],
+ )
+ self.precisions_per_class.append(precision)
+ self.recalls_per_class.append(recall)
+ average_precision = metrics.compute_average_precision(
+ precision, recall)
+ self.average_precision_per_class[class_index] = average_precision
+
+ self.corloc_per_class = metrics.compute_cor_loc(
+ self.num_gt_imgs_per_class,
+ self.num_images_correctly_detected_per_class,
+ )
+
+ if self.use_weighted_mean_ap:
+ num_gt_instances = np.sum(self.num_gt_instances_per_class)
+ precision, recall = metrics.compute_precision_recall(
+ all_scores, all_tp_fp_labels, num_gt_instances)
+ mean_ap = metrics.compute_average_precision(precision, recall)
+ else:
+ mean_ap = np.nanmean(self.average_precision_per_class)
+ mean_corloc = np.nanmean(self.corloc_per_class)
+ return ObjectDetectionEvalMetrics(
+ self.average_precision_per_class,
+ mean_ap,
+ self.precisions_per_class,
+ self.recalls_per_class,
+ self.corloc_per_class,
+ mean_corloc,
+ )
diff --git a/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py b/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..3013ae7ce2fb84ec3733474fdeff61a8d3ba20a8
--- /dev/null
+++ b/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py
@@ -0,0 +1,452 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Evaluate Object Detection result on a single image.
+
+Annotate each detected result as true positives or false positive according to
+a predefined IOU ratio. Non-Maximum Suppression is used by default. Multi-class
+detection is supported by default. Based on the settings, per image evaluation
+is either performed on boxes or on object masks.
+"""
+
+import numpy as np
+
+from . import np_box_list, np_box_ops
+
+
+class PerImageEvaluation:
+ """Evaluate detection result of a single image."""
+
+ def __init__(self, num_groundtruth_classes, matching_iou_threshold=0.5):
+ """Initialized PerImageEvaluation by evaluation parameters.
+
+ Args:
+ num_groundtruth_classes: Number of ground truth object classes
+ matching_iou_threshold: A ratio of area intersection to union,
+ which is the threshold to consider whether a detection is true
+ positive or not
+ """
+ self.matching_iou_threshold = matching_iou_threshold
+ self.num_groundtruth_classes = num_groundtruth_classes
+
+ def compute_object_detection_metrics(
+ self,
+ detected_boxes,
+ detected_scores,
+ detected_class_labels,
+ groundtruth_boxes,
+ groundtruth_class_labels,
+ groundtruth_is_difficult_list,
+ groundtruth_is_group_of_list,
+ detected_masks=None,
+ groundtruth_masks=None,
+ ):
+ """Evaluates detections as being tp, fp or ignored from a single image.
+
+ The evaluation is done in two stages:
+ 1. All detections are matched to non group-of boxes; true positives
+ are determined and detections matched to difficult boxes are
+ ignored.
+ 2. Detections that are determined as false positives are matched
+ against group-of boxes and ignored if matched.
+
+ Args:
+ detected_boxes: A float numpy array of shape [N, 4], representing N
+ detected object regions.
+ Each row is of the format [y_min, x_min, y_max, x_max]
+ detected_scores: A float numpy array of shape [N, 1], representing
+ the confidence scores of the detected N object instances.
+ detected_class_labels: An integer numpy array of shape [N, 1],
+ representing the class labels of the detected N object
+ instances.
+ groundtruth_boxes: A float numpy array of shape [M, 4],
+ representing M regions of object instances in ground truth
+ groundtruth_class_labels: An integer numpy array of shape [M, 1],
+ representing M class labels of object instances in ground truth
+ groundtruth_is_difficult_list: A boolean numpy array of length M
+ denoting whether a ground truth box is a difficult instance or
+ not
+ groundtruth_is_group_of_list: A boolean numpy array of length M
+ denoting whether a ground truth box has group-of tag
+ detected_masks: (optional) A uint8 numpy array of shape
+ [N, height, width]. If not None, the metrics will be computed
+ based on masks.
+ groundtruth_masks: (optional) A uint8 numpy array of shape
+ [M, height, width].
+
+ Returns:
+ scores: A list of C float numpy arrays. Each numpy array is of
+ shape [K, 1], representing K scores detected with object class
+ label c
+ tp_fp_labels: A list of C boolean numpy arrays. Each numpy array
+ is of shape [K, 1], representing K True/False positive label of
+ object instances detected with class label c
+ """
+ (
+ detected_boxes,
+ detected_scores,
+ detected_class_labels,
+ detected_masks,
+ ) = self._remove_invalid_boxes(
+ detected_boxes,
+ detected_scores,
+ detected_class_labels,
+ detected_masks,
+ )
+ scores, tp_fp_labels = self._compute_tp_fp(
+ detected_boxes=detected_boxes,
+ detected_scores=detected_scores,
+ detected_class_labels=detected_class_labels,
+ groundtruth_boxes=groundtruth_boxes,
+ groundtruth_class_labels=groundtruth_class_labels,
+ groundtruth_is_difficult_list=groundtruth_is_difficult_list,
+ groundtruth_is_group_of_list=groundtruth_is_group_of_list,
+ detected_masks=detected_masks,
+ groundtruth_masks=groundtruth_masks,
+ )
+
+ return scores, tp_fp_labels
+
+ def _compute_tp_fp(
+ self,
+ detected_boxes,
+ detected_scores,
+ detected_class_labels,
+ groundtruth_boxes,
+ groundtruth_class_labels,
+ groundtruth_is_difficult_list,
+ groundtruth_is_group_of_list,
+ detected_masks=None,
+ groundtruth_masks=None,
+ ):
+ """Labels true/false positives of detections of an image across all
+ classes.
+
+ Args:
+ detected_boxes: A float numpy array of shape [N, 4], representing N
+ detected object regions.
+ Each row is of the format [y_min, x_min, y_max, x_max]
+ detected_scores: A float numpy array of shape [N, 1], representing
+ the confidence scores of the detected N object instances.
+ detected_class_labels: An integer numpy array of shape [N, 1],
+ representing the class labels of the detected N object
+ instances.
+ groundtruth_boxes: A float numpy array of shape [M, 4],
+ representing M regions of object instances in ground truth
+ groundtruth_class_labels: An integer numpy array of shape [M, 1],
+ representing M class labels of object instances in ground truth
+ groundtruth_is_difficult_list: A boolean numpy array of length M
+ denoting whether a ground truth box is a difficult instance or
+ not
+ groundtruth_is_group_of_list: A boolean numpy array of length M
+ denoting whether a ground truth box has group-of tag
+ detected_masks: (optional) A np.uint8 numpy array of shape
+ [N, height, width]. If not None, the scores will be computed
+ based on masks.
+ groundtruth_masks: (optional) A np.uint8 numpy array of shape
+ [M, height, width].
+
+ Returns:
+ result_scores: A list of float numpy arrays. Each numpy array is of
+ shape [K, 1], representing K scores detected with object class
+ label c
+ result_tp_fp_labels: A list of boolean numpy array. Each numpy
+ array is of shape [K, 1], representing K True/False positive
+ label of object instances detected with class label c
+
+ Raises:
+ ValueError: If detected masks is not None but groundtruth masks are
+ None, or the other way around.
+ """
+ if detected_masks is not None and groundtruth_masks is None:
+ raise ValueError(
+ 'Detected masks is available but groundtruth masks is not.')
+ if detected_masks is None and groundtruth_masks is not None:
+ raise ValueError(
+ 'Groundtruth masks is available but detected masks is not.')
+
+ result_scores = []
+ result_tp_fp_labels = []
+ for i in range(self.num_groundtruth_classes):
+ groundtruth_is_difficult_list_at_ith_class = (
+ groundtruth_is_difficult_list[groundtruth_class_labels == i])
+ groundtruth_is_group_of_list_at_ith_class = (
+ groundtruth_is_group_of_list[groundtruth_class_labels == i])
+ (
+ gt_boxes_at_ith_class,
+ gt_masks_at_ith_class,
+ detected_boxes_at_ith_class,
+ detected_scores_at_ith_class,
+ detected_masks_at_ith_class,
+ ) = self._get_ith_class_arrays(detected_boxes, detected_scores,
+ detected_masks,
+ detected_class_labels,
+ groundtruth_boxes,
+ groundtruth_masks,
+ groundtruth_class_labels, i)
+ scores, tp_fp_labels = self._compute_tp_fp_for_single_class(
+ detected_boxes=detected_boxes_at_ith_class,
+ detected_scores=detected_scores_at_ith_class,
+ groundtruth_boxes=gt_boxes_at_ith_class,
+ groundtruth_is_difficult_list=(
+ groundtruth_is_difficult_list_at_ith_class),
+ groundtruth_is_group_of_list=(
+ groundtruth_is_group_of_list_at_ith_class),
+ detected_masks=detected_masks_at_ith_class,
+ groundtruth_masks=gt_masks_at_ith_class,
+ )
+ result_scores.append(scores)
+ result_tp_fp_labels.append(tp_fp_labels)
+ return result_scores, result_tp_fp_labels
+
+ def _get_overlaps_and_scores_box_mode(
+ self,
+ detected_boxes,
+ detected_scores,
+ groundtruth_boxes,
+ groundtruth_is_group_of_list,
+ ):
+ """Computes overlaps and scores between detected and groudntruth boxes.
+
+ Args:
+ detected_boxes: A numpy array of shape [N, 4] representing detected
+ box coordinates
+ detected_scores: A 1-d numpy array of length N representing
+ classification score
+ groundtruth_boxes: A numpy array of shape [M, 4] representing
+ ground truth box coordinates
+ groundtruth_is_group_of_list: A boolean numpy array of length M
+ denoting whether a ground truth box has group-of tag. If a
+ groundtruth box is group-of box, every detection matching this
+ box is ignored.
+
+ Returns:
+ iou: A float numpy array of size [num_detected_boxes,
+ num_gt_boxes]. If gt_non_group_of_boxlist.num_boxes() == 0 it
+ will be None.
+ ioa: A float numpy array of size [num_detected_boxes,
+ num_gt_boxes]. If gt_group_of_boxlist.num_boxes() == 0 it will
+ be None.
+ scores: The score of the detected boxlist.
+ num_boxes: Number of non-maximum suppressed detected boxes.
+ """
+ detected_boxlist = np_box_list.BoxList(detected_boxes)
+ detected_boxlist.add_field('scores', detected_scores)
+ gt_non_group_of_boxlist = np_box_list.BoxList(
+ groundtruth_boxes[~groundtruth_is_group_of_list])
+
+ iou = np_box_ops.iou(detected_boxlist.get(),
+ gt_non_group_of_boxlist.get())
+ scores = detected_boxlist.get_field('scores')
+ num_boxes = detected_boxlist.num_boxes()
+ return iou, None, scores, num_boxes
+
+ def _compute_tp_fp_for_single_class(
+ self,
+ detected_boxes,
+ detected_scores,
+ groundtruth_boxes,
+ groundtruth_is_difficult_list,
+ groundtruth_is_group_of_list,
+ detected_masks=None,
+ groundtruth_masks=None,
+ ):
+ """Labels boxes detected with the same class from the same image as
+ tp/fp.
+
+ Args:
+ detected_boxes: A numpy array of shape [N, 4] representing detected
+ box coordinates
+ detected_scores: A 1-d numpy array of length N representing
+ classification score
+ groundtruth_boxes: A numpy array of shape [M, 4] representing
+ groundtruth box coordinates
+ groundtruth_is_difficult_list: A boolean numpy array of length M
+ denoting whether a ground truth box is a difficult instance or
+ not. If a groundtruth box is difficult, every detection
+ matching this box is ignored.
+ groundtruth_is_group_of_list: A boolean numpy array of length M
+ denoting whether a ground truth box has group-of tag. If a
+ groundtruth box is group-of box, every detection matching this
+ box is ignored.
+ detected_masks: (optional) A uint8 numpy array of shape
+ [N, height, width]. If not None, the scores will be computed
+ based on masks.
+ groundtruth_masks: (optional) A uint8 numpy array of shape
+ [M, height, width].
+
+ Returns:
+ Two arrays of the same size, containing all boxes that were
+ evaluated as being true positives or false positives; if a box
+ matched to a difficult box or to a group-of box, it is ignored.
+
+ scores: A numpy array representing the detection scores.
+ tp_fp_labels: a boolean numpy array indicating whether a detection
+ is a true positive.
+ """
+ if detected_boxes.size == 0:
+ return np.array([], dtype=float), np.array([], dtype=bool)
+
+ (
+ iou,
+ _,
+ scores,
+ num_detected_boxes,
+ ) = self._get_overlaps_and_scores_box_mode(
+ detected_boxes=detected_boxes,
+ detected_scores=detected_scores,
+ groundtruth_boxes=groundtruth_boxes,
+ groundtruth_is_group_of_list=groundtruth_is_group_of_list,
+ )
+
+ if groundtruth_boxes.size == 0:
+ return scores, np.zeros(num_detected_boxes, dtype=bool)
+
+ tp_fp_labels = np.zeros(num_detected_boxes, dtype=bool)
+ is_matched_to_difficult_box = np.zeros(num_detected_boxes, dtype=bool)
+ is_matched_to_group_of_box = np.zeros(num_detected_boxes, dtype=bool)
+
+ # The evaluation is done in two stages:
+ # 1. All detections are matched to non group-of boxes; true positives
+ # are determined and detections matched to difficult boxes are
+ # ignored.
+ # 2. Detections that are determined as false positives are matched
+ # against group-of boxes and ignored if matched.
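+        # Note: in this code ioa is always None and is_matched_to_group_of_box
+        # is never set to True, so in practice only stage 1 is applied.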
+
+ # Tp-fp evaluation for non-group of boxes (if any).
+ if iou.shape[1] > 0:
+ groundtruth_nongroup_of_is_difficult_list = (
+ groundtruth_is_difficult_list[~groundtruth_is_group_of_list])
+ max_overlap_gt_ids = np.argmax(iou, axis=1)
+ is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool)
+ for i in range(num_detected_boxes):
+ gt_id = max_overlap_gt_ids[i]
+ if iou[i, gt_id] >= self.matching_iou_threshold:
+ if not groundtruth_nongroup_of_is_difficult_list[gt_id]:
+ if not is_gt_box_detected[gt_id]:
+ tp_fp_labels[i] = True
+ is_gt_box_detected[gt_id] = True
+ else:
+ is_matched_to_difficult_box[i] = True
+
+ return (
+ scores[~is_matched_to_difficult_box & ~is_matched_to_group_of_box],
+ tp_fp_labels[~is_matched_to_difficult_box
+ & ~is_matched_to_group_of_box],
+ )
+
+ def _get_ith_class_arrays(
+ self,
+ detected_boxes,
+ detected_scores,
+ detected_masks,
+ detected_class_labels,
+ groundtruth_boxes,
+ groundtruth_masks,
+ groundtruth_class_labels,
+ class_index,
+ ):
+ """Returns numpy arrays belonging to class with index `class_index`.
+
+ Args:
+ detected_boxes: A numpy array containing detected boxes.
+ detected_scores: A numpy array containing detected scores.
+ detected_masks: A numpy array containing detected masks.
+ detected_class_labels: A numpy array containing detected class
+ labels.
+ groundtruth_boxes: A numpy array containing groundtruth boxes.
+ groundtruth_masks: A numpy array containing groundtruth masks.
+ groundtruth_class_labels: A numpy array containing groundtruth
+ class labels.
+ class_index: An integer index.
+
+ Returns:
+ gt_boxes_at_ith_class: A numpy array containing groundtruth boxes
+ labeled as ith class.
+ gt_masks_at_ith_class: A numpy array containing groundtruth masks
+ labeled as ith class.
+ detected_boxes_at_ith_class: A numpy array containing detected
+ boxes corresponding to the ith class.
+ detected_scores_at_ith_class: A numpy array containing detected
+ scores corresponding to the ith class.
+ detected_masks_at_ith_class: A numpy array containing detected
+ masks corresponding to the ith class.
+ """
+ selected_groundtruth = groundtruth_class_labels == class_index
+ gt_boxes_at_ith_class = groundtruth_boxes[selected_groundtruth]
+ if groundtruth_masks is not None:
+ gt_masks_at_ith_class = groundtruth_masks[selected_groundtruth]
+ else:
+ gt_masks_at_ith_class = None
+ selected_detections = detected_class_labels == class_index
+ detected_boxes_at_ith_class = detected_boxes[selected_detections]
+ detected_scores_at_ith_class = detected_scores[selected_detections]
+ if detected_masks is not None:
+ detected_masks_at_ith_class = detected_masks[selected_detections]
+ else:
+ detected_masks_at_ith_class = None
+ return (
+ gt_boxes_at_ith_class,
+ gt_masks_at_ith_class,
+ detected_boxes_at_ith_class,
+ detected_scores_at_ith_class,
+ detected_masks_at_ith_class,
+ )
+
+ def _remove_invalid_boxes(
+ self,
+ detected_boxes,
+ detected_scores,
+ detected_class_labels,
+ detected_masks=None,
+ ):
+ """Removes entries with invalid boxes.
+
+ A box is invalid if either its xmax is smaller than its xmin, or its
+ ymax is smaller than its ymin.
+
+ Args:
+ detected_boxes: A float numpy array of size [num_boxes, 4]
+ containing box coordinates in [ymin, xmin, ymax, xmax] format.
+ detected_scores: A float numpy array of size [num_boxes].
+ detected_class_labels: A int32 numpy array of size [num_boxes].
+ detected_masks: A uint8 numpy array of size
+ [num_boxes, height, width].
+
+ Returns:
+ valid_detected_boxes: A float numpy array of size
+ [num_valid_boxes, 4] containing box coordinates in
+ [ymin, xmin, ymax, xmax] format.
+ valid_detected_scores: A float numpy array of size
+ [num_valid_boxes].
+ valid_detected_class_labels: A int32 numpy array of size
+ [num_valid_boxes].
+ valid_detected_masks: A uint8 numpy array of size
+ [num_valid_boxes, height, width].
+ """
+ valid_indices = np.logical_and(
+ detected_boxes[:, 0] < detected_boxes[:, 2],
+ detected_boxes[:, 1] < detected_boxes[:, 3],
+ )
+ detected_boxes = detected_boxes[valid_indices]
+ detected_scores = detected_scores[valid_indices]
+ detected_class_labels = detected_class_labels[valid_indices]
+ if detected_masks is not None:
+ detected_masks = detected_masks[valid_indices]
+ return [
+ detected_boxes,
+ detected_scores,
+ detected_class_labels,
+ detected_masks,
+ ]
diff --git a/paddlevideo/metrics/ava_evaluation/standard_fields.py b/paddlevideo/metrics/ava_evaluation/standard_fields.py
new file mode 100644
index 0000000000000000000000000000000000000000..8edf46d0816ab34458e5587b39b735c977f71572
--- /dev/null
+++ b/paddlevideo/metrics/ava_evaluation/standard_fields.py
@@ -0,0 +1,115 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Contains classes specifying naming conventions used for object detection.
+
+Specifies:
+ InputDataFields: standard fields used by reader/preprocessor/batcher.
+ DetectionResultFields: standard fields returned by object detector.
+"""
+
+
+class InputDataFields:
+ """Names for the input tensors.
+
+ Holds the standard data field names to use for identifying input tensors.
+ This should be used by the decoder to identify keys for the returned
+ tensor_dict containing input tensors. And it should be used by the model to
+ identify the tensors it needs.
+
+ Attributes:
+ image: image.
+ original_image: image in the original input size.
+ key: unique key corresponding to image.
+ source_id: source of the original image.
+ filename: original filename of the dataset (without common path).
+ groundtruth_image_classes: image-level class labels.
+ groundtruth_boxes: coordinates of the ground truth boxes in the image.
+ groundtruth_classes: box-level class labels.
+ groundtruth_label_types: box-level label types (e.g. explicit
+ negative).
+ groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead]
+ is the groundtruth a single object or a crowd.
+ groundtruth_area: area of a groundtruth segment.
+ groundtruth_difficult: is a `difficult` object
+ groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of
+ the same class, forming a connected group, where instances are
+ heavily occluding each other.
+ proposal_boxes: coordinates of object proposal boxes.
+ proposal_objectness: objectness score of each proposal.
+ groundtruth_instance_masks: ground truth instance masks.
+ groundtruth_instance_boundaries: ground truth instance boundaries.
+ groundtruth_instance_classes: instance mask-level class labels.
+ groundtruth_keypoints: ground truth keypoints.
+ groundtruth_keypoint_visibilities: ground truth keypoint visibilities.
+ groundtruth_label_scores: groundtruth label scores.
+ groundtruth_weights: groundtruth weight factor for bounding boxes.
+ num_groundtruth_boxes: number of groundtruth boxes.
+        true_image_shape: true shapes of images in the resized images, as
+ resized images can be padded with zeros.
+ """
+
+ image = 'image'
+ original_image = 'original_image'
+ key = 'key'
+ source_id = 'source_id'
+ filename = 'filename'
+ groundtruth_image_classes = 'groundtruth_image_classes'
+ groundtruth_boxes = 'groundtruth_boxes'
+ groundtruth_classes = 'groundtruth_classes'
+ groundtruth_label_types = 'groundtruth_label_types'
+ groundtruth_is_crowd = 'groundtruth_is_crowd'
+ groundtruth_area = 'groundtruth_area'
+ groundtruth_difficult = 'groundtruth_difficult'
+ groundtruth_group_of = 'groundtruth_group_of'
+ proposal_boxes = 'proposal_boxes'
+ proposal_objectness = 'proposal_objectness'
+ groundtruth_instance_masks = 'groundtruth_instance_masks'
+ groundtruth_instance_boundaries = 'groundtruth_instance_boundaries'
+ groundtruth_instance_classes = 'groundtruth_instance_classes'
+ groundtruth_keypoints = 'groundtruth_keypoints'
+ groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities'
+ groundtruth_label_scores = 'groundtruth_label_scores'
+ groundtruth_weights = 'groundtruth_weights'
+ num_groundtruth_boxes = 'num_groundtruth_boxes'
+ true_image_shape = 'true_image_shape'
+
+
+class DetectionResultFields:
+ """Naming conventions for storing the output of the detector.
+
+ Attributes:
+ source_id: source of the original image.
+ key: unique key corresponding to image.
+ detection_boxes: coordinates of the detection boxes in the image.
+ detection_scores: detection scores for the detection boxes in the
+ image.
+ detection_classes: detection-level class labels.
+ detection_masks: contains a segmentation mask for each detection box.
+ detection_boundaries: contains an object boundary for each detection
+ box.
+ detection_keypoints: contains detection keypoints for each detection
+ box.
+ num_detections: number of detections in the batch.
+ """
+
+ source_id = 'source_id'
+ key = 'key'
+ detection_boxes = 'detection_boxes'
+ detection_scores = 'detection_scores'
+ detection_classes = 'detection_classes'
+ detection_masks = 'detection_masks'
+ detection_boundaries = 'detection_boundaries'
+ detection_keypoints = 'detection_keypoints'
+ num_detections = 'num_detections'
diff --git a/paddlevideo/metrics/ava_metric.py b/paddlevideo/metrics/ava_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ee21bdb6e8d8c10fb9cd579d02677ee027d7683
--- /dev/null
+++ b/paddlevideo/metrics/ava_metric.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import numpy as np
+import paddle
+from paddle.hapi.model import _all_gather
+from collections import OrderedDict
+from paddlevideo.utils import get_logger, load, log_batch, AverageMeter
+from .registry import METRIC
+from .base import BaseMetric
+import time
+from datetime import datetime
+from .ava_utils import ava_evaluate_results
+
+logger = get_logger("paddlevideo")
+""" An example for metrics class.
+ MultiCropMetric for slowfast.
+"""
+
+
+@METRIC.register
+class AVAMetric(BaseMetric):
+
+ def __init__(self,
+ data_size,
+ batch_size,
+ file_path,
+ exclude_file,
+ label_file,
+ custom_classes,
+ log_interval=1):
+ """prepare for metrics
+ """
+ super().__init__(data_size, batch_size, log_interval)
+
+ self.file_path = file_path
+ self.exclude_file = exclude_file
+ self.label_file = label_file
+ self.custom_classes = custom_classes
+
+ self.results = []
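+        # per-iter outputs (per-class detection lists) are accumulated here and
+        # converted to the AVA csv format inside accumulate()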
+
+ record_list = [
+ ("loss", AverageMeter('loss', '7.5f')),
+ ("recall@thr=0.5", AverageMeter("recall@thr=0.5", '.5f')),
+ ("prec@thr=0.5", AverageMeter("prec@thr=0.5", '.5f')),
+ ("recall@top3", AverageMeter("recall@top3", '.5f')),
+ ("prec@top3", AverageMeter("prec@top3", '.5f')),
+ ("recall@top5", AverageMeter("recall@top5", '.5f')),
+ ("prec@top5", AverageMeter("prec@top5", '.5f')),
+ ("mAP@0.5IOU", AverageMeter("mAP@0.5IOU", '.5f')),
+ ("batch_time", AverageMeter('batch_cost', '.5f')),
+ ("reader_time", AverageMeter('reader_cost', '.5f')),
+ ]
+
+ self.record_list = OrderedDict(record_list)
+
+ self.tic = time.time()
+
+ def update(self, batch_id, data, outputs):
+ """update metrics during each iter
+ """
+
+ self.results.extend(outputs)
+ self.record_list['batch_time'].update(time.time() - self.tic)
+        self.tic = time.time()
+ ips = "ips: {:.5f} instance/sec.".format(
+ self.batch_size / self.record_list["batch_time"].val)
+ log_batch(self.record_list, batch_id, 0, 0, "test", ips)
+
+ def set_dataset_info(self, info, dataset_len):
+ self.info = info
+ self.dataset_len = dataset_len
+
+ def accumulate(self):
+ """accumulate metrics when finished all iters.
+ """
+ test_res = ava_evaluate_results(self.info, self.dataset_len,
+ self.results, None, self.label_file,
+ self.file_path, self.exclude_file)
+
+ for name, value in test_res.items():
+ self.record_list[name].update(value, self.batch_size)
+
+ return self.record_list
diff --git a/paddlevideo/metrics/ava_utils.py b/paddlevideo/metrics/ava_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b127267ede788f1dfa10cb17fc9523e55fdec2c0
--- /dev/null
+++ b/paddlevideo/metrics/ava_utils.py
@@ -0,0 +1,394 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import heapq
+import logging
+import time
+from collections import defaultdict
+from .ava_evaluation import object_detection_evaluation as det_eval
+from .ava_evaluation import standard_fields
+from .recall import eval_recalls
+import shutil
+import pickle
+import os
+import os.path as osp
+from paddlevideo.utils import get_logger, get_dist_info
+import paddle.distributed as dist
+import sys
+import numpy as np
+from pathlib import Path
+from datetime import datetime
+import paddle
+
+
+def det2csv(info, dataset_len, results, custom_classes):
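+    # each emitted row is: video_id, timestamp, the 4 box coordinates,
+    # the (1-based) action label, and the remaining fields (e.g. score)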
+ csv_results = []
+ for idx in range(dataset_len):
+ video_id = info[idx]['video_id']
+ timestamp = info[idx]['timestamp']
+
+ result = results[idx]
+        for label, bboxes in enumerate(result):
+            for bbox in bboxes:
+                if isinstance(bbox, paddle.Tensor):
+                    bbox = bbox.numpy()
+
+ bbox_ = tuple(bbox.tolist())
+ if custom_classes is not None:
+ actual_label = custom_classes[label + 1]
+ else:
+ actual_label = label + 1
+ csv_results.append((
+ video_id,
+ timestamp,
+ ) + bbox_[:4] + (actual_label, ) + bbox_[4:])
+ return csv_results
+
+
+# results is organized by class
+def results2csv(info, dataset_len, results, out_file, custom_classes=None):
+ if isinstance(results[0], list):
+ csv_results = det2csv(info, dataset_len, results, custom_classes)
+
+ # save space for float
+ def tostr(item):
+ if isinstance(item, float):
+ return f'{item:.3f}'
+ return str(item)
+
+ with open(out_file, 'w') as f:
+ for csv_result in csv_results:
+ f.write(','.join(map(lambda x: tostr(x), csv_result)))
+ f.write('\n')
+
+
+def print_time(message, start):
+ print('==> %g seconds to %s' % (time.time() - start, message))
+
+
+def make_image_key(video_id, timestamp):
+ """Returns a unique identifier for a video id & timestamp."""
+ return f'{video_id},{int(timestamp):04d}'
+
+
+def read_csv(csv_file, class_whitelist=None, capacity=0):
+ """Loads boxes and class labels from a CSV file in the AVA format.
+
+ CSV file format described at https://research.google.com/ava/download.html.
+
+ Args:
+ csv_file: A file object.
+ class_whitelist: If provided, boxes corresponding to (integer) class
+ labels not in this set are skipped.
+ capacity: Maximum number of labeled boxes allowed for each example.
+ Default is 0 where there is no limit.
+
+ Returns:
+ boxes: A dictionary mapping each unique image key (string) to a list of
+ boxes, given as coordinates [y1, x1, y2, x2].
+ labels: A dictionary mapping each unique image key (string) to a list
+            of integer class labels, matching the corresponding box in `boxes`.
+        scores: A dictionary mapping each unique image key (string) to a list
+            of score values, matching the corresponding label in `labels`.
+ If scores are not provided in the csv, then they will default to 1.0.
+ """
+ start = time.time()
+ entries = defaultdict(list)
+ boxes = defaultdict(list)
+ labels = defaultdict(list)
+ scores = defaultdict(list)
+ reader = csv.reader(csv_file)
+ for row in reader:
+        assert len(row) in [7, 8], 'Wrong number of columns: ' + str(row)
+ image_key = make_image_key(row[0], row[1])
+ x1, y1, x2, y2 = [float(n) for n in row[2:6]]
+ action_id = int(row[6])
+ if class_whitelist and action_id not in class_whitelist:
+ continue
+
+ score = 1.0
+ if len(row) == 8:
+ score = float(row[7])
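+        # keep at most `capacity` boxes per image, retaining the highest
+        # scores via a min-heap keyed on score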
+ if capacity < 1 or len(entries[image_key]) < capacity:
+ heapq.heappush(entries[image_key],
+ (score, action_id, y1, x1, y2, x2))
+ elif score > entries[image_key][0][0]:
+ heapq.heapreplace(entries[image_key],
+ (score, action_id, y1, x1, y2, x2))
+ for image_key in entries:
+ # Evaluation API assumes boxes with descending scores
+ entry = sorted(entries[image_key], key=lambda tup: -tup[0])
+ for item in entry:
+ score, action_id, y1, x1, y2, x2 = item
+ boxes[image_key].append([y1, x1, y2, x2])
+ labels[image_key].append(action_id)
+ scores[image_key].append(score)
+ print_time('read file ' + csv_file.name, start)
+ return boxes, labels, scores
+
+
+def read_exclusions(exclusions_file):
+ """Reads a CSV file of excluded timestamps.
+
+ Args:
+ exclusions_file: A file object containing a csv of video-id,timestamp.
+
+ Returns:
+ A set of strings containing excluded image keys, e.g.
+ "aaaaaaaaaaa,0904",
+ or an empty set if exclusions file is None.
+ """
+ excluded = set()
+ if exclusions_file:
+ reader = csv.reader(exclusions_file)
+ for row in reader:
+            assert len(row) == 2, 'Expected only 2 columns, got: ' + str(row)
+ excluded.add(make_image_key(row[0], row[1]))
+ return excluded
+
+
+def read_labelmap(labelmap_file):
+ """Reads a labelmap without the dependency on protocol buffers.
+
+ Args:
+ labelmap_file: A file object containing a label map protocol buffer.
+
+ Returns:
+ labelmap: The label map in the form used by the
+ object_detection_evaluation
+ module - a list of {"id": integer, "name": classname } dicts.
+ class_ids: A set containing all of the valid class id integers.
+ """
+ labelmap = []
+ class_ids = set()
+ name = ''
+ class_id = ''
+ for line in labelmap_file:
+ if line.startswith(' name:'):
+ name = line.split('"')[1]
+ elif line.startswith(' id:') or line.startswith(' label_id:'):
+ class_id = int(line.strip().split(' ')[-1])
+ labelmap.append({'id': class_id, 'name': name})
+ class_ids.add(class_id)
+ return labelmap, class_ids
+
+
+# There seem to be at most 100 detections for each image
+def ava_eval(result_file,
+ result_type,
+ label_file,
+ ann_file,
+ exclude_file,
+ max_dets=(100, ),
+ verbose=True,
+ custom_classes=None):
+
+    assert result_type in ['mAP', 'proposal']
+ start = time.time()
+ categories, class_whitelist = read_labelmap(open(label_file))
+
+ if custom_classes is not None:
+ custom_classes = custom_classes[1:]
+ assert set(custom_classes).issubset(set(class_whitelist))
+ class_whitelist = custom_classes
+ categories = [cat for cat in categories if cat['id'] in custom_classes]
+
+ # loading gt, do not need gt score
+ gt_boxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist, 0)
+ if verbose:
+        print_time('Reading ground truth results', start)
+
+ if exclude_file is not None:
+ excluded_keys = read_exclusions(open(exclude_file))
+ else:
+ excluded_keys = list()
+
+ start = time.time()
+ boxes, labels, scores = read_csv(open(result_file), class_whitelist, 0)
+ if verbose:
+ print_time('Reading detection results', start)
+
+ if result_type == 'proposal':
+ gts = [
+ np.array(gt_boxes[image_key], dtype=float) for image_key in gt_boxes
+ ]
+ proposals = []
+ for image_key in gt_boxes:
+ if image_key in boxes:
+ proposals.append(
+ np.concatenate(
+ (np.array(boxes[image_key], dtype=float),
+ np.array(scores[image_key], dtype=float)[:, None]),
+ axis=1))
+ else:
+ # if no corresponding proposal, add a fake one
+                proposals.append(np.array([[0, 0, 1, 1, 1]], dtype=float))
+
+ # Proposals used here are with scores
+ recalls = eval_recalls(gts, proposals, np.array(max_dets),
+ np.arange(0.5, 0.96, 0.05))
+ ar = recalls.mean(axis=1)
+ ret = {}
+ for i, num in enumerate(max_dets):
+ print(f'Recall@0.5@{num}\t={recalls[i, 0]:.4f}')
+ print(f'AR@{num}\t={ar[i]:.4f}')
+ ret[f'Recall@0.5@{num}'] = recalls[i, 0]
+ ret[f'AR@{num}'] = ar[i]
+ return ret
+
+ if result_type == 'mAP':
+ pascal_evaluator = det_eval.PascalDetectionEvaluator(categories)
+
+ start = time.time()
+ for image_key in gt_boxes:
+            if image_key in excluded_keys:
+                logging.info(
+                    'Found excluded timestamp in ground truth: %s. '
+                    'It will be ignored.', image_key)
+                continue
+ pascal_evaluator.add_single_ground_truth_image_info(
+ image_key, {
+ standard_fields.InputDataFields.groundtruth_boxes:
+ np.array(gt_boxes[image_key], dtype=float),
+ standard_fields.InputDataFields.groundtruth_classes:
+ np.array(gt_labels[image_key], dtype=int),
+ standard_fields.InputDataFields.groundtruth_difficult:
+ np.zeros(len(gt_boxes[image_key]), dtype=bool)
+ })
+ if verbose:
+ print_time('Convert groundtruth', start)
+
+ start = time.time()
+ for image_key in boxes:
+            if image_key in excluded_keys:
+                logging.info(
+                    'Found excluded timestamp in detections: %s. '
+                    'It will be ignored.', image_key)
+                continue
+ pascal_evaluator.add_single_detected_image_info(
+ image_key, {
+ standard_fields.DetectionResultFields.detection_boxes:
+ np.array(boxes[image_key], dtype=float),
+ standard_fields.DetectionResultFields.detection_classes:
+ np.array(labels[image_key], dtype=int),
+ standard_fields.DetectionResultFields.detection_scores:
+ np.array(scores[image_key], dtype=float)
+ })
+ if verbose:
+ print_time('convert detections', start)
+
+ start = time.time()
+ metrics = pascal_evaluator.evaluate()
+ if verbose:
+ print_time('run_evaluator', start)
+ for display_name in metrics:
+ print(f'{display_name}=\t{metrics[display_name]}')
+ ret = {
+ display_name: metrics[display_name]
+ for display_name in metrics if 'ByCategory' not in display_name
+ }
+ return ret
+
+
+def mkdir_or_exist(dir_name, mode=0o777):
+ if dir_name == '':
+ return
+ dir_name = osp.expanduser(dir_name)
+ os.makedirs(dir_name, mode=mode, exist_ok=True)
+
+
+def dump_to_fileobj(obj, file, **kwargs):
+ kwargs.setdefault('protocol', 2)
+ pickle.dump(obj, file, **kwargs)
+
+
+def dump_to_path(obj, filepath, mode='wb'):
+ with open(filepath, mode) as f:
+ dump_to_fileobj(obj, f)
+
+
+def load_from_fileobj(file, **kwargs):
+ return pickle.load(file, **kwargs)
+
+
+def load_from_path(filepath, mode='rb'):
+ with open(filepath, mode) as f:
+ return load_from_fileobj(f)
+
+
+def collect_results_cpu(result_part, size):
+ """Collect results in cpu mode.
+ It saves the results on different gpus to 'tmpdir' and collects
+ them by the rank 0 worker.
+ """
+ tmpdir = osp.join('./', 'collect_results_cpu')
+    #1. dump the results of the current rank to the tmp dir
+ mkdir_or_exist(tmpdir)
+ rank, world_size = get_dist_info()
+ dump_to_path(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
+ dist.barrier()
+ if rank != 0:
+ return None
+ #2. collect all parts
+ while 1:
+ all_exist = True
+ for i in range(world_size):
+ part_file = osp.join(tmpdir, f'part_{i}.pkl')
+ if not Path(part_file).exists():
+ all_exist = False
+ if all_exist:
+ break
+ else:
+ time.sleep(60)
+ time.sleep(120)
+ #3. load results of all parts from tmp dir
+ part_list = []
+ for i in range(world_size):
+ part_file = osp.join(tmpdir, f'part_{i}.pkl')
+ part_list.append(load_from_path(part_file))
+ #4. sort the results
+ ordered_results = []
+ for res in zip(*part_list):
+ ordered_results.extend(list(res))
+    # the dataloader may pad some samples
+    ordered_results = ordered_results[:size]
+    #5. remove the result files of all parts from the tmp dir
+ for i in range(world_size):
+ part_file = osp.join(tmpdir, f'part_{i}.pkl')
+ os.remove(part_file)
+
+ return ordered_results
+
+
+def ava_evaluate_results(info, dataset_len, results, custom_classes, label_file,
+ file_path, exclude_file):
+ # need to create a temp result file
+ time_now = datetime.now().strftime('%Y%m%d_%H%M%S')
+ temp_file = f'AVA_{time_now}_result.csv'
+    results2csv(info, dataset_len, results, temp_file, custom_classes)
+ ret = {}
+ eval_result = ava_eval(
+ temp_file,
+ 'mAP',
+ label_file,
+ file_path, #ann_file,
+ exclude_file,
+ custom_classes=custom_classes)
+ ret.update(eval_result)
+
+ os.remove(temp_file)
+
+ return ret
diff --git a/paddlevideo/metrics/base.py b/paddlevideo/metrics/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c42b72883183f982aafa8d4337ae13483dbc23b
--- /dev/null
+++ b/paddlevideo/metrics/base.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from abc import abstractmethod
+import numpy as np
+import paddle
+from paddlevideo.utils import get_dist_info
+
+from .registry import METRIC
+
+
+class BaseMetric(object):
+ def __init__(self, data_size, batch_size, log_interval=1, **kwargs):
+ self.data_size = data_size
+ self.batch_size = batch_size
+ _, self.world_size = get_dist_info()
+ self.log_interval = log_interval
+
+ @abstractmethod
+ def update(self):
+        raise NotImplementedError
+
+ @abstractmethod
+ def accumulate(self):
+        raise NotImplementedError
diff --git a/paddlevideo/metrics/bmn_metric.py b/paddlevideo/metrics/bmn_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc36283f98a8c66ab7e0f14e47086d341c5c77e2
--- /dev/null
+++ b/paddlevideo/metrics/bmn_metric.py
@@ -0,0 +1,304 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import os
+import json
+import numpy as np
+import pandas as pd
+import multiprocessing as mp
+
+from .registry import METRIC
+from .base import BaseMetric
+from .ActivityNet import ANETproposal
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+def iou_with_anchors(anchors_min, anchors_max, box_min, box_max):
+ """Compute jaccard score between a box and the anchors.
+ """
+ len_anchors = anchors_max - anchors_min
+ int_xmin = np.maximum(anchors_min, box_min)
+ int_xmax = np.minimum(anchors_max, box_max)
+ inter_len = np.maximum(int_xmax - int_xmin, 0.)
+ union_len = len_anchors - inter_len + box_max - box_min
+ jaccard = np.divide(inter_len, union_len)
+ return jaccard
+
+
+def boundary_choose(score_list):
+ """Choose start and end boundary from score.
+ """
+ max_score = max(score_list)
+ mask_high = (score_list > max_score * 0.5)
+ score_list = list(score_list)
+ score_middle = np.array([0.0] + score_list + [0.0])
+ score_front = np.array([0.0, 0.0] + score_list)
+ score_back = np.array(score_list + [0.0, 0.0])
+ mask_peak = ((score_middle > score_front) & (score_middle > score_back))
+ mask_peak = mask_peak[1:-1]
+ mask = (mask_high | mask_peak).astype('float32')
+ return mask
+
+
+def soft_nms(df, alpha, t1, t2):
+ '''
+ df: proposals generated by network;
+ alpha: alpha value of Gaussian decaying function;
+ t1, t2: threshold for soft nms.
+ '''
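+    # Soft-NMS: repeatedly pick the highest-scoring proposal and, for the
+    # remaining proposals whose IoU with it exceeds a width-dependent
+    # threshold, decay their scores with a Gaussian of the IoU instead of
+    # discarding them outright.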
+ df = df.sort_values(by="score", ascending=False)
+ tstart = list(df.xmin.values[:])
+ tend = list(df.xmax.values[:])
+ tscore = list(df.score.values[:])
+
+ rstart = []
+ rend = []
+ rscore = []
+
+ while len(tscore) > 1 and len(rscore) < 101:
+ max_index = tscore.index(max(tscore))
+ tmp_iou_list = iou_with_anchors(np.array(tstart), np.array(tend),
+ tstart[max_index], tend[max_index])
+ for idx in range(0, len(tscore)):
+ if idx != max_index:
+ tmp_iou = tmp_iou_list[idx]
+ tmp_width = tend[max_index] - tstart[max_index]
+ if tmp_iou > t1 + (t2 - t1) * tmp_width:
+ tscore[idx] = tscore[idx] * np.exp(
+ -np.square(tmp_iou) / alpha)
+
+ rstart.append(tstart[max_index])
+ rend.append(tend[max_index])
+ rscore.append(tscore[max_index])
+ tstart.pop(max_index)
+ tend.pop(max_index)
+ tscore.pop(max_index)
+
+ newDf = pd.DataFrame()
+ newDf['score'] = rscore
+ newDf['xmin'] = rstart
+ newDf['xmax'] = rend
+ return newDf
+
+
+@METRIC.register
+class BMNMetric(BaseMetric):
+ """
+ Metrics for BMN. Two Stages in this metric:
+ (1) Get test results using trained model, results will be saved in BMNMetric.result_path;
+ (2) Calculate metrics using results file from stage (1).
+ """
+
+ def __init__(self,
+ data_size,
+ batch_size,
+ tscale,
+ dscale,
+ file_path,
+ ground_truth_filename,
+ subset,
+ output_path,
+ result_path,
+ get_metrics=True,
+ log_interval=1):
+ """
+ Init for BMN metrics.
+ Params:
+ get_metrics: whether to calculate AR@N and AUC metrics or not, default True.
+ """
+ super().__init__(data_size, batch_size, log_interval)
+        assert self.batch_size == 1, "Only batch_size==1 is supported for testing now"
+        assert self.world_size == 1, "Only single-card testing is supported now"
+
+ self.tscale = tscale
+ self.dscale = dscale
+ self.file_path = file_path
+ self.ground_truth_filename = ground_truth_filename
+ self.subset = subset
+ self.output_path = output_path
+ self.result_path = result_path
+ self.get_metrics = get_metrics
+
+ if not os.path.isdir(self.output_path):
+ os.makedirs(self.output_path)
+ if not os.path.isdir(self.result_path):
+ os.makedirs(self.result_path)
+
+ self.video_dict, self.video_list = self.get_dataset_dict(
+ self.file_path, self.subset)
+
+ def get_dataset_dict(self, file_path, subset):
+ annos = json.load(open(file_path))
+ video_dict = {}
+ for video_name in annos.keys():
+ video_subset = annos[video_name]["subset"]
+ if subset in video_subset:
+ video_dict[video_name] = annos[video_name]
+ video_list = list(video_dict.keys())
+ video_list.sort()
+ return video_dict, video_list
+
+ def update(self, batch_id, data, outputs):
+ """update metrics during each iter
+ """
+ fid = data[4].numpy()
+ pred_bm, pred_start, pred_end = outputs
+ pred_bm = pred_bm.numpy()
+ pred_start = pred_start[0].numpy()
+ pred_end = pred_end[0].numpy()
+
+ snippet_xmins = [1.0 / self.tscale * i for i in range(self.tscale)]
+ snippet_xmaxs = [
+ 1.0 / self.tscale * i for i in range(1, self.tscale + 1)
+ ]
+ cols = ["xmin", "xmax", "score"]
+
+ video_name = self.video_list[fid[0]]
+ pred_bm = pred_bm[0, 0, :, :] * pred_bm[0, 1, :, :]
+ start_mask = boundary_choose(pred_start)
+ start_mask[0] = 1.
+ end_mask = boundary_choose(pred_end)
+ end_mask[-1] = 1.
+ score_vector_list = []
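+        # enumerate all (start, duration) pairs; keep those whose start and
+        # end are selected by the boundary masks and score them with
+        # start_prob * end_prob * bm_confidence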
+ for idx in range(self.dscale):
+ for jdx in range(self.tscale):
+ start_index = jdx
+ end_index = start_index + idx
+ if end_index < self.tscale and start_mask[
+ start_index] == 1 and end_mask[end_index] == 1:
+ xmin = snippet_xmins[start_index]
+ xmax = snippet_xmaxs[end_index]
+ xmin_score = pred_start[start_index]
+ xmax_score = pred_end[end_index]
+ bm_score = pred_bm[idx, jdx]
+ conf_score = xmin_score * xmax_score * bm_score
+ score_vector_list.append([xmin, xmax, conf_score])
+
+ score_vector_list = np.stack(score_vector_list)
+ video_df = pd.DataFrame(score_vector_list, columns=cols)
+ video_df.to_csv(os.path.join(self.output_path, "%s.csv" % video_name),
+ index=False)
+
+ if batch_id % self.log_interval == 0:
+ logger.info("Processing................ batch {}".format(batch_id))
+
+ def accumulate(self):
+ """accumulate metrics when finished all iters.
+ """
+ # check clip index of each video
+ #Stage1
+ self.bmn_post_processing(self.video_dict, self.subset, self.output_path,
+ self.result_path)
+ if self.get_metrics:
+ logger.info("[TEST] calculate metrics...")
+ #Stage2
+ uniform_average_nr_proposals_valid, uniform_average_recall_valid, uniform_recall_valid = self.cal_metrics(
+ self.ground_truth_filename,
+ os.path.join(self.result_path, "bmn_results_validation.json"),
+ max_avg_nr_proposals=100,
+ tiou_thresholds=np.linspace(0.5, 0.95, 10),
+ subset='validation')
+ logger.info("AR@1; AR@5; AR@10; AR@100")
+ logger.info("%.02f %.02f %.02f %.02f" %
+ (100 * np.mean(uniform_recall_valid[:, 0]),
+ 100 * np.mean(uniform_recall_valid[:, 4]),
+ 100 * np.mean(uniform_recall_valid[:, 9]),
+ 100 * np.mean(uniform_recall_valid[:, -1])))
+
+ def bmn_post_processing(self, video_dict, subset, output_path, result_path):
+ video_list = list(video_dict.keys())
+ global result_dict
+ result_dict = mp.Manager().dict()
+ pp_num = 12
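+        # split the videos evenly over pp_num worker processes; the last
+        # process also takes the remainder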
+
+ num_videos = len(video_list)
+ num_videos_per_thread = int(num_videos / pp_num)
+ processes = []
+ for tid in range(pp_num - 1):
+ tmp_video_list = video_list[tid * num_videos_per_thread:(tid + 1) *
+ num_videos_per_thread]
+ p = mp.Process(target=self.video_process,
+ args=(tmp_video_list, video_dict, output_path,
+ result_dict))
+ p.start()
+ processes.append(p)
+ tmp_video_list = video_list[(pp_num - 1) * num_videos_per_thread:]
+ p = mp.Process(target=self.video_process,
+ args=(tmp_video_list, video_dict, output_path,
+ result_dict))
+ p.start()
+ processes.append(p)
+ for p in processes:
+ p.join()
+
+ result_dict = dict(result_dict)
+ output_dict = {
+ "version": "VERSION 1.3",
+ "results": result_dict,
+ "external_data": {}
+ }
+ outfile = open(
+ os.path.join(result_path, "bmn_results_%s.json" % subset), "w")
+
+        # ensure_ascii=False keeps non-ASCII (e.g. Chinese) video names readable
+        json.dump(output_dict, outfile, ensure_ascii=False)
+ outfile.close()
+
+ def video_process(self,
+ video_list,
+ video_dict,
+ output_path,
+ result_dict,
+ snms_alpha=0.4,
+ snms_t1=0.55,
+ snms_t2=0.9):
+
+ for video_name in video_list:
+ logger.info("Processing video........" + video_name)
+ df = pd.read_csv(os.path.join(output_path, video_name + ".csv"))
+ if len(df) > 1:
+ df = soft_nms(df, snms_alpha, snms_t1, snms_t2)
+
+ video_duration = video_dict[video_name]["duration_second"]
+ proposal_list = []
+ for idx in range(min(100, len(df))):
+ tmp_prop={"score":df.score.values[idx], \
+ "segment":[max(0,df.xmin.values[idx])*video_duration, \
+ min(1,df.xmax.values[idx])*video_duration]}
+ proposal_list.append(tmp_prop)
+
+ video_name = video_name[2:] if video_name[:2] == 'v_' else video_name
+ result_dict[video_name] = proposal_list
+
+ def cal_metrics(self,
+ ground_truth_filename,
+ proposal_filename,
+ max_avg_nr_proposals=100,
+ tiou_thresholds=np.linspace(0.5, 0.95, 10),
+ subset='validation'):
+
+ anet_proposal = ANETproposal(ground_truth_filename,
+ proposal_filename,
+ tiou_thresholds=tiou_thresholds,
+ max_avg_nr_proposals=max_avg_nr_proposals,
+ subset=subset,
+ verbose=True,
+ check_status=False)
+ anet_proposal.evaluate()
+ recall = anet_proposal.recall
+ average_recall = anet_proposal.avg_recall
+ average_nr_proposals = anet_proposal.proposals_per_video
+
+ return (average_nr_proposals, average_recall, recall)
diff --git a/paddlevideo/metrics/build.py b/paddlevideo/metrics/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..82e4b502611f3190e5f2c67d9e70fd2201be8233
--- /dev/null
+++ b/paddlevideo/metrics/build.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .registry import METRIC
+from ..utils import build
+
+
+def build_metric(cfg):
+ return build(cfg, METRIC)
diff --git a/paddlevideo/metrics/center_crop_metric.py b/paddlevideo/metrics/center_crop_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3bfb1b23f7ddd23f7e9563b7e8313776987f064
--- /dev/null
+++ b/paddlevideo/metrics/center_crop_metric.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import numpy as np
+import paddle
+from paddle.hapi.model import _all_gather
+
+from .registry import METRIC
+from .base import BaseMetric
+from paddlevideo.utils import get_logger
+logger = get_logger("paddlevideo")
+
+
+@METRIC.register
+class CenterCropMetric(BaseMetric):
+ def __init__(self, data_size, batch_size, log_interval=1):
+ """prepare for metrics
+ """
+ super().__init__(data_size, batch_size, log_interval)
+ self.top1 = []
+ self.top5 = []
+
+ def update(self, batch_id, data, outputs):
+ """update metrics during each iter
+ """
+ labels = data[1]
+
+ top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1)
+ top5 = paddle.metric.accuracy(input=outputs, label=labels, k=5)
+        # NOTE(shipping): handle multi-card validation
+ if self.world_size > 1:
+ top1 = paddle.distributed.all_reduce(
+ top1, op=paddle.distributed.ReduceOp.SUM) / self.world_size
+ top5 = paddle.distributed.all_reduce(
+ top5, op=paddle.distributed.ReduceOp.SUM) / self.world_size
+
+ self.top1.append(top1.numpy())
+ self.top5.append(top5.numpy())
+ if batch_id % self.log_interval == 0:
+ logger.info("[TEST] Processing batch {}/{} ...".format(
+ batch_id,
+ self.data_size // (self.batch_size * self.world_size)))
+
+ def accumulate(self):
+ """accumulate metrics when finished all iters.
+ """
+ logger.info('[TEST] finished, avg_acc1= {}, avg_acc5= {} '.format(
+ np.mean(np.array(self.top1)), np.mean(np.array(self.top5))))
diff --git a/paddlevideo/metrics/center_crop_metric_MRI.py b/paddlevideo/metrics/center_crop_metric_MRI.py
new file mode 100644
index 0000000000000000000000000000000000000000..843a9c36a3eca77feaf4463323d4a25129361e39
--- /dev/null
+++ b/paddlevideo/metrics/center_crop_metric_MRI.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import numpy as np
+import paddle
+from paddle.hapi.model import _all_gather
+
+from .registry import METRIC
+from .base import BaseMetric
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@METRIC.register
+class CenterCropMetric_MRI(BaseMetric):
+ def __init__(self, data_size, batch_size, log_interval=1, if_slowfast=0):
+ """prepare for metrics
+ """
+ super().__init__(data_size, batch_size, log_interval)
+ self.top1 = []
+ self.if_slowfast = if_slowfast
+
+ def update(self, batch_id, data, outputs):
+ """update metrics during each iter
+ """
+ labels = data[1]
+
+ if self.if_slowfast:
+ labels = data[2]
+
+ top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1)
+ #top5 = paddle.metric.accuracy(input=outputs, label=labels, k=5)
+        # NOTE(shipping): handle multi-card validation
+ if self.world_size > 1:
+ top1 = paddle.distributed.all_reduce(
+ top1, op=paddle.distributed.ReduceOp.SUM) / self.world_size
+ # top5 = paddle.distributed.all_reduce(
+ # top5, op=paddle.distributed.ReduceOp.SUM) / self.world_size
+
+ self.top1.append(top1.numpy())
+ #self.top5.append(top5.numpy())
+ if batch_id % self.log_interval == 0:
+ logger.info("[TEST] Processing batch {}/{} ...".format(
+ batch_id,
+ self.data_size // (self.batch_size * self.world_size)))
+
+ def accumulate(self):
+ """accumulate metrics when finished all iters.
+ """
+ logger.info('[TEST] finished, avg_acc1= {}'.format(
+ np.mean(np.array(self.top1))))
diff --git a/paddlevideo/metrics/depth_metric.py b/paddlevideo/metrics/depth_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..c160e16bae02397bacd8bc667421d29ddf402853
--- /dev/null
+++ b/paddlevideo/metrics/depth_metric.py
@@ -0,0 +1,77 @@
+import numpy as np
+import paddle
+from paddlevideo.utils import get_logger
+
+from .base import BaseMetric
+from .registry import METRIC
+
+logger = get_logger("paddlevideo")
+
+
+@METRIC.register
+class DepthMetric(BaseMetric):
+ def __init__(self, data_size, batch_size, log_interval=1):
+ """prepare for metrics
+ """
+ super().__init__(data_size, batch_size, log_interval)
+ self.abs_rel = []
+ self.sq_rel = []
+ self.rmse = []
+ self.rmse_log = []
+ self.a1 = []
+ self.a2 = []
+ self.a3 = []
+
+ def update(self, batch_id, data, outputs):
+ """update metrics during each iter
+ """
+        abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = \
+            outputs['abs_rel'], outputs['sq_rel'], outputs['rmse'], \
+            outputs['rmse_log'], outputs['a1'], outputs['a2'], outputs['a3']
+        # average the metrics across cards when running multi-card test
+ if self.world_size > 1:
+ abs_rel = paddle.distributed.all_reduce(
+ outputs['abs_rel'],
+ op=paddle.distributed.ReduceOp.SUM) / self.world_size
+ sq_rel = paddle.distributed.all_reduce(
+ outputs['sq_rel'],
+ op=paddle.distributed.ReduceOp.SUM) / self.world_size
+ rmse = paddle.distributed.all_reduce(
+ outputs['rmse'],
+ op=paddle.distributed.ReduceOp.SUM) / self.world_size
+ rmse_log = paddle.distributed.all_reduce(
+ outputs['rmse_log'],
+ op=paddle.distributed.ReduceOp.SUM) / self.world_size
+ a1 = paddle.distributed.all_reduce(
+ outputs['a1'],
+ op=paddle.distributed.ReduceOp.SUM) / self.world_size
+ a2 = paddle.distributed.all_reduce(
+ outputs['a2'],
+ op=paddle.distributed.ReduceOp.SUM) / self.world_size
+ a3 = paddle.distributed.all_reduce(
+ outputs['a3'],
+ op=paddle.distributed.ReduceOp.SUM) / self.world_size
+
+ self.abs_rel.append(abs_rel)
+ self.sq_rel.append(sq_rel)
+ self.rmse.append(rmse)
+ self.rmse_log.append(rmse_log)
+ self.a1.append(a1)
+ self.a2.append(a2)
+ self.a3.append(a3)
+ if batch_id % self.log_interval == 0:
+ logger.info("[TEST] Processing batch {}/{} ...".format(
+ batch_id,
+ self.data_size // (self.batch_size * self.world_size)))
+
+ def accumulate(self):
+ """accumulate metrics when finished all iters.
+ """
+ logger.info(
+ '[TEST] finished, abs_rel= {}, sq_rel= {} , rmse= {}, rmse_log= {},'
+ 'a1= {}, a2= {}, a3= {}'.format(np.mean(np.array(self.abs_rel)),
+ np.mean(np.array(self.sq_rel)),
+ np.mean(np.array(self.rmse)),
+ np.mean(np.array(self.rmse_log)),
+ np.mean(np.array(self.a1)),
+ np.mean(np.array(self.a2)),
+ np.mean(np.array(self.a3))))
diff --git a/paddlevideo/metrics/msrvtt_metric.py b/paddlevideo/metrics/msrvtt_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec452f2e4d07f0b54de98330a2eca8fe3e6f7187
--- /dev/null
+++ b/paddlevideo/metrics/msrvtt_metric.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from paddle.hapi.model import _all_gather
+
+from .registry import METRIC
+from .base import BaseMetric
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@METRIC.register
+class MSRVTTMetric(BaseMetric):
+ def __init__(self, data_size, batch_size, log_interval=1):
+ """prepare for metrics
+ """
+ super().__init__(data_size, batch_size, log_interval)
+ self.score_matrix = np.zeros((data_size, data_size))
+ self.target_matrix = np.zeros((data_size, data_size))
+ self.rank_matrix = np.ones((data_size)) * data_size
+
+ def update(self, batch_id, data, outputs):
+ """update metrics during each iter
+ """
+ target = data[-1]
+ cm_logit = outputs[-1]
+
+ self.score_matrix[batch_id, :] = F.softmax(
+ cm_logit, axis=1)[:, 0].reshape([-1]).numpy()
+ self.target_matrix[batch_id, :] = target.reshape([-1]).numpy()
+
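+        # rank of the ground-truth candidate: its position after sorting this
+        # row's matching scores in descending order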
+ rank = np.where((np.argsort(-self.score_matrix[batch_id]) == np.where(
+ self.target_matrix[batch_id] == 1)[0][0]) == 1)[0][0]
+ self.rank_matrix[batch_id] = rank
+
+ rank_matrix_tmp = self.rank_matrix[:batch_id + 1]
+ r1 = 100.0 * np.sum(rank_matrix_tmp < 1) / len(rank_matrix_tmp)
+ r5 = 100.0 * np.sum(rank_matrix_tmp < 5) / len(rank_matrix_tmp)
+ r10 = 100.0 * np.sum(rank_matrix_tmp < 10) / len(rank_matrix_tmp)
+
+ medr = np.floor(np.median(rank_matrix_tmp) + 1)
+ meanr = np.mean(rank_matrix_tmp) + 1
+ logger.info(
+ "[{}] Final r1:{:.3f}, r5:{:.3f}, r10:{:.3f}, mder:{:.3f}, meanr:{:.3f}"
+ .format(batch_id, r1, r5, r10, medr, meanr))
+
+ def accumulate(self):
+ """accumulate metrics when finished all iters.
+ """
+ logger.info("Eval Finished!")
diff --git a/paddlevideo/metrics/multi_crop_metric.py b/paddlevideo/metrics/multi_crop_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cad6679d63f6d4bb092b5b9d222c41086bbc037
--- /dev/null
+++ b/paddlevideo/metrics/multi_crop_metric.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import numpy as np
+import paddle
+from paddle.hapi.model import _all_gather
+
+from paddlevideo.utils import get_logger
+from .registry import METRIC
+from .base import BaseMetric
+
+logger = get_logger("paddlevideo")
+""" An example for metrics class.
+ MultiCropMetric for slowfast.
+"""
+
+
+@METRIC.register
+class MultiCropMetric(BaseMetric):
+ def __init__(self,
+ data_size,
+ batch_size,
+ num_ensemble_views,
+ num_spatial_crops,
+ num_classes,
+ log_interval=1):
+ """prepare for metrics
+ """
+ super().__init__(data_size, batch_size, log_interval)
+ self.num_ensemble_views = num_ensemble_views
+ self.num_spatial_crops = num_spatial_crops
+ self.num_classes = num_classes
+
+ self.num_clips = self.num_ensemble_views * self.num_spatial_crops
+ num_videos = self.data_size // self.num_clips
+ self.video_preds = np.zeros((num_videos, self.num_classes))
+ self.video_labels = np.zeros((num_videos, 1), dtype="int64")
+ self.clip_count = {}
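+        # each video is tested num_ensemble_views * num_spatial_crops times;
+        # clip-level predictions are summed into video_preds before computing
+        # the final top-1 / top-5 accuracy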
+
+ def update(self, batch_id, data, outputs):
+ """update metrics during each iter
+ """
+ labels = data[2]
+ clip_ids = data[3]
+
+        # gather results across cards; the processing below is identical on each card
+ if self.world_size > 1:
+ outputs = _all_gather(outputs, self.world_size)
+ labels = _all_gather(labels, self.world_size)
+ clip_ids = _all_gather(clip_ids, self.world_size)
+
+ # to numpy
+ preds = outputs.numpy()
+ labels = labels.numpy().astype("int64")
+ clip_ids = clip_ids.numpy()
+
+ # preds ensemble
+ for ind in range(preds.shape[0]):
+ vid_id = int(clip_ids[ind]) // self.num_clips
+ ts_idx = int(clip_ids[ind]) % self.num_clips
+ if vid_id not in self.clip_count:
+ self.clip_count[vid_id] = []
+ if ts_idx in self.clip_count[vid_id]:
+ logger.info(
+ "[TEST] Passed!! read video {} clip index {} / {} repeatedly."
+ .format(vid_id, ts_idx, clip_ids[ind]))
+ else:
+ self.clip_count[vid_id].append(ts_idx)
+ self.video_preds[vid_id] += preds[ind] # ensemble method: sum
+ if self.video_labels[vid_id].sum() > 0:
+ assert self.video_labels[vid_id] == labels[ind]
+ self.video_labels[vid_id] = labels[ind]
+ if batch_id % self.log_interval == 0:
+ logger.info("[TEST] Processing batch {}/{} ...".format(
+ batch_id,
+ self.data_size // (self.batch_size * self.world_size)))
+
+ def accumulate(self):
+ """accumulate metrics when finished all iters.
+ """
+ # check clip index of each video
+ for key in self.clip_count.keys():
+ if len(self.clip_count[key]) != self.num_clips or sum(
+ self.clip_count[key]) != self.num_clips * (self.num_clips -
+ 1) / 2:
+ logger.info(
+ "[TEST] Count Error!! video [{}] clip count [{}] not match number clips {}"
+ .format(key, self.clip_count[key], self.num_clips))
+
+ video_preds = paddle.to_tensor(self.video_preds)
+ video_labels = paddle.to_tensor(self.video_labels)
+ acc_top1 = paddle.metric.accuracy(input=video_preds,
+ label=video_labels,
+ k=1)
+ acc_top5 = paddle.metric.accuracy(input=video_preds,
+ label=video_labels,
+ k=5)
+ logger.info('[TEST] finished, avg_acc1= {}, avg_acc5= {} '.format(
+ acc_top1.numpy(), acc_top5.numpy()))
diff --git a/paddlevideo/metrics/recall.py b/paddlevideo/metrics/recall.py
new file mode 100644
index 0000000000000000000000000000000000000000..3612e2244d430c593a859694f8b2c7abfb63134f
--- /dev/null
+++ b/paddlevideo/metrics/recall.py
@@ -0,0 +1,84 @@
+import numpy as np
+import paddle
+
+def _recalls(all_ious, proposal_nums, thrs):
+
+ img_num = all_ious.shape[0]
+ total_gt_num = sum([ious.shape[0] for ious in all_ious])
+
+ ious_ = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32)
+ for k, proposal_num in enumerate(proposal_nums):
+ tmp_ious = np.zeros(0)
+ for i in range(img_num):
+ ious = all_ious[i][:, :proposal_num].copy()
+ gt_ious = np.zeros(ious.shape[0])
+ if ious.size == 0:
+ tmp_ious = np.hstack((tmp_ious, gt_ious))
+ continue
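+            # greedy matching: repeatedly take the (gt, proposal) pair with
+            # the highest remaining IoU, record it for that gt, and invalidate
+            # the matched row and column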
+ for j in range(ious.shape[0]):
+ gt_max_overlaps = ious.argmax(axis=1)
+ max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps]
+ gt_idx = max_ious.argmax()
+ gt_ious[j] = max_ious[gt_idx]
+ box_idx = gt_max_overlaps[gt_idx]
+ ious[gt_idx, :] = -1
+ ious[:, box_idx] = -1
+ tmp_ious = np.hstack((tmp_ious, gt_ious))
+ ious_[k, :] = tmp_ious
+
+ ious_ = np.fliplr(np.sort(ious_, axis=1))
+ recalls = np.zeros((proposal_nums.size, thrs.size))
+ for i, thr in enumerate(thrs):
+ recalls[:, i] = (ious_ >= thr).sum(axis=1) / float(total_gt_num)
+
+ return recalls
+
+
+def set_recall_param(proposal_nums, iou_thrs):
+ if isinstance(proposal_nums, list):
+ proposal_nums_ = np.array(proposal_nums)
+ elif isinstance(proposal_nums, int):
+ proposal_nums_ = np.array([proposal_nums])
+ else:
+ proposal_nums_ = proposal_nums
+
+ if iou_thrs is None:
+ _iou_thrs = np.array([0.5])
+ elif isinstance(iou_thrs, list):
+ _iou_thrs = np.array(iou_thrs)
+ elif isinstance(iou_thrs, float):
+ _iou_thrs = np.array([iou_thrs])
+ else:
+ _iou_thrs = iou_thrs
+
+ return proposal_nums_, _iou_thrs
+
+
+def eval_recalls(gts, proposals, proposal_nums=None, iou_thrs=None):
+ """Calculate recalls. """
+ img_num = len(gts)
+ assert img_num == len(proposals)
+
+ proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs)
+
+ all_ious = []
+ for i in range(img_num):
+ if proposals[i].ndim == 2 and proposals[i].shape[1] == 5:
+ scores = proposals[i][:, 4]
+ sort_idx = np.argsort(scores)[::-1]
+ img_proposal = proposals[i][sort_idx, :]
+ else:
+ img_proposal = proposals[i]
+
+ prop_num = min(img_proposal.shape[0], proposal_nums[-1])
+ if gts[i] is None or gts[i].shape[0] == 0:
+ ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32)
+        else:
+            # pairwise IoU matrix [num_gt, num_proposals], computed with numpy
+            gt = gts[i].astype(np.float32)
+            prop = img_proposal[:prop_num, :4].astype(np.float32)
+            lt = np.maximum(gt[:, None, :2], prop[None, :, :2])
+            rb = np.minimum(gt[:, None, 2:], prop[None, :, 2:])
+            inter = np.prod(np.clip(rb - lt, 0, None), axis=2)
+            union = np.prod(gt[:, 2:] - gt[:, :2], axis=1)[:, None] + np.prod(
+                prop[:, 2:] - prop[:, :2], axis=1)[None, :] - inter
+            ious = inter / np.maximum(union, np.finfo(np.float32).eps)
+ all_ious.append(ious)
+ all_ious = np.array(all_ious)
+ recalls = _recalls(all_ious, proposal_nums, iou_thrs)
+ return recalls
diff --git a/paddlevideo/metrics/registry.py b/paddlevideo/metrics/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..221444023345011cfe6f0922fa939a635b46d738
--- /dev/null
+++ b/paddlevideo/metrics/registry.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..utils import Registry
+
+METRIC = Registry('metric')
diff --git a/paddlevideo/metrics/segmentation_metric.py b/paddlevideo/metrics/segmentation_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..3719450e4a12a825468a077c1f1084c849f36d9b
--- /dev/null
+++ b/paddlevideo/metrics/segmentation_metric.py
@@ -0,0 +1,389 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import numpy as np
+import argparse
+import pandas as pd
+
+from .registry import METRIC
+from .base import BaseMetric
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+def get_labels_scores_start_end_time(input_np,
+ frame_wise_labels,
+ actions_dict,
+ bg_class=["background", "None"]):
+ labels = []
+ starts = []
+ ends = []
+ scores = []
+
+ boundary_score_ptr = 0
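+    # boundary_score_ptr tracks the segment currently being closed; each
+    # segment's score is the mean class probability over its frames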
+
+ last_label = frame_wise_labels[0]
+ if frame_wise_labels[0] not in bg_class:
+ labels.append(frame_wise_labels[0])
+ starts.append(0)
+ for i in range(len(frame_wise_labels)):
+ if frame_wise_labels[i] != last_label:
+ if frame_wise_labels[i] not in bg_class:
+ labels.append(frame_wise_labels[i])
+ starts.append(i)
+ if last_label not in bg_class:
+ ends.append(i)
+ score = np.mean(
+ input_np[actions_dict[labels[boundary_score_ptr]], \
+ starts[boundary_score_ptr]:(ends[boundary_score_ptr] + 1)]
+ )
+ scores.append(score)
+ boundary_score_ptr = boundary_score_ptr + 1
+ last_label = frame_wise_labels[i]
+ if last_label not in bg_class:
+ ends.append(i + 1)
+ score = np.mean(
+ input_np[actions_dict[labels[boundary_score_ptr]], \
+ starts[boundary_score_ptr]:(ends[boundary_score_ptr] + 1)]
+ )
+ scores.append(score)
+ boundary_score_ptr = boundary_score_ptr + 1
+
+ return labels, starts, ends, scores
+
+
+def get_labels_start_end_time(frame_wise_labels,
+ bg_class=["background", "None"]):
+ labels = []
+ starts = []
+ ends = []
+ last_label = frame_wise_labels[0]
+ if frame_wise_labels[0] not in bg_class:
+ labels.append(frame_wise_labels[0])
+ starts.append(0)
+ for i in range(len(frame_wise_labels)):
+ if frame_wise_labels[i] != last_label:
+ if frame_wise_labels[i] not in bg_class:
+ labels.append(frame_wise_labels[i])
+ starts.append(i)
+ if last_label not in bg_class:
+ ends.append(i)
+ last_label = frame_wise_labels[i]
+ if last_label not in bg_class:
+ ends.append(i + 1)
+ return labels, starts, ends
+
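+# Example: get_labels_start_end_time(["bg", "a", "a", "b"], bg_class=["bg"])
+# returns (["a", "b"], [1, 3], [3, 4]) -- one (label, start, end) triple per
+# non-background segment.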
+
+def levenstein(p, y, norm=False):
+ m_row = len(p)
+ n_col = len(y)
+    D = np.zeros([m_row + 1, n_col + 1], dtype=np.float64)
+ for i in range(m_row + 1):
+ D[i, 0] = i
+ for i in range(n_col + 1):
+ D[0, i] = i
+
+ for j in range(1, n_col + 1):
+ for i in range(1, m_row + 1):
+ if y[j - 1] == p[i - 1]:
+ D[i, j] = D[i - 1, j - 1]
+ else:
+ D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1,
+ D[i - 1, j - 1] + 1)
+
+ if norm:
+ score = (1 - D[-1, -1] / max(m_row, n_col)) * 100
+ else:
+ score = D[-1, -1]
+
+ return score
+
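+# Example: levenstein(["a", "b"], ["a", "c"]) == 1.0 (one substitution);
+# with norm=True the score becomes (1 - 1/2) * 100 = 50.0.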
+
+def edit_score(recognized,
+ ground_truth,
+ norm=True,
+ bg_class=["background", "None"]):
+ P, _, _ = get_labels_start_end_time(recognized, bg_class)
+ Y, _, _ = get_labels_start_end_time(ground_truth, bg_class)
+ return levenstein(P, Y, norm)
+
+
+def f_score(recognized, ground_truth, overlap, bg_class=["background", "None"]):
+ p_label, p_start, p_end = get_labels_start_end_time(recognized, bg_class)
+ y_label, y_start, y_end = get_labels_start_end_time(ground_truth, bg_class)
+
+ tp = 0
+ fp = 0
+
+ hits = np.zeros(len(y_label))
+
+ for j in range(len(p_label)):
+ intersection = np.minimum(p_end[j], y_end) - np.maximum(
+ p_start[j], y_start)
+ union = np.maximum(p_end[j], y_end) - np.minimum(p_start[j], y_start)
+ IoU = (1.0 * intersection / union) * (
+ [p_label[j] == y_label[x] for x in range(len(y_label))])
+ # Get the best scoring segment
+ idx = np.array(IoU).argmax()
+
+ if IoU[idx] >= overlap and not hits[idx]:
+ tp += 1
+ hits[idx] = 1
+ else:
+ fp += 1
+ fn = len(y_label) - sum(hits)
+ return float(tp), float(fp), float(fn)
+
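+# Example: f_score(["a", "a", "b", "b"], ["a", "a", "b", "b"], overlap=0.5)
+# returns (2.0, 0.0, 0.0): both predicted segments match a ground-truth
+# segment with IoU >= 0.5, so tp=2, fp=0, fn=0.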
+
+def boundary_AR(pred_boundary, gt_boundary, overlap_list, max_proposal):
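+    """Average recall of predicted boundaries at a given number of proposals.
+
+    Predicted segments are sorted by score, then padded or truncated to
+    `max_proposal` proposals; recall is computed for every IoU threshold in
+    `overlap_list` and averaged (the AR@AN value used by SegmentationMetric).
+    """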
+
+ p_label, p_start, p_end, p_scores = pred_boundary
+ y_label, y_start, y_end, _ = gt_boundary
+
+ # sort proposal
+ pred_dict = {
+ "label": p_label,
+ "start": p_start,
+ "end": p_end,
+ "scores": p_scores
+ }
+ pdf = pd.DataFrame(pred_dict)
+ pdf = pdf.sort_values(by="scores", ascending=False)
+ p_label = list(pdf["label"])
+ p_start = list(pdf["start"])
+ p_end = list(pdf["end"])
+ p_scores = list(pdf["scores"])
+
+ # refine AN
+ if len(p_label) < max_proposal and len(p_label) > 0:
+ p_label = p_label + [p_label[-1]] * (max_proposal - len(p_label))
+ p_start = p_start + [p_start[-1]] * (max_proposal - len(p_start))
+ p_end = p_end + [p_end[-1]] * (max_proposal - len(p_end))
+ p_scores = p_scores + [p_scores[-1]] * (max_proposal - len(p_scores))
+ elif len(p_label) > max_proposal:
+ p_label[max_proposal:] = []
+ p_start[max_proposal:] = []
+ p_end[max_proposal:] = []
+ p_scores[max_proposal:] = []
+
+ t_AR = np.zeros(len(overlap_list))
+
+ for i in range(len(overlap_list)):
+ overlap = overlap_list[i]
+
+ tp = 0
+ fp = 0
+ hits = np.zeros(len(y_label))
+
+ for j in range(len(p_label)):
+ intersection = np.minimum(p_end[j], y_end) - np.maximum(
+ p_start[j], y_start)
+ union = np.maximum(p_end[j], y_end) - np.minimum(
+ p_start[j], y_start)
+ IoU = (1.0 * intersection / union)
+ # Get the best scoring segment
+ idx = np.array(IoU).argmax()
+
+ if IoU[idx] >= overlap and not hits[idx]:
+ tp += 1
+ hits[idx] = 1
+ else:
+ fp += 1
+ fn = len(y_label) - sum(hits)
+
+ recall = float(tp) / (float(tp) + float(fn))
+ t_AR[i] = recall
+
+ AR = np.mean(t_AR)
+ return AR
+
+
+@METRIC.register
+class SegmentationMetric(BaseMetric):
+ """
+    Metric for video segmentation models.
+ """
+
+ def __init__(self,
+ data_size,
+ batch_size,
+ overlap,
+ actions_map_file_path,
+ log_interval=1,
+ tolerance=5,
+ boundary_threshold=0.7,
+ max_proposal=100):
+ """prepare for metrics
+ """
+ super().__init__(data_size, batch_size, log_interval)
+ # actions dict generate
+ file_ptr = open(actions_map_file_path, 'r')
+ actions = file_ptr.read().split('\n')[:-1]
+ file_ptr.close()
+ self.actions_dict = dict()
+ for a in actions:
+ self.actions_dict[a.split()[1]] = int(a.split()[0])
+
+ # cls score
+ self.overlap = overlap
+ self.overlap_len = len(overlap)
+
+ self.cls_tp = np.zeros(self.overlap_len)
+ self.cls_fp = np.zeros(self.overlap_len)
+ self.cls_fn = np.zeros(self.overlap_len)
+ self.total_correct = 0
+ self.total_edit = 0
+ self.total_frame = 0
+ self.total_video = 0
+
+ # boundary score
+ self.max_proposal = max_proposal
+ self.AR_at_AN = [[] for _ in range(max_proposal)]
+
+ def update(self, batch_id, data, outputs):
+ """update metrics during each iter
+ """
+ groundTruth = data[1]
+
+ predicted = outputs['predict']
+ output_np = outputs['output_np']
+
+ outputs_np = predicted.numpy()
+ outputs_arr = output_np.numpy()[0, :]
+ gt_np = groundTruth.numpy()[0, :]
+
+ recognition = []
+ for i in range(outputs_np.shape[0]):
+ recognition = np.concatenate((recognition, [
+ list(self.actions_dict.keys())[list(
+ self.actions_dict.values()).index(outputs_np[i])]
+ ]))
+ recog_content = list(recognition)
+
+ gt_content = []
+ for i in range(gt_np.shape[0]):
+ gt_content = np.concatenate((gt_content, [
+ list(self.actions_dict.keys())[list(
+ self.actions_dict.values()).index(gt_np[i])]
+ ]))
+ gt_content = list(gt_content)
+
+ pred_boundary = get_labels_scores_start_end_time(
+ outputs_arr, recog_content, self.actions_dict)
+ gt_boundary = get_labels_scores_start_end_time(
+ np.ones(outputs_arr.shape), gt_content, self.actions_dict)
+
+ # cls score
+ correct = 0
+ total = 0
+ edit = 0
+
+ for i in range(len(gt_content)):
+ total += 1
+ #accumulate
+ self.total_frame += 1
+
+ if gt_content[i] == recog_content[i]:
+ correct += 1
+ #accumulate
+ self.total_correct += 1
+
+ edit_num = edit_score(recog_content, gt_content)
+ edit += edit_num
+ self.total_edit += edit_num
+
+ for s in range(self.overlap_len):
+ tp1, fp1, fn1 = f_score(recog_content, gt_content, self.overlap[s])
+
+ # accumulate
+ self.cls_tp[s] += tp1
+ self.cls_fp[s] += fp1
+ self.cls_fn[s] += fn1
+
+ # accumulate
+ self.total_video += 1
+
+ # proposal score
+ for AN in range(self.max_proposal):
+ AR = boundary_AR(pred_boundary,
+ gt_boundary,
+ self.overlap,
+ max_proposal=(AN + 1))
+ self.AR_at_AN[AN].append(AR)
+
+ def accumulate(self):
+ """accumulate metrics when finished all iters.
+ """
+ # cls metric
+ Acc = 100 * float(self.total_correct) / self.total_frame
+ Edit = (1.0 * self.total_edit) / self.total_video
+ Fscore = dict()
+ for s in range(self.overlap_len):
+ precision = self.cls_tp[s] / float(self.cls_tp[s] + self.cls_fp[s])
+ recall = self.cls_tp[s] / float(self.cls_tp[s] + self.cls_fn[s])
+
+ f1 = 2.0 * (precision * recall) / (precision + recall)
+
+ f1 = np.nan_to_num(f1) * 100
+ Fscore[self.overlap[s]] = f1
+
+ # proposal metric
+ proposal_AUC = np.array(self.AR_at_AN) * 100
+ AUC = np.mean(proposal_AUC)
+ AR_at_AN1 = np.mean(proposal_AUC[0, :])
+ AR_at_AN5 = np.mean(proposal_AUC[4, :])
+ AR_at_AN15 = np.mean(proposal_AUC[14, :])
+
+ # log metric
+ log_mertic_info = "dataset model performence: "
+ # preds ensemble
+ log_mertic_info += "Acc: {:.4f}, ".format(Acc)
+ log_mertic_info += 'Edit: {:.4f}, '.format(Edit)
+ for s in range(len(self.overlap)):
+ log_mertic_info += 'F1@{:0.2f}: {:.4f}, '.format(
+ self.overlap[s], Fscore[self.overlap[s]])
+
+ # boundary metric
+ log_mertic_info += "Auc: {:.4f}, ".format(AUC)
+ log_mertic_info += "AR@AN1: {:.4f}, ".format(AR_at_AN1)
+ log_mertic_info += "AR@AN5: {:.4f}, ".format(AR_at_AN5)
+ log_mertic_info += "AR@AN15: {:.4f}, ".format(AR_at_AN15)
+ logger.info(log_mertic_info)
+
+ # log metric
+ metric_dict = dict()
+ metric_dict['Acc'] = Acc
+ metric_dict['Edit'] = Edit
+ for s in range(len(self.overlap)):
+ metric_dict['F1@{:0.2f}'.format(
+ self.overlap[s])] = Fscore[self.overlap[s]]
+ metric_dict['Auc'] = AUC
+ metric_dict['AR@AN1'] = AR_at_AN1
+ metric_dict['AR@AN5'] = AR_at_AN5
+ metric_dict['AR@AN15'] = AR_at_AN15
+
+ # clear for next epoch
+ # cls
+ self.cls_tp = np.zeros(self.overlap_len)
+ self.cls_fp = np.zeros(self.overlap_len)
+ self.cls_fn = np.zeros(self.overlap_len)
+ self.total_correct = 0
+ self.total_edit = 0
+ self.total_frame = 0
+ self.total_video = 0
+ # proposal
+ self.AR_at_AN = [[] for _ in range(self.max_proposal)]
+
+ return metric_dict
diff --git a/paddlevideo/metrics/skeleton_metric.py b/paddlevideo/metrics/skeleton_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c2e762fba7ce0a8585bb75928a57364b4a7be0e
--- /dev/null
+++ b/paddlevideo/metrics/skeleton_metric.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import numpy as np
+import paddle
+import csv
+import paddle.nn.functional as F
+
+from .registry import METRIC
+from .base import BaseMetric
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@METRIC.register
+class SkeletonMetric(BaseMetric):
+ """
+    Metric for skeleton-based models.
+    Note: only supports batch_size=1, single-card testing.
+
+ Args:
+ out_file: str, file to save test results.
+ """
+
+ def __init__(self,
+ data_size,
+ batch_size,
+ out_file='submission.csv',
+ log_interval=1,
+ top_k=5):
+ """prepare for metrics
+ """
+ super().__init__(data_size, batch_size, log_interval)
+ self.top1 = []
+ self.top5 = []
+ self.values = []
+ self.out_file = out_file
+ self.k = top_k
+
+ def update(self, batch_id, data, outputs):
+ """update metrics during each iter
+ """
+ if len(data) == 2: # data with label
+ labels = data[1]
+ top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1)
+ top5 = paddle.metric.accuracy(input=outputs, label=labels, k=self.k)
+ if self.world_size > 1:
+ top1 = paddle.distributed.all_reduce(
+ top1, op=paddle.distributed.ReduceOp.SUM) / self.world_size
+ top5 = paddle.distributed.all_reduce(
+ top5, op=paddle.distributed.ReduceOp.SUM) / self.world_size
+ self.top1.append(top1.numpy())
+ self.top5.append(top5.numpy())
+ else: # data without label, only support batch_size=1. Used for fsd-10.
+ prob = F.softmax(outputs)
+ clas = paddle.argmax(prob, axis=1).numpy()[0]
+ self.values.append((batch_id, clas))
+
+ # preds ensemble
+ if batch_id % self.log_interval == 0:
+ logger.info("[TEST] Processing batch {}/{} ...".format(
+ batch_id,
+ self.data_size // (self.batch_size * self.world_size)))
+
+ def accumulate(self):
+ """accumulate metrics when finished all iters.
+ """
+ if self.top1: # data with label
+ logger.info('[TEST] finished, avg_acc1= {}, avg_acc5= {}'.format(
+ np.mean(np.array(self.top1)), np.mean(np.array(self.top5))))
+ else:
+ headers = ['sample_index', 'predict_category']
+ with open(
+ self.out_file,
+ 'w',
+ ) as fp:
+ writer = csv.writer(fp)
+ writer.writerow(headers)
+ writer.writerows(self.values)
+ logger.info("Results saved in {} !".format(self.out_file))
diff --git a/paddlevideo/metrics/transnetv2_metric.py b/paddlevideo/metrics/transnetv2_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..33708817602591ce126270d6b044ed103d888521
--- /dev/null
+++ b/paddlevideo/metrics/transnetv2_metric.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import numpy as np
+import paddle
+
+from .registry import METRIC
+from .base import BaseMetric
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+def predictions_to_scenes(predictions):
+ scenes = []
+ t, t_prev, start = -1, 0, 0
+ for i, t in enumerate(predictions):
+ if t_prev == 1 and t == 0:
+ start = i
+ if t_prev == 0 and t == 1 and i != 0:
+ scenes.append([start, i])
+ t_prev = t
+ if t == 0:
+ scenes.append([start, i])
+
+ # just fix if all predictions are 1
+ if len(scenes) == 0:
+ return np.array([[0, len(predictions) - 1]], dtype=np.int32)
+
+ return np.array(scenes, dtype=np.int32)
+
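+# Example: predictions_to_scenes(np.array([0, 0, 1, 0, 0])) returns
+# [[0, 2], [3, 4]] -- frame 2 is predicted as a transition, splitting the
+# clip into two scenes.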
+
+def evaluate_scenes(gt_scenes, pred_scenes, n_frames_miss_tolerance=2):
+ """
+ Adapted from: https://github.com/gyglim/shot-detection-evaluation
+ The original based on: http://imagelab.ing.unimore.it/imagelab/researchActivity.asp?idActivity=19
+
+ n_frames_miss_tolerance:
+ Number of frames it is possible to miss ground truth by, and still being counted as a correct detection.
+
+ Examples of computation with different tolerance margin:
+ n_frames_miss_tolerance = 0
+ pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[5.5, 5.5]]
+ gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[5.5, 5.5]] -> HIT
+ gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[4.5, 4.5]] -> MISS
+ n_frames_miss_tolerance = 1
+ pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[5.0, 6.0]]
+ gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[5.0, 6.0]] -> HIT
+ gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[4.0, 5.0]] -> HIT
+ gt_scenes: [[0, 3], [4, 9]] -> gt_trans: [[3.0, 4.0]] -> MISS
+ n_frames_miss_tolerance = 2
+ pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[4.5, 6.5]]
+ gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[4.5, 6.5]] -> HIT
+ gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[3.5, 5.5]] -> HIT
+ gt_scenes: [[0, 3], [4, 9]] -> gt_trans: [[2.5, 4.5]] -> HIT
+ gt_scenes: [[0, 2], [3, 9]] -> gt_trans: [[1.5, 3.5]] -> MISS
+
+    Users should be careful about using these functions for any commercial purposes.
+ """
+
+ shift = n_frames_miss_tolerance / 2
+ gt_scenes = gt_scenes.astype(np.float32) + np.array([[-0.5 + shift, 0.5 - shift]])
+ pred_scenes = pred_scenes.astype(np.float32) + np.array([[-0.5 + shift, 0.5 - shift]])
+
+ gt_trans = np.stack([gt_scenes[:-1, 1], gt_scenes[1:, 0]], 1)
+ pred_trans = np.stack([pred_scenes[:-1, 1], pred_scenes[1:, 0]], 1)
+
+ i, j = 0, 0
+ tp, fp, fn = 0, 0, 0
+
+ while i < len(gt_trans) or j < len(pred_trans):
+ if j == len(pred_trans) or pred_trans[j, 0] > gt_trans[i, 1]:
+ fn += 1
+ i += 1
+ elif i == len(gt_trans) or pred_trans[j, 1] < gt_trans[i, 0]:
+ fp += 1
+ j += 1
+ else:
+ i += 1
+ j += 1
+ tp += 1
+
+ if tp + fp != 0:
+ p = tp / (tp + fp)
+ else:
+ p = 0
+
+ if tp + fn != 0:
+ r = tp / (tp + fn)
+ else:
+ r = 0
+
+ if p + r != 0:
+ f1 = (p * r * 2) / (p + r)
+ else:
+ f1 = 0
+
+ assert tp + fn == len(gt_trans)
+ assert tp + fp == len(pred_trans)
+
+ return p, r, f1, (tp, fp, fn)
+
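+# Example (default tolerance of 2 frames): with
+# gt_scenes = np.array([[0, 5], [6, 9]]) and pred_scenes = np.array([[0, 4], [5, 9]]),
+# the single predicted transition falls within tolerance of the ground-truth one,
+# so evaluate_scenes returns (1.0, 1.0, 1.0, (1, 0, 0)).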
+
+def create_scene_based_summaries(one_hot_pred, one_hot_gt):
+ thresholds = np.array([
+ 0.02, 0.06, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9
+ ])
+ precision, recall, f1, tp, fp, fn = np.zeros_like(thresholds), np.zeros_like(thresholds),\
+ np.zeros_like(thresholds), np.zeros_like(thresholds),\
+ np.zeros_like(thresholds), np.zeros_like(thresholds)
+
+ gt_scenes = predictions_to_scenes(one_hot_gt)
+ for i in range(len(thresholds)):
+ pred_scenes = predictions_to_scenes(
+ (one_hot_pred > thresholds[i]).astype(np.uint8)
+ )
+ precision[i], recall[i], f1[i], (tp[i], fp[i], fn[i]) = evaluate_scenes(gt_scenes, pred_scenes)
+
+ best_idx = np.argmax(f1)
+
+ return f1[best_idx]
+
+
+@METRIC.register
+class TransNetV2Metric(BaseMetric):
+ def __init__(self, data_size, batch_size, log_interval=1):
+ """prepare for metrics
+ """
+ super().__init__(data_size, batch_size, log_interval)
+ self.predictions = []
+ self.total_stats = {"tp": 0, "fp": 0, "fn": 0}
+
+ def update(self, batch_id, data, one_hot):
+ """update metrics during each iter
+ """
+ if isinstance(one_hot, tuple):
+ one_hot = one_hot[0]
+ one_hot = paddle.nn.functional.sigmoid(one_hot)[0]
+ self.predictions.append(one_hot.numpy()[25:75])
+ gt_scenes = data[1]
+ is_new_file = data[2]
+ if is_new_file:
+ self.compute(gt_scenes)
+ # preds ensemble
+ if batch_id % self.log_interval == 0:
+ logger.info("[TEST] Processing batch {}/{} ...".format(
+ batch_id,
+ self.data_size // (self.batch_size * self.world_size)))
+
+    def compute(self, gt_scenes, threshold=0.5):
+        # turn the accumulated per-frame transition probabilities into hard
+        # decisions; the 0.5 threshold is an assumption and can be overridden
+        # via `threshold`
+        predictions = np.concatenate(self.predictions, 0)
+        _, _, _, (tp, fp, fn) = evaluate_scenes(
+            gt_scenes,
+            predictions_to_scenes((predictions >= threshold).astype(np.uint8)))
+
+        self.total_stats["tp"] += tp
+        self.total_stats["fp"] += fp
+        self.total_stats["fn"] += fn
+        self.predictions = []  # reset the buffer before the next file
+
+ def accumulate(self):
+ """accumulate metrics when finished all iters.
+ """
+ p = self.total_stats["tp"] / (self.total_stats["tp"] + self.total_stats["fp"])
+ r = self.total_stats["tp"] / (self.total_stats["tp"] + self.total_stats["fn"])
+ f1 = (p * r * 2) / (p + r)
+ logger.info('[TEST] finished, Precision= {:5.2f}, Recall= {:5.2f} , F1 Score= {:5.2f} '.format(
+ p * 100, r * 100, f1 * 100))
\ No newline at end of file
diff --git a/paddlevideo/metrics/vos_metric.py b/paddlevideo/metrics/vos_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..19762f68609ce923d858bb687b785593b15b1e72
--- /dev/null
+++ b/paddlevideo/metrics/vos_metric.py
@@ -0,0 +1,276 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import os
+import paddle
+import zipfile
+import time
+from PIL import Image
+
+from paddle.io import DataLoader
+
+from .registry import METRIC
+from .base import BaseMetric
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@METRIC.register
+class VOSMetric(BaseMetric):
+ def __init__(self,
+ data_size,
+ batch_size,
+ result_root,
+ zip_dir,
+ log_interval=1):
+ """prepare for metrics
+ """
+ super().__init__(data_size, batch_size, log_interval)
+ self.video_num = 0
+ self.total_time = 0
+ self.total_frame = 0
+ self.total_sfps = 0
+ self.total_video_num = data_size
+ self.count = 0
+ self.result_root = result_root
+ self.zip_dir = zip_dir
+
+ def update(self, batch_id, data, model):
+ """update metrics during each iter
+ """
+ self.video_num += 1
+ seq_dataset = data
+ seq_name = seq_dataset.seq_name
+
+        logger.info('Processing Seq {} [{}/{}]:'.format(seq_name, self.video_num,
+ self.total_video_num))
+ seq_dataloader = DataLoader(seq_dataset,
+ return_list=True,
+ batch_size=1,
+ shuffle=False,
+ num_workers=0)
+ seq_total_time = 0
+ seq_total_frame = 0
+ ref_embeddings = []
+ ref_masks = []
+ prev_embedding = []
+ prev_mask = []
+ with paddle.no_grad():
+ for frame_idx, samples in enumerate(seq_dataloader):
+ time_start = time.time()
+ all_preds = []
+ join_label = None
+ for aug_idx in range(len(samples)):
+ if len(ref_embeddings) <= aug_idx:
+ ref_embeddings.append([])
+ ref_masks.append([])
+ prev_embedding.append(None)
+ prev_mask.append(None)
+
+ sample = samples[aug_idx]
+ ref_emb = ref_embeddings[aug_idx]
+ ref_m = ref_masks[aug_idx]
+ prev_emb = prev_embedding[aug_idx]
+ prev_m = prev_mask[aug_idx]
+
+ current_img = sample['current_img']
+ if 'current_label' in sample.keys():
+ current_label = sample['current_label']
+ current_label = paddle.to_tensor(current_label)
+ else:
+ current_label = None
+
+ obj_num = sample['meta']['obj_num']
+ imgname = sample['meta']['current_name']
+ ori_height = sample['meta']['height']
+ ori_width = sample['meta']['width']
+ bs, _, h, w = current_img.shape
+ data_batch = [
+ ref_emb, ref_m, prev_emb, prev_m, current_img,
+ [ori_height, ori_width], obj_num
+ ]
+
+ all_pred, current_embedding = model(data_batch, mode='test')
+
+ if frame_idx == 0:
+ if current_label is None:
+ logger.info(
+ "No first frame label in Seq {}.".format(
+ seq_name))
+ ref_embeddings[aug_idx].append(current_embedding)
+ ref_masks[aug_idx].append(current_label)
+
+ prev_embedding[aug_idx] = current_embedding
+ prev_mask[aug_idx] = current_label
+ else:
+ if sample['meta']['flip']: #False
+ all_pred = self.flip_tensor(all_pred, 3)
+ # In YouTube-VOS, not all the objects appear in the first frame for the first time. Thus, we
+ # have to introduce new labels for new objects, if necessary.
+ if not sample['meta']['flip'] and not (
+ current_label is None) and join_label is None:
+ join_label = paddle.cast(current_label,
+ dtype='int64')
+ all_preds.append(all_pred)
+ if current_label is not None:
+ ref_embeddings[aug_idx].append(current_embedding)
+ prev_embedding[aug_idx] = current_embedding
+
+ if frame_idx > 0:
+ all_preds = paddle.concat(all_preds, axis=0)
+ all_preds = paddle.mean(
+ all_preds, axis=0) #average results if augmentation
+ pred_label = paddle.argmax(all_preds, axis=0)
+ if join_label is not None:
+ join_label = paddle.squeeze(paddle.squeeze(join_label,
+ axis=0),
+ axis=0)
+ keep = paddle.cast((join_label == 0), dtype="int64")
+ pred_label = pred_label * keep + join_label * (1 - keep)
+ current_label = paddle.reshape(
+ pred_label, shape=[1, 1, ori_height, ori_width])
+ flip_pred_label = self.flip_tensor(pred_label, 1)
+ flip_current_label = paddle.reshape(
+ flip_pred_label, shape=[1, 1, ori_height, ori_width])
+
+ for aug_idx in range(len(samples)):
+ if join_label is not None:
+ if samples[aug_idx]['meta']['flip']:
+ ref_masks[aug_idx].append(flip_current_label)
+ else:
+ ref_masks[aug_idx].append(current_label)
+ if samples[aug_idx]['meta']['flip']:
+ prev_mask[aug_idx] = flip_current_label
+ else:
+ prev_mask[
+ aug_idx] = current_label #update prev_mask
+
+ one_frametime = time.time() - time_start
+ seq_total_time += one_frametime
+ seq_total_frame += 1
+ obj_num = obj_num.numpy()[0].item()
+ logger.info('Frame: {}, Obj Num: {}, Time: {}'.format(
+ imgname[0], obj_num, one_frametime))
+ self.save_mask(
+ pred_label,
+ os.path.join(self.result_root, seq_name,
+ imgname[0].split('.')[0] + '.png'))
+ else:
+ one_frametime = time.time() - time_start
+ seq_total_time += one_frametime
+ logger.info('Ref Frame: {}, Time: {}'.format(
+ imgname[0], one_frametime))
+
+        del ref_embeddings
+        del ref_masks
+        del prev_embedding
+        del prev_mask
+        del seq_dataset
+        del seq_dataloader
+
+ seq_avg_time_per_frame = seq_total_time / seq_total_frame
+ self.total_time += seq_total_time
+ self.total_frame += seq_total_frame
+ total_avg_time_per_frame = self.total_time / self.total_frame
+ self.total_sfps += seq_avg_time_per_frame
+ avg_sfps = self.total_sfps / (batch_id + 1)
+ logger.info("Seq {} FPS: {}, Total FPS: {}, FPS per Seq: {}".format(
+ seq_name, 1. / seq_avg_time_per_frame,
+ 1. / total_avg_time_per_frame, 1. / avg_sfps))
+
+ def flip_tensor(self, tensor, dim=0):
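+        # Reverse `tensor` along axis `dim`, e.g. dim=1 maps [[1, 2], [3, 4]]
+        # to [[2, 1], [4, 3]].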
+ inv_idx = paddle.cast(paddle.arange(tensor.shape[dim] - 1, -1, -1),
+ dtype="int64")
+ tensor = paddle.index_select(x=tensor, index=inv_idx, axis=dim)
+ return tensor
+
+ def save_mask(self, mask_tensor, path):
+ _palette = [
+ 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128,
+ 0, 128, 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191,
+ 128, 0, 64, 0, 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64,
+ 0, 128, 64, 0, 0, 191, 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22,
+ 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 26, 27, 27, 27,
+ 28, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33,
+ 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38, 38, 38, 39,
+ 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, 43, 44, 44, 44,
+ 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50,
+ 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56,
+ 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61,
+ 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67,
+ 67, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73,
+ 73, 73, 74, 74, 74, 75, 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78,
+ 79, 79, 79, 80, 80, 80, 81, 81, 81, 82, 82, 82, 83, 83, 83, 84, 84,
+ 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, 88, 88, 88, 89, 89, 89, 90,
+ 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94, 94, 94, 95, 95, 95,
+ 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100, 100, 100, 101,
+ 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105, 105,
+ 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109,
+ 110, 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114,
+ 114, 114, 115, 115, 115, 116, 116, 116, 117, 117, 117, 118, 118,
+ 118, 119, 119, 119, 120, 120, 120, 121, 121, 121, 122, 122, 122,
+ 123, 123, 123, 124, 124, 124, 125, 125, 125, 126, 126, 126, 127,
+ 127, 127, 128, 128, 128, 129, 129, 129, 130, 130, 130, 131, 131,
+ 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135, 135, 135,
+ 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140,
+ 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144,
+ 144, 145, 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148,
+ 149, 149, 149, 150, 150, 150, 151, 151, 151, 152, 152, 152, 153,
+ 153, 153, 154, 154, 154, 155, 155, 155, 156, 156, 156, 157, 157,
+ 157, 158, 158, 158, 159, 159, 159, 160, 160, 160, 161, 161, 161,
+ 162, 162, 162, 163, 163, 163, 164, 164, 164, 165, 165, 165, 166,
+ 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170, 170,
+ 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174,
+ 175, 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179,
+ 179, 179, 180, 180, 180, 181, 181, 181, 182, 182, 182, 183, 183,
+ 183, 184, 184, 184, 185, 185, 185, 186, 186, 186, 187, 187, 187,
+ 188, 188, 188, 189, 189, 189, 190, 190, 190, 191, 191, 191, 192,
+ 192, 192, 193, 193, 193, 194, 194, 194, 195, 195, 195, 196, 196,
+ 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200, 200, 200,
+ 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205,
+ 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209,
+ 209, 210, 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213,
+ 214, 214, 214, 215, 215, 215, 216, 216, 216, 217, 217, 217, 218,
+ 218, 218, 219, 219, 219, 220, 220, 220, 221, 221, 221, 222, 222,
+ 222, 223, 223, 223, 224, 224, 224, 225, 225, 225, 226, 226, 226,
+ 227, 227, 227, 228, 228, 228, 229, 229, 229, 230, 230, 230, 231,
+ 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235, 235,
+ 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239,
+ 240, 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244,
+ 244, 244, 245, 245, 245, 246, 246, 246, 247, 247, 247, 248, 248,
+ 248, 249, 249, 249, 250, 250, 250, 251, 251, 251, 252, 252, 252,
+ 253, 253, 253, 254, 254, 254, 255, 255, 255
+ ]
+ mask = mask_tensor.cpu().numpy().astype('uint8')
+ mask = Image.fromarray(mask).convert('P')
+ mask.putpalette(_palette)
+ mask.save(path)
+
+ def zip_folder(self, source_folder, zip_dir):
+ f = zipfile.ZipFile(zip_dir, 'w', zipfile.ZIP_DEFLATED)
+ pre_len = len(os.path.dirname(source_folder))
+ for dirpath, dirnames, filenames in os.walk(source_folder):
+ for filename in filenames:
+ pathfile = os.path.join(dirpath, filename)
+ arcname = pathfile[pre_len:].strip(os.path.sep)
+ f.write(pathfile, arcname)
+ f.close()
+
+ def accumulate(self):
+ """accumulate metrics when finished all iters.
+ """
+ self.zip_folder(self.result_root, self.zip_dir)
+ logger.info('Save result to {}.'.format(self.zip_dir))
diff --git a/paddlevideo/metrics/youtube8m/__init__.py b/paddlevideo/metrics/youtube8m/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/paddlevideo/metrics/youtube8m/average_precision_calculator.py b/paddlevideo/metrics/youtube8m/average_precision_calculator.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdbd6e0d04c4cbcef18ed89a38689541df2fcb6c
--- /dev/null
+++ b/paddlevideo/metrics/youtube8m/average_precision_calculator.py
@@ -0,0 +1,274 @@
+# Copyright 2020 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Calculate or keep track of the interpolated average precision.
+
+It provides an interface for calculating interpolated average precision for an
+entire list or the top-n ranked items. For the definition of the
+(non-)interpolated average precision:
+http://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf
+
+Example usages:
+1) Use it as a static function call to directly calculate average precision for
+a short ranked list in the memory.
+
+```
+import random
+
+p = np.array([random.random() for _ in range(10)])
+a = np.array([random.choice([0, 1]) for _ in range(10)])
+
+ap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a)
+```
+
+2) Use it as an object for long ranked list that cannot be stored in memory or
+the case where partial predictions can be observed at a time (Tensorflow
+predictions). In this case, we first call the function accumulate many times
+to process parts of the ranked list. After processing all the parts, we call
+peek_ap_at_n.
+```
+p1 = np.array([random.random() for _ in range(5)])
+a1 = np.array([random.choice([0, 1]) for _ in range(5)])
+p2 = np.array([random.random() for _ in range(5)])
+a2 = np.array([random.choice([0, 1]) for _ in range(5)])
+
+# interpolated average precision at 10 using 1000 break points
+calculator = average_precision_calculator.AveragePrecisionCalculator(10)
+calculator.accumulate(p1, a1)
+calculator.accumulate(p2, a2)
+ap3 = calculator.peek_ap_at_n()
+```
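+
+3) A small worked example of the static `ap` helper:
+```
+p = np.array([0.9, 0.8, 0.1])
+a = np.array([1, 0, 1])
+ap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a)
+# ap == (1/1 + 2/3) / 2 = 0.8333...
+```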
+"""
+
+import heapq
+import random
+import numbers
+
+import numpy
+
+
+class AveragePrecisionCalculator(object):
+ """Calculate the average precision and average precision at n."""
+ def __init__(self, top_n=None):
+ """Construct an AveragePrecisionCalculator to calculate average precision.
+
+ This class is used to calculate the average precision for a single label.
+
+ Args:
+ top_n: A positive Integer specifying the average precision at n, or
+ None to use all provided data points.
+
+ Raises:
+ ValueError: An error occurred when the top_n is not a positive integer.
+ """
+        if not ((isinstance(top_n, int) and top_n > 0) or top_n is None):
+ raise ValueError("top_n must be a positive integer or None.")
+
+ self._top_n = top_n # average precision at n
+ self._total_positives = 0 # total number of positives have seen
+ self._heap = [] # max heap of (prediction, actual)
+
+ @property
+ def heap_size(self):
+ """Gets the heap size maintained in the class."""
+ return len(self._heap)
+
+ @property
+ def num_accumulated_positives(self):
+ """Gets the number of positive samples that have been accumulated."""
+ return self._total_positives
+
+ def accumulate(self, predictions, actuals, num_positives=None):
+ """Accumulate the predictions and their ground truth labels.
+
+ After the function call, we may call peek_ap_at_n to actually calculate
+ the average precision.
+ Note predictions and actuals must have the same shape.
+
+ Args:
+ predictions: a list storing the prediction scores.
+ actuals: a list storing the ground truth labels. Any value
+ larger than 0 will be treated as positives, otherwise as negatives.
+      num_positives: If the 'predictions' and 'actuals' inputs aren't complete,
+ then it's possible some true positives were missed in them. In that case,
+ you can provide 'num_positives' in order to accurately track recall.
+
+ Raises:
+ ValueError: An error occurred when the format of the input is not the
+ numpy 1-D array or the shape of predictions and actuals does not match.
+ """
+ if len(predictions) != len(actuals):
+ raise ValueError(
+ "the shape of predictions and actuals does not match.")
+
+        if num_positives is not None:
+            if not isinstance(num_positives,
+                              numbers.Number) or num_positives < 0:
+                raise ValueError(
+                    "'num_positives' was provided but it wasn't a nonnegative number."
+                )
+
+        if num_positives is not None:
+            self._total_positives += num_positives
+        else:
+            self._total_positives += numpy.size(numpy.where(actuals > 0))
+ topk = self._top_n
+ heap = self._heap
+
+ for i in range(numpy.size(predictions)):
+ if topk is None or len(heap) < topk:
+ heapq.heappush(heap, (predictions[i], actuals[i]))
+ else:
+ if predictions[i] > heap[0][0]: # heap[0] is the smallest
+ heapq.heappop(heap)
+ heapq.heappush(heap, (predictions[i], actuals[i]))
+
+ def clear(self):
+ """Clear the accumulated predictions."""
+ self._heap = []
+ self._total_positives = 0
+
+ def peek_ap_at_n(self):
+ """Peek the non-interpolated average precision at n.
+
+ Returns:
+ The non-interpolated average precision at n (default 0).
+ If n is larger than the length of the ranked list,
+ the average precision will be returned.
+ """
+ if self.heap_size <= 0:
+ return 0
+ predlists = numpy.array(list(zip(*self._heap)))
+
+ ap = self.ap_at_n(predlists[0],
+ predlists[1],
+ n=self._top_n,
+ total_num_positives=self._total_positives)
+ return ap
+
+ @staticmethod
+ def ap(predictions, actuals):
+ """Calculate the non-interpolated average precision.
+
+ Args:
+ predictions: a numpy 1-D array storing the sparse prediction scores.
+ actuals: a numpy 1-D array storing the ground truth labels. Any value
+ larger than 0 will be treated as positives, otherwise as negatives.
+
+ Returns:
+ The non-interpolated average precision at n.
+ If n is larger than the length of the ranked list,
+ the average precision will be returned.
+
+ Raises:
+ ValueError: An error occurred when the format of the input is not the
+ numpy 1-D array or the shape of predictions and actuals does not match.
+ """
+ return AveragePrecisionCalculator.ap_at_n(predictions, actuals, n=None)
+
+ @staticmethod
+ def ap_at_n(predictions, actuals, n=20, total_num_positives=None):
+ """Calculate the non-interpolated average precision.
+
+ Args:
+ predictions: a numpy 1-D array storing the sparse prediction scores.
+ actuals: a numpy 1-D array storing the ground truth labels. Any value
+ larger than 0 will be treated as positives, otherwise as negatives.
+ n: the top n items to be considered in ap@n.
+ total_num_positives : (optionally) you can specify the number of total
+ positive
+ in the list. If specified, it will be used in calculation.
+
+ Returns:
+ The non-interpolated average precision at n.
+ If n is larger than the length of the ranked list,
+ the average precision will be returned.
+
+ Raises:
+ ValueError: An error occurred when
+ 1) the format of the input is not the numpy 1-D array;
+ 2) the shape of predictions and actuals does not match;
+ 3) the input n is not a positive integer.
+ """
+ if len(predictions) != len(actuals):
+ raise ValueError(
+ "the shape of predictions and actuals does not match.")
+
+ if n is not None:
+ if not isinstance(n, int) or n <= 0:
+ raise ValueError("n must be 'None' or a positive integer."
+ " It was '%s'." % n)
+
+ ap = 0.0
+
+ predictions = numpy.array(predictions)
+ actuals = numpy.array(actuals)
+
+ # add a shuffler to avoid overestimating the ap
+ predictions, actuals = AveragePrecisionCalculator._shuffle(
+ predictions, actuals)
+ sortidx = sorted(range(len(predictions)),
+ key=lambda k: predictions[k],
+ reverse=True)
+
+ if total_num_positives is None:
+ numpos = numpy.size(numpy.where(actuals > 0))
+ else:
+ numpos = total_num_positives
+
+ if numpos == 0:
+ return 0
+
+ if n is not None:
+ numpos = min(numpos, n)
+ delta_recall = 1.0 / numpos
+ poscount = 0.0
+
+ # calculate the ap
+ r = len(sortidx)
+ if n is not None:
+ r = min(r, n)
+ for i in range(r):
+ if actuals[sortidx[i]] > 0:
+ poscount += 1
+ ap += poscount / (i + 1) * delta_recall
+ return ap
+
+ @staticmethod
+ def _shuffle(predictions, actuals):
+ random.seed(0)
+ suffidx = random.sample(range(len(predictions)), len(predictions))
+ predictions = predictions[suffidx]
+ actuals = actuals[suffidx]
+ return predictions, actuals
+
+ @staticmethod
+ def _zero_one_normalize(predictions, epsilon=1e-7):
+ """Normalize the predictions to the range between 0.0 and 1.0.
+
+ For some predictions like SVM predictions, we need to normalize them before
+ calculate the interpolated average precision. The normalization will not
+ change the rank in the original list and thus won't change the average
+ precision.
+
+ Args:
+ predictions: a numpy 1-D array storing the sparse prediction scores.
+ epsilon: a small constant to avoid denominator being zero.
+
+ Returns:
+ The normalized prediction.
+ """
+ denominator = numpy.max(predictions) - numpy.min(predictions)
+        ret = (predictions - numpy.min(predictions)) / numpy.maximum(
+            denominator, epsilon)
+ return ret
diff --git a/paddlevideo/metrics/youtube8m/eval_util.py b/paddlevideo/metrics/youtube8m/eval_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..abcf0d8f2d8fec5363ceac5c8c6b581a31683f7a
--- /dev/null
+++ b/paddlevideo/metrics/youtube8m/eval_util.py
@@ -0,0 +1,198 @@
+# Copyright 2016 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Provides functions to help with evaluating models."""
+import numpy as np
+import paddle
+from paddlevideo.utils import get_logger
+
+from ..base import BaseMetric
+from ..registry import METRIC
+from . import average_precision_calculator as ap_calculator
+from . import mean_average_precision_calculator as map_calculator
+
+logger = get_logger("paddlevideo")
+
+
+def flatten(l):
+ """ Merges a list of lists into a single list. """
+ return [item for sublist in l for item in sublist]
+
+
+def calculate_hit_at_one(predictions, actuals):
+ """Performs a local (numpy) calculation of the hit at one.
+
+ Args:
+ predictions: Matrix containing the outputs of the model.
+ Dimensions are 'batch' x 'num_classes'.
+ actuals: Matrix containing the ground truth labels.
+ Dimensions are 'batch' x 'num_classes'.
+
+ Returns:
+ float: The average hit at one across the entire batch.
+ """
+ top_prediction = np.argmax(predictions, 1)
+ hits = actuals[np.arange(actuals.shape[0]), top_prediction]
+ return np.mean(hits)
+
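+# Example: calculate_hit_at_one(np.array([[0.2, 0.8], [0.6, 0.4]]),
+#                               np.array([[0, 1], [0, 1]])) == 0.5
+# (the highest-scoring class is the correct label for only one of the two videos).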
+
+def calculate_precision_at_equal_recall_rate(predictions, actuals):
+ """Performs a local (numpy) calculation of the PERR.
+
+ Args:
+ predictions: Matrix containing the outputs of the model.
+ Dimensions are 'batch' x 'num_classes'.
+ actuals: Matrix containing the ground truth labels.
+ Dimensions are 'batch' x 'num_classes'.
+
+ Returns:
+ float: The average precision at equal recall rate across the entire batch.
+ """
+ aggregated_precision = 0.0
+ num_videos = actuals.shape[0]
+ for row in np.arange(num_videos):
+ num_labels = int(np.sum(actuals[row]))
+ top_indices = np.argpartition(predictions[row],
+ -num_labels)[-num_labels:]
+ item_precision = 0.0
+ for label_index in top_indices:
+ if predictions[row][label_index] > 0:
+ item_precision += actuals[row][label_index]
+ item_precision /= top_indices.size
+ aggregated_precision += item_precision
+ aggregated_precision /= num_videos
+ return aggregated_precision
+
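+# Example: calculate_precision_at_equal_recall_rate(np.array([[0.7, 0.6, 0.1]]),
+#                                                   np.array([[1, 0, 1]])) == 0.5
+# (two ground-truth labels, so the top-2 predictions are kept and one of them
+# is correct).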
+
+def calculate_gap(predictions, actuals, top_k=20):
+ """Performs a local (numpy) calculation of the global average precision.
+
+ Only the top_k predictions are taken for each of the videos.
+
+ Args:
+ predictions: Matrix containing the outputs of the model.
+ Dimensions are 'batch' x 'num_classes'.
+ actuals: Matrix containing the ground truth labels.
+ Dimensions are 'batch' x 'num_classes'.
+ top_k: How many predictions to use per video.
+
+ Returns:
+ float: The global average precision.
+ """
+ gap_calculator = ap_calculator.AveragePrecisionCalculator()
+ sparse_predictions, sparse_labels, num_positives = top_k_by_class(
+ predictions, actuals, top_k)
+ gap_calculator.accumulate(flatten(sparse_predictions),
+ flatten(sparse_labels), sum(num_positives))
+ return gap_calculator.peek_ap_at_n()
+
+
+def top_k_by_class(predictions, labels, k=20):
+ """Extracts the top k predictions for each video, sorted by class.
+
+ Args:
+ predictions: A numpy matrix containing the outputs of the model.
+ Dimensions are 'batch' x 'num_classes'.
+        labels: A numpy matrix containing the ground truth labels.
+            Dimensions are 'batch' x 'num_classes'.
+        k: the top k non-zero entries to preserve in each prediction.
+
+ Returns:
+ A tuple (predictions,labels, true_positives). 'predictions' and 'labels'
+ are lists of lists of floats. 'true_positives' is a list of scalars. The
+ length of the lists are equal to the number of classes. The entries in the
+ predictions variable are probability predictions, and
+ the corresponding entries in the labels variable are the ground truth for
+ those predictions. The entries in 'true_positives' are the number of true
+ positives for each class in the ground truth.
+
+ Raises:
+ ValueError: An error occurred when the k is not a positive integer.
+ """
+ if k <= 0:
+ raise ValueError("k must be a positive integer.")
+ k = min(k, predictions.shape[1])
+ num_classes = predictions.shape[1]
+ prediction_triplets = []
+ for video_index in range(predictions.shape[0]):
+ prediction_triplets.extend(
+ top_k_triplets(predictions[video_index], labels[video_index], k))
+ out_predictions = [[] for v in range(num_classes)]
+ out_labels = [[] for v in range(num_classes)]
+ for triplet in prediction_triplets:
+ out_predictions[triplet[0]].append(triplet[1])
+ out_labels[triplet[0]].append(triplet[2])
+ out_true_positives = [np.sum(labels[:, i]) for i in range(num_classes)]
+
+ return out_predictions, out_labels, out_true_positives
+
+
+def top_k_triplets(predictions, labels, k=20):
+ """Get the top_k for a 1-d numpy array. Returns a sparse list of tuples in
+ (prediction, class) format"""
+ m = len(predictions)
+ k = min(k, m)
+ indices = np.argpartition(predictions, -k)[-k:]
+ return [(index, predictions[index], labels[index]) for index in indices]
+
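+# Example: top_k_triplets(np.array([0.1, 0.9, 0.5]), np.array([0, 1, 0]), k=2)
+# returns the triplets (1, 0.9, 1) and (2, 0.5, 0) (order not guaranteed,
+# since np.argpartition does not sort the selected indices).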
+
+@METRIC.register
+class HitOneMetric(BaseMetric):
+ """A class to store the evaluation metrics."""
+ def __init__(self,
+ num_class,
+ top_k,
+ data_size,
+ batch_size,
+ log_interval=20):
+ """Construct an HitOneMetric object to store the evaluation metrics."""
+ self.hit_at_one = []
+ self.perr = []
+ self.gap = []
+ super().__init__(data_size, batch_size, log_interval)
+
+ def accumulate(self):
+ logger.info(
+ '[TEST] finished, hit_at_one = {:.5f}, perr = {:.5f}, gap = {:.5f}'.
+ format(np.mean(np.array(self.hit_at_one)),
+ np.mean(np.array(self.perr)), np.mean(np.array(self.gap))))
+
+ def clear(self):
+ """Clear the evaluation metrics and reset the HitOneMetric object."""
+ self.hit_at_one = []
+ self.perr = []
+ self.gap = []
+
+ def update(self, batch_id, data, outputs):
+ """update metrics during each iter
+ """
+ hit_at_one = paddle.to_tensor(outputs['hit_at_one'])
+ perr = paddle.to_tensor(outputs['perr'])
+ gap = paddle.to_tensor(outputs['gap'])
+ # NOTE(shipping): deal with multi cards validate
+ if self.world_size > 1:
+ hit_at_one = paddle.distributed.all_reduce(
+ hit_at_one,
+ op=paddle.distributed.ReduceOp.SUM) / self.world_size
+ perr = paddle.distributed.all_reduce(
+ perr, op=paddle.distributed.ReduceOp.SUM) / self.world_size
+ gap = paddle.distributed.all_reduce(
+ gap, op=paddle.distributed.ReduceOp.SUM) / self.world_size
+
+ self.hit_at_one.append(hit_at_one.numpy())
+ self.perr.append(perr.numpy())
+ self.gap.append(gap.numpy())
+ # preds ensemble
+ if batch_id % self.log_interval == 0:
+ logger.info("[TEST] Processing batch {}/{}...".format(
+ batch_id,
+ self.data_size // (self.batch_size * self.world_size),
+ ))
diff --git a/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py b/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ae8b0ed3717aba13b7ed35b4af025be40423967
--- /dev/null
+++ b/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py
@@ -0,0 +1,114 @@
+# Copyright 2016 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Calculate the mean average precision.
+
+It provides an interface for calculating mean average precision
+for an entire list or the top-n ranked items.
+
+Example usages:
+We first call the function accumulate many times to process parts of the ranked
+list. After processing all the parts, we call peek_map_at_n
+to calculate the mean average precision.
+
+```
+import random
+
+p = np.array([[random.random() for _ in range(50)] for _ in range(1000)])
+a = np.array([[random.choice([0, 1]) for _ in range(50)]
+              for _ in range(1000)])
+
+# mean average precision for 50 classes.
+calculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator(
+ num_class=50)
+calculator.accumulate(p, a)
+aps = calculator.peek_map_at_n()
+```
+"""
+
+import numpy
+from . import average_precision_calculator
+
+
+class MeanAveragePrecisionCalculator(object):
+ """This class is to calculate mean average precision.
+ """
+
+ def __init__(self, num_class):
+ """Construct a calculator to calculate the (macro) average precision.
+
+ Args:
+ num_class: A positive Integer specifying the number of classes.
+ top_n_array: A list of positive integers specifying the top n for each
+ class. The top n in each class will be used to calculate its average
+ precision at n.
+ The size of the array must be num_class.
+
+ Raises:
+ ValueError: An error occurred when num_class is not a positive integer;
+ or the top_n_array is not a list of positive integers.
+ """
+ if not isinstance(num_class, int) or num_class <= 1:
+ raise ValueError("num_class must be a positive integer.")
+
+ self._ap_calculators = [] # member of AveragePrecisionCalculator
+ self._num_class = num_class # total number of classes
+ for i in range(num_class):
+ self._ap_calculators.append(
+ average_precision_calculator.AveragePrecisionCalculator())
+
+ def accumulate(self, predictions, actuals, num_positives=None):
+ """Accumulate the predictions and their ground truth labels.
+
+ Args:
+ predictions: A list of lists storing the prediction scores. The outer
+ dimension corresponds to classes.
+ actuals: A list of lists storing the ground truth labels. The dimensions
+ should correspond to the predictions input. Any value
+ larger than 0 will be treated as positives, otherwise as negatives.
+ num_positives: If provided, it is a list of numbers representing the
+ number of true positives for each class. If not provided, the number of
+ true positives will be inferred from the 'actuals' array.
+
+ Raises:
+ ValueError: An error occurred when the shape of predictions and actuals
+ does not match.
+ """
+ if not num_positives:
+            num_positives = [None] * len(predictions)
+
+ calculators = self._ap_calculators
+ for i in range(len(predictions)):
+ calculators[i].accumulate(predictions[i], actuals[i],
+ num_positives[i])
+
+ def clear(self):
+ for calculator in self._ap_calculators:
+ calculator.clear()
+
+ def is_empty(self):
+ return ([calculator.heap_size for calculator in self._ap_calculators] ==
+ [0 for _ in range(self._num_class)])
+
+ def peek_map_at_n(self):
+ """Peek the non-interpolated mean average precision at n.
+
+ Returns:
+ An array of non-interpolated average precision at n (default 0) for each
+ class.
+ """
+ aps = [
+ self._ap_calculators[i].peek_ap_at_n()
+ for i in range(self._num_class)
+ ]
+ return aps
diff --git a/paddlevideo/modeling/__init__.py b/paddlevideo/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..639bd340388389a55da37ba665f4952c9a184dd8
--- /dev/null
+++ b/paddlevideo/modeling/__init__.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .assigners import MaxIoUAssignerAVA
+from .backbones import ResNet
+from .builder import (build_backbone, build_head, build_localizer, build_loss,
+ build_recognizer)
+from .framework.detectors import BaseDetector, FastRCNN, TwoStageDetector
+from .framework.recognizers import BaseRecognizer, Recognizer2D
+from .heads import (AVARoIHead, BaseHead, BBoxHeadAVA, SingleRoIExtractor3D,
+ TSNHead)
+from .losses import CrossEntropyLoss
+from .registry import (BACKBONES, DETECTORS, HEADS, LOCALIZERS, LOSSES,
+ PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS)
+from .samplers import RandomSampler
+from .weight_init import kaiming_normal_, trunc_normal_, weight_init_
+
+__all__ = [
+ 'BACKBONES', 'HEADS', 'RECOGNIZERS', 'LOCALIZERS', 'PARTITIONERS', 'LOSSES',
+ 'build_recognizer', 'build_localizer', 'build_head', 'build_backbone',
+ 'build_loss', 'ResNet', 'TSNHead', 'BaseHead', 'BaseRecognizer',
+    'Recognizer2D', 'CrossEntropyLoss', 'ROI_EXTRACTORS',
+ 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'MaxIoUAssignerAVA',
+ 'RandomSampler', 'DETECTORS', 'kaiming_normal_', 'trunc_normal_',
+ 'weight_init_'
+]
diff --git a/paddlevideo/modeling/assigners/__init__.py b/paddlevideo/modeling/assigners/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4570db2f0c073ece92c493de4851d8863f35022
--- /dev/null
+++ b/paddlevideo/modeling/assigners/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .max_iou_assigner_ava import MaxIoUAssignerAVA
+
+__all__ = ['MaxIoUAssignerAVA']
diff --git a/paddlevideo/modeling/assigners/max_iou_assigner_ava.py b/paddlevideo/modeling/assigners/max_iou_assigner_ava.py
new file mode 100644
index 0000000000000000000000000000000000000000..2515c858bd14e964782dd9cff4584ce5640f302e
--- /dev/null
+++ b/paddlevideo/modeling/assigners/max_iou_assigner_ava.py
@@ -0,0 +1,148 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+import numpy as np
+from ..registry import BBOX_ASSIGNERS
+from ..bbox_utils import bbox_overlaps
+
+class AssignResult():
+ def __init__(self, num_gts, gt_inds, max_overlaps, labels=None):
+ self.num_gts = num_gts
+ self.gt_inds = gt_inds
+ self.max_overlaps = max_overlaps
+ self.labels = labels
+
+ def add_gt_(self, gt_labels):
+ """Add ground truth as assigned results. """
+ self_inds = paddle.arange(1, len(gt_labels) + 1, dtype="int32")
+ gt_inds_squeeze = paddle.squeeze(self.gt_inds, axis=0)
+ self.gt_inds = paddle.concat([self_inds, gt_inds_squeeze])
+ gt_label_ones = paddle.full((len(gt_labels), ), 1, dtype='float32')
+ max_overlaps_squeeze = paddle.squeeze(self.max_overlaps, axis=0)
+ self.max_overlaps = paddle.concat([gt_label_ones, max_overlaps_squeeze])
+ if self.labels is not None:
+ self.labels = paddle.concat([gt_labels, self.labels])
+
+@BBOX_ASSIGNERS.register()
+class MaxIoUAssignerAVA():
+ """Assign a corresponding gt bbox or background to each bbox. """
+ def __init__(self,
+ pos_iou_thr,
+ neg_iou_thr,
+ min_pos_iou=.0,
+ gt_max_assign_all=True,
+ ignore_iof_thr=-1,
+ ignore_wrt_candidates=True,
+ match_low_quality=True,
+ gpu_assign_thr=-1,
+ iou_calculator=dict(type='BboxOverlaps2D')):
+ self.pos_iou_thr = pos_iou_thr
+ self.neg_iou_thr = neg_iou_thr
+ self.min_pos_iou = min_pos_iou
+ self.gt_max_assign_all = gt_max_assign_all
+ self.ignore_iof_thr = ignore_iof_thr
+ self.ignore_wrt_candidates = ignore_wrt_candidates
+ self.gpu_assign_thr = gpu_assign_thr
+ self.match_low_quality = match_low_quality
+
+ def assign(self,
+ bboxes,
+ gt_bboxes,
+ gt_labels=None):
+ """Assign gt to bboxes. """
+ overlaps = bbox_overlaps(gt_bboxes, bboxes)
+ assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
+ return assign_result
+
+ def assign_wrt_overlaps(self, overlaps, gt_labels=None):
+ """Assign w.r.t. the overlaps of bboxes with gts. """
+ num_gts, num_bboxes = overlaps.shape[0], overlaps.shape[1]
+ # 1. assign -1
+ assigned_gt_inds = paddle.full((num_bboxes, ), -1, dtype='int32')
+
+ # for each anchor, which gt best overlaps with it
+ # for each anchor, the max iou of all gts
+ max_overlaps, argmax_overlaps = paddle.topk(overlaps, k=1, axis=0)
+ # for each gt, which anchor best overlaps with it
+ # for each gt, the max iou of all proposals
+ gt_max_overlaps, gt_argmax_overlaps = paddle.topk(overlaps, k=1, axis=1)
+
+ # 2. assign negative: below the negative inds are set to be 0
+ match_labels = paddle.full(argmax_overlaps.shape, -1, dtype='int32')
+ match_labels = paddle.where(max_overlaps < self.neg_iou_thr,
+ paddle.zeros_like(match_labels), match_labels)
+
+ # 3. assign positive: above positive IoU threshold
+ argmax_overlaps_int32 = paddle.cast(argmax_overlaps, 'int32')
+ match_labels = paddle.where(max_overlaps >= self.pos_iou_thr,
+ argmax_overlaps_int32 + 1, match_labels)
+ assigned_gt_inds = match_labels
+ if self.match_low_quality:
+            # Low-quality matching will overwrite the assigned_gt_inds
+            # assigned in Step 3. Thus, the assigned gt might not be the
+            # best one for prediction.
+            # For example, if bbox A has 0.9 and 0.8 iou with GT bbox
+            # 1 & 2, bbox 1 will be assigned as the best target for bbox A
+            # in step 3. However, if GT bbox 2's gt_argmax_overlaps = A,
+            # bbox A's assigned_gt_inds will be overwritten to be GT bbox 2.
+ # This might be the reason that it is not used in ROI Heads.
+ for i in range(num_gts):
+ if gt_max_overlaps.numpy()[i] >= self.min_pos_iou:
+ if self.gt_max_assign_all:
+ equal_x_np = overlaps[i, :].numpy()
+ equal_y_np = gt_max_overlaps[i].numpy()
+ max_iou_inds = np.equal(equal_x_np, equal_y_np)
+ max_iou_inds = paddle.to_tensor(max_iou_inds)
+ max_iou_inds = paddle.reshape(max_iou_inds, [1, max_iou_inds.shape[0]])
+ match_labels_gts = paddle.full(max_iou_inds.shape, i + 1, dtype='int32')
+ match_labels = paddle.where(max_iou_inds, match_labels_gts, match_labels)
+ assigned_gt_inds = match_labels
+ else:
+ assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1
+
+ if gt_labels is not None:
+ # consider multi-class case (AVA)
+ assert len(gt_labels[0]) > 1
+ assigned_labels = paddle.full([num_bboxes, len(gt_labels[0])], 0, dtype='float32')
+ assigned_gt_inds_reshape = assigned_gt_inds.reshape([assigned_gt_inds.shape[1]])
+ pos_inds = paddle.nonzero(assigned_gt_inds_reshape, as_tuple=False)
+ pos_inds_num = paddle.numel(pos_inds).numpy()[0]
+ if pos_inds_num > 0:
+ pos_inds = paddle.squeeze(pos_inds, axis=1)
+ assigned_gt_inds_squeeze = paddle.squeeze(assigned_gt_inds, axis=0)
+ assigned_gt_inds_select = paddle.index_select(assigned_gt_inds_squeeze, pos_inds) - 1
+ gt_labels_select = paddle.index_select(gt_labels, assigned_gt_inds_select)
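+ # The block below builds the final multi-hot label matrix: A is the
+ # 1-based assigned gt index per proposal, X = A - 1 its 0-based version,
+ # Y a safe placeholder index (0) for non-positive proposals, T selects X
+ # for positives and Y otherwise, S gathers the corresponding gt label
+ # rows, and the last where() keeps gathered labels only where A > 0.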
+ A = assigned_gt_inds_squeeze
+ X = assigned_gt_inds_squeeze - 1
+ Y = paddle.zeros_like(X)
+ if A.shape[0] == 1:
+ if A.numpy()[0] > 0:
+ T = X
+ else:
+ T = Y
+ else:
+ T = paddle.where(A > 0, X, Y)
+ S = paddle.index_select(gt_labels, T)
+ AE = paddle.expand(A, [S.shape[1], A.shape[0]])
+ AET = paddle.transpose(AE, perm=[1, 0])
+ R = paddle.where(AET > 0, S, assigned_labels)
+ assigned_labels = R
+ else:
+ assigned_labels = None
+ ret = AssignResult(
+ num_gts,
+ assigned_gt_inds,
+ max_overlaps,
+ labels=assigned_labels)
+ return ret
diff --git a/paddlevideo/modeling/backbones/__init__.py b/paddlevideo/modeling/backbones/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4374a6228b72d2137b578d7c5bdb0bcb156ef675
--- /dev/null
+++ b/paddlevideo/modeling/backbones/__init__.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .actbert import BertForMultiModalPreTraining
+from .adds import ADDS_DepthNet
+from .agcn import AGCN
+from .asrf import ASRF
+from .bmn import BMN
+from .cfbi import CFBI
+from .ctrgcn import CTRGCN
+from .movinet import MoViNet
+from .ms_tcn import MSTCN
+from .resnet import ResNet
+from .resnet_slowfast import ResNetSlowFast
+from .resnet_slowfast_MRI import ResNetSlowFast_MRI
+from .resnet_tsm import ResNetTSM
+from .resnet_tsm_MRI import ResNetTSM_MRI
+from .resnet_tsn_MRI import ResNetTSN_MRI
+from .resnet_tweaks_tsm import ResNetTweaksTSM
+from .resnet_tweaks_tsn import ResNetTweaksTSN
+from .stgcn import STGCN
+from .swin_transformer import SwinTransformer3D
+from .transnetv2 import TransNetV2
+from .vit import VisionTransformer
+from .vit_tweaks import VisionTransformer_tweaks
+
+__all__ = [
+ 'ResNet', 'ResNetTSM', 'ResNetTweaksTSM', 'ResNetSlowFast', 'BMN',
+ 'ResNetTweaksTSN', 'VisionTransformer', 'STGCN', 'AGCN', 'TransNetV2',
+ 'ADDS_DepthNet', 'VisionTransformer_tweaks', 'BertForMultiModalPreTraining',
+ 'ResNetTSN_MRI', 'ResNetTSM_MRI', 'ResNetSlowFast_MRI', 'CFBI', 'MSTCN',
+ 'ASRF', 'MoViNet', 'SwinTransformer3D', 'CTRGCN'
+]
diff --git a/paddlevideo/modeling/backbones/actbert.py b/paddlevideo/modeling/backbones/actbert.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbee1fd8c6961382670ff996d254043bd823b18b
--- /dev/null
+++ b/paddlevideo/modeling/backbones/actbert.py
@@ -0,0 +1,1158 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import numpy as np
+import math
+import copy
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout)
+from paddle.nn.initializer import Constant, Normal
+from ...utils.save_load import load_ckpt
+from ..registry import BACKBONES
+from ..weight_init import weight_init_
+
+ACT2FN = {"gelu": F.gelu, "relu": F.relu, "swish": F.swish}
+
+
+class BertEmbeddings(nn.Layer):
+ """Construct the embeddings from word, position and token_type embeddings.
+ """
+ def __init__(self, vocab_size, max_position_embeddings, type_vocab_size,
+ hidden_size, hidden_dropout_prob):
+ super(BertEmbeddings, self).__init__()
+ self.word_embeddings = nn.Embedding(vocab_size,
+ hidden_size,
+ padding_idx=0)
+ self.position_embeddings = nn.Embedding(max_position_embeddings,
+ hidden_size)
+ self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
+
+ self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)
+ self.dropout = nn.Dropout(hidden_dropout_prob)
+
+ def forward(self, input_ids, token_type_ids=None):
+ seq_length = input_ids.shape[1]
+ position_ids = paddle.arange(end=seq_length, dtype="int64")
+ position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+ if token_type_ids is None:
+ token_type_ids = paddle.zeros_like(input_ids)
+
+ words_embeddings = self.word_embeddings(input_ids) #8,36 -> 8,36,768
+ position_embeddings = self.position_embeddings(
+ position_ids) #8,36 -> 8,36,768
+ token_type_embeddings = self.token_type_embeddings(
+ token_type_ids) #8,36 -> 8,36,768
+
+ embeddings = words_embeddings + position_embeddings + token_type_embeddings
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+class BertImageEmbeddings(nn.Layer):
+ def __init__(self, v_feature_size, v_hidden_size, v_hidden_dropout_prob):
+ super(BertImageEmbeddings, self).__init__()
+ self.image_embeddings = nn.Linear(v_feature_size, v_hidden_size)
+ self.image_location_embeddings = nn.Linear(5, v_hidden_size)
+ self.LayerNorm = nn.LayerNorm(v_hidden_size, epsilon=1e-12)
+ self.dropout = nn.Dropout(v_hidden_dropout_prob)
+
+ def forward(self, input_ids, input_loc):
+ img_embeddings = self.image_embeddings(
+ input_ids) #8,37,2048 -> 8,37,1024
+ loc_embeddings = self.image_location_embeddings(
+ input_loc) #8,37,5 -> 8,37,1024
+ embeddings = self.LayerNorm(img_embeddings + loc_embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings # shape: bs*seq_len*hs
+
+
+class BertActionEmbeddings(nn.Layer):
+ def __init__(self, a_feature_size, a_hidden_size, a_hidden_dropout_prob):
+ super(BertActionEmbeddings, self).__init__()
+ self.action_embeddings = nn.Linear(a_feature_size, a_hidden_size)
+ self.LayerNorm = nn.LayerNorm(a_hidden_size, epsilon=1e-12)
+ self.dropout = nn.Dropout(a_hidden_dropout_prob)
+
+ def forward(self, input_ids):
+ action_embeddings = self.action_embeddings(
+ input_ids) #8,5,2048 -> 8,5,768
+ embeddings = self.LayerNorm(action_embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+class BertSelfAttention(nn.Layer):
+ def __init__(self, hidden_size, num_attention_heads,
+ attention_probs_dropout_prob):
+ super(BertSelfAttention, self).__init__()
+ if hidden_size % num_attention_heads != 0:
+ raise ValueError(
+ "The hidden size (%d) is not a multiple of the number of attention "
+ "heads (%d)" % (hidden_size, num_attention_heads))
+ self.num_attention_heads = num_attention_heads
+ self.attention_head_size = int(hidden_size / num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(hidden_size, self.all_head_size)
+ self.key = nn.Linear(hidden_size, self.all_head_size)
+ self.value = nn.Linear(hidden_size, self.all_head_size)
+
+ self.dropout = nn.Dropout(attention_probs_dropout_prob)
+
+ def transpose_for_scores(self, x):
+ new_x_shape = x.shape[:-1] + [
+ self.num_attention_heads,
+ self.attention_head_size,
+ ]
+ x = x.reshape(new_x_shape)
+ return x.transpose((0, 2, 1, 3))
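+ # e.g. with hidden_size=768 and 12 heads: [B, L, 768] -> [B, L, 12, 64]
+ # -> [B, 12, L, 64], so every head attends over its own 64-dim slice.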
+
+ def forward(self, hidden_states, attention_mask):
+ mixed_query_layer = self.query(hidden_states)
+ mixed_key_layer = self.key(hidden_states)
+ mixed_value_layer = self.value(hidden_states)
+
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+ key_layer = self.transpose_for_scores(mixed_key_layer)
+ value_layer = self.transpose_for_scores(mixed_value_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = paddle.matmul(query_layer,
+ key_layer.transpose((0, 1, 3, 2)))
+ attention_scores = attention_scores / math.sqrt(
+ self.attention_head_size)
+ # Apply the attention mask (precomputed for all layers in BertModel's forward() function)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.Softmax(axis=-1)(attention_scores)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ context_layer = paddle.matmul(attention_probs, value_layer)
+ context_layer = context_layer.transpose((0, 2, 1, 3))
+ new_context_layer_shape = context_layer.shape[:-2] + [
+ self.all_head_size
+ ]
+ context_layer = context_layer.reshape(new_context_layer_shape)
+
+ return context_layer, attention_probs
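+ # Shape summary (per call): query/key/value_layer are
+ # [B, num_heads, L, head_size], attention_scores/probs are
+ # [B, num_heads, L, L], and context_layer is merged back to [B, L, all_head_size].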
+
+
+class BertSelfOutput(nn.Layer):
+ def __init__(self, hidden_size, hidden_dropout_prob):
+ super(BertSelfOutput, self).__init__()
+ self.dense = nn.Linear(hidden_size, hidden_size)
+ self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)
+ self.dropout = nn.Dropout(hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class BertAttention(nn.Layer):
+ def __init__(self, hidden_size, hidden_dropout_prob, num_attention_heads,
+ attention_probs_dropout_prob):
+ super(BertAttention, self).__init__()
+ self.self = BertSelfAttention(hidden_size, num_attention_heads,
+ attention_probs_dropout_prob)
+ self.output = BertSelfOutput(hidden_size, hidden_dropout_prob)
+
+ def forward(self, input_tensor, attention_mask):
+ self_output, attention_probs = self.self(input_tensor, attention_mask)
+ attention_output = self.output(self_output, input_tensor)
+ return attention_output, attention_probs
+
+
+class BertIntermediate(nn.Layer):
+ def __init__(self, hidden_size, intermediate_size, hidden_act):
+ super(BertIntermediate, self).__init__()
+ self.dense = nn.Linear(hidden_size, intermediate_size)
+ if isinstance(hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[hidden_act]
+ else:
+ self.intermediate_act_fn = hidden_act
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class BertOutput(nn.Layer):
+ def __init__(self, intermediate_size, hidden_size, hidden_dropout_prob):
+ super(BertOutput, self).__init__()
+ self.dense = nn.Linear(intermediate_size, hidden_size)
+ self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)
+ self.dropout = nn.Dropout(hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class BertEntAttention(nn.Layer):
+ """Core mudule of tangled transformer.
+ """
+ def __init__(
+ self,
+ hidden_size,
+ v_hidden_size,
+ a_hidden_size,
+ bi_hidden_size,
+ attention_probs_dropout_prob,
+ v_attention_probs_dropout_prob,
+ a_attention_probs_dropout_prob,
+ av_attention_probs_dropout_prob,
+ at_attention_probs_dropout_prob,
+ bi_num_attention_heads,
+ ):
+ super(BertEntAttention, self).__init__()
+ if bi_hidden_size % bi_num_attention_heads != 0:
+ raise ValueError(
+ "The hidden size (%d) is not a multiple of the number of attention "
+ "heads (%d)" % (bi_hidden_size, bi_num_attention_heads))
+
+ self.num_attention_heads = bi_num_attention_heads
+ self.attention_head_size = int(bi_hidden_size / bi_num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ # self attention layers for vision input
+ self.query1 = nn.Linear(v_hidden_size, self.all_head_size)
+ self.key1 = nn.Linear(v_hidden_size, self.all_head_size)
+ self.value1 = nn.Linear(v_hidden_size, self.all_head_size)
+ self.dropout1 = nn.Dropout(v_attention_probs_dropout_prob)
+
+ # self attention layers for text input
+ self.query2 = nn.Linear(hidden_size, self.all_head_size)
+ self.key2 = nn.Linear(hidden_size, self.all_head_size)
+ self.value2 = nn.Linear(hidden_size, self.all_head_size)
+ self.dropout2 = nn.Dropout(attention_probs_dropout_prob)
+
+ # self attention layers for action input
+ self.query3 = nn.Linear(a_hidden_size, self.all_head_size)
+ self.key3 = nn.Linear(a_hidden_size, self.all_head_size)
+ self.value3 = nn.Linear(a_hidden_size, self.all_head_size)
+ self.dropout3 = nn.Dropout(a_attention_probs_dropout_prob)
+
+ # self attention layers for action_text
+ self.key_at = nn.Linear(bi_hidden_size, self.all_head_size)
+ self.value_at = nn.Linear(bi_hidden_size, self.all_head_size)
+ self.dropout_at = nn.Dropout(av_attention_probs_dropout_prob)
+
+ # self attention layers for action_vision
+ self.key_av = nn.Linear(bi_hidden_size, self.all_head_size)
+ self.value_av = nn.Linear(bi_hidden_size, self.all_head_size)
+ self.dropout_av = nn.Dropout(at_attention_probs_dropout_prob)
+
+ def transpose_for_scores(self, x):
+ new_x_shape = x.shape[:-1] + [
+ self.num_attention_heads,
+ self.attention_head_size,
+ ]
+ x = x.reshape(new_x_shape)
+ return x.transpose((0, 2, 1, 3))
+
+ def forward(
+ self,
+ input_tensor1,
+ attention_mask1,
+ input_tensor2,
+ attention_mask2,
+ input_tensor3,
+ attention_mask3,
+ ):
+
+ # for vision input.
+ mixed_query_layer1 = self.query1(input_tensor1)
+ mixed_key_layer1 = self.key1(input_tensor1)
+ mixed_value_layer1 = self.value1(input_tensor1)
+
+ query_layer1 = self.transpose_for_scores(mixed_query_layer1)
+ key_layer1 = self.transpose_for_scores(mixed_key_layer1)
+ value_layer1 = self.transpose_for_scores(mixed_value_layer1)
+
+ # for text input:
+ mixed_query_layer2 = self.query2(input_tensor2)
+ mixed_key_layer2 = self.key2(input_tensor2)
+ mixed_value_layer2 = self.value2(input_tensor2)
+
+ query_layer2 = self.transpose_for_scores(mixed_query_layer2)
+ key_layer2 = self.transpose_for_scores(mixed_key_layer2)
+ value_layer2 = self.transpose_for_scores(mixed_value_layer2)
+
+ # for action input:
+ mixed_query_layer3 = self.query3(input_tensor3)
+ mixed_key_layer3 = self.key3(input_tensor3)
+ mixed_value_layer3 = self.value3(input_tensor3)
+
+ query_layer3 = self.transpose_for_scores(mixed_query_layer3)
+ key_layer3 = self.transpose_for_scores(mixed_key_layer3)
+ value_layer3 = self.transpose_for_scores(mixed_value_layer3)
+
+ def do_attention(query_layer, key_layer, value_layer, attention_mask,
+ dropout):
+ """ compute attention """
+ attention_scores = paddle.matmul(query_layer,
+ key_layer.transpose((0, 1, 3, 2)))
+ attention_scores = attention_scores / math.sqrt(
+ self.attention_head_size)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.Softmax(axis=-1)(attention_scores)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = dropout(attention_probs)
+
+ context_layer = paddle.matmul(attention_probs, value_layer)
+ context_layer = context_layer.transpose((0, 2, 1, 3))
+ new_context_layer_shape = context_layer.shape[:-2] + [
+ self.all_head_size
+ ]
+ context_layer = context_layer.reshape(new_context_layer_shape)
+ return context_layer
+
+ context_av = do_attention(query_layer3, key_layer1, value_layer1,
+ attention_mask1, self.dropout_av)
+ context_at = do_attention(query_layer3, key_layer2, value_layer2,
+ attention_mask2, self.dropout_at)
+
+ context_key_av = self.key_av(context_av).transpose((0, 2, 1))
+ # interpolate only support 4-D tensor now.
+ context_key_av = F.interpolate(context_key_av.unsqueeze(-1),
+ size=(key_layer2.shape[2],
+ 1)).squeeze(-1)
+ context_key_av = self.transpose_for_scores(
+ context_key_av.transpose((0, 2, 1)))
+ key_layer2 = key_layer2 + context_key_av
+
+ context_key_at = self.key_at(context_at).transpose((0, 2, 1))
+ context_key_at = F.interpolate(context_key_at.unsqueeze(-1),
+ size=(key_layer1.shape[2],
+ 1)).squeeze(-1)
+ context_key_at = self.transpose_for_scores(
+ context_key_at.transpose((0, 2, 1)))
+ key_layer1 = key_layer1 + context_key_at
+
+ context_val_av = self.value_at(context_av).transpose((0, 2, 1))
+ context_val_av = F.interpolate(context_val_av.unsqueeze(-1),
+ size=(value_layer2.shape[2],
+ 1)).squeeze(-1)
+ context_val_av = self.transpose_for_scores(
+ context_val_av.transpose((0, 2, 1)))
+ value_layer2 = value_layer2 + context_val_av
+
+ context_val_at = self.value_at(context_at).transpose((0, 2, 1))
+ context_val_at = F.interpolate(context_val_at.unsqueeze(-1),
+ size=(value_layer1.shape[2],
+ 1)).squeeze(-1)
+ context_val_at = self.transpose_for_scores(
+ context_val_at.transpose((0, 2, 1)))
+ value_layer1 = value_layer1 + context_val_at
+
+ context_layer1 = do_attention(query_layer1, key_layer1, value_layer1,
+ attention_mask1, self.dropout1)
+ context_layer2 = do_attention(query_layer2, key_layer2, value_layer2,
+ attention_mask2, self.dropout2)
+ context_layer3 = do_attention(query_layer3, key_layer3, value_layer3,
+ attention_mask3, self.dropout3)
+
+ return context_layer1, context_layer2, context_layer3 # vision, text, action
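+ # In short: the action queries first attend over vision (context_av) and
+ # text (context_at); the vision-derived context is injected into the text
+ # keys/values and the text-derived context into the vision keys/values,
+ # after which each stream runs its own attention.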
+
+
+class BertEntOutput(nn.Layer):
+ def __init__(
+ self,
+ bi_hidden_size,
+ hidden_size,
+ v_hidden_size,
+ v_hidden_dropout_prob,
+ hidden_dropout_prob,
+ ):
+ super(BertEntOutput, self).__init__()
+
+ self.dense1 = nn.Linear(bi_hidden_size, v_hidden_size)
+ self.LayerNorm1 = nn.LayerNorm(v_hidden_size, epsilon=1e-12)
+ self.dropout1 = nn.Dropout(v_hidden_dropout_prob)
+
+ self.dense2 = nn.Linear(bi_hidden_size, hidden_size)
+ self.LayerNorm2 = nn.LayerNorm(hidden_size, epsilon=1e-12)
+ self.dropout2 = nn.Dropout(hidden_dropout_prob)
+
+ self.dense3 = nn.Linear(bi_hidden_size, hidden_size)
+ self.LayerNorm3 = nn.LayerNorm(hidden_size, epsilon=1e-12)
+ self.dropout3 = nn.Dropout(hidden_dropout_prob)
+
+ def forward(
+ self,
+ hidden_states1,
+ input_tensor1,
+ hidden_states2,
+ input_tensor2,
+ hidden_states3,
+ input_tensor3,
+ ):
+ context_state1 = self.dense1(hidden_states1)
+ context_state1 = self.dropout1(context_state1)
+
+ context_state2 = self.dense2(hidden_states2)
+ context_state2 = self.dropout2(context_state2)
+
+ context_state3 = self.dense3(hidden_states3)
+ context_state3 = self.dropout3(context_state3)
+
+ hidden_states1 = self.LayerNorm1(context_state1 + input_tensor1)
+ hidden_states2 = self.LayerNorm2(context_state2 + input_tensor2)
+ hidden_states3 = self.LayerNorm3(context_state3 + input_tensor3)
+
+ return hidden_states1, hidden_states2, hidden_states3
+
+
+class BertLayer(nn.Layer):
+ def __init__(self, hidden_size, intermediate_size, hidden_act,
+ hidden_dropout_prob, num_attention_heads,
+ attention_probs_dropout_prob):
+ super(BertLayer, self).__init__()
+ self.attention = BertAttention(hidden_size, hidden_dropout_prob,
+ num_attention_heads,
+ attention_probs_dropout_prob)
+ self.intermediate = BertIntermediate(hidden_size, intermediate_size,
+ hidden_act)
+ self.output = BertOutput(intermediate_size, hidden_size,
+ hidden_dropout_prob)
+
+ def forward(self, hidden_states, attention_mask):
+ attention_output, attention_probs = self.attention(
+ hidden_states, attention_mask)
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ return layer_output, attention_probs
+
+
+class BertConnectionLayer(nn.Layer):
+ def __init__(self, hidden_size, v_hidden_size, a_hidden_size,
+ bi_hidden_size, bi_num_attention_heads,
+ attention_probs_dropout_prob, v_attention_probs_dropout_prob,
+ a_attention_probs_dropout_prob,
+ av_attention_probs_dropout_prob,
+ at_attention_probs_dropout_prob, intermediate_size,
+ v_intermediate_size, a_intermediate_size, hidden_act,
+ v_hidden_act, a_hidden_act, hidden_dropout_prob,
+ v_hidden_dropout_prob, a_hidden_dropout_prob):
+ super(BertConnectionLayer, self).__init__()
+ self.ent_attention = BertEntAttention(
+ hidden_size,
+ v_hidden_size,
+ a_hidden_size,
+ bi_hidden_size,
+ attention_probs_dropout_prob,
+ v_attention_probs_dropout_prob,
+ a_attention_probs_dropout_prob,
+ av_attention_probs_dropout_prob,
+ at_attention_probs_dropout_prob,
+ bi_num_attention_heads,
+ )
+
+ self.ent_output = BertEntOutput(
+ bi_hidden_size,
+ hidden_size,
+ v_hidden_size,
+ v_hidden_dropout_prob,
+ hidden_dropout_prob,
+ )
+
+ self.v_intermediate = BertIntermediate(v_hidden_size,
+ v_intermediate_size,
+ v_hidden_act)
+ self.v_output = BertOutput(v_intermediate_size, v_hidden_size,
+ v_hidden_dropout_prob)
+
+ self.t_intermediate = BertIntermediate(hidden_size, intermediate_size,
+ hidden_act)
+ self.t_output = BertOutput(intermediate_size, hidden_size,
+ hidden_dropout_prob)
+
+ self.a_intermediate = BertIntermediate(a_hidden_size,
+ a_intermediate_size,
+ a_hidden_act)
+ self.a_output = BertOutput(a_intermediate_size, a_hidden_size,
+ a_hidden_dropout_prob)
+
+ def forward(
+ self,
+ input_tensor1,
+ attention_mask1,
+ input_tensor2,
+ attention_mask2,
+ input_tensor3,
+ attention_mask3,
+ ):
+
+ ent_output1, ent_output2, ent_output3 = self.ent_attention(
+ input_tensor1, attention_mask1, input_tensor2, attention_mask2,
+ input_tensor3, attention_mask3)
+
+ attention_output1, attention_output2, attention_output3 = self.ent_output(
+ ent_output1, input_tensor1, ent_output2, input_tensor2, ent_output3,
+ input_tensor3)
+
+ intermediate_output1 = self.v_intermediate(attention_output1)
+ layer_output1 = self.v_output(intermediate_output1, attention_output1)
+
+ intermediate_output2 = self.t_intermediate(attention_output2)
+ layer_output2 = self.t_output(intermediate_output2, attention_output2)
+
+ intermediate_output3 = self.a_intermediate(attention_output3)
+ layer_output3 = self.a_output(intermediate_output3, attention_output3)
+
+ return layer_output1, layer_output2, layer_output3
+
+
+class BertEncoder(nn.Layer):
+ """
+ ActBERT encoder: three pathways of stacked BertLayers (text, vision, action), tied together by BertConnectionLayers.
+ """
+ def __init__(
+ self,
+ v_ent_attention_id,
+ t_ent_attention_id,
+ a_ent_attention_id,
+ fixed_t_layer,
+ fixed_v_layer,
+ hidden_size,
+ v_hidden_size,
+ a_hidden_size,
+ bi_hidden_size,
+ intermediate_size,
+ v_intermediate_size,
+ a_intermediate_size,
+ hidden_act,
+ v_hidden_act,
+ a_hidden_act,
+ hidden_dropout_prob,
+ v_hidden_dropout_prob,
+ a_hidden_dropout_prob,
+ attention_probs_dropout_prob,
+ v_attention_probs_dropout_prob,
+ a_attention_probs_dropout_prob,
+ av_attention_probs_dropout_prob,
+ at_attention_probs_dropout_prob,
+ num_attention_heads,
+ v_num_attention_heads,
+ a_num_attention_heads,
+ bi_num_attention_heads,
+ num_hidden_layers,
+ v_num_hidden_layers,
+ a_num_hidden_layers,
+ ):
+ super(BertEncoder, self).__init__()
+ self.v_ent_attention_id = v_ent_attention_id
+ self.t_ent_attention_id = t_ent_attention_id
+ self.a_ent_attention_id = a_ent_attention_id
+ self.fixed_t_layer = fixed_t_layer
+ self.fixed_v_layer = fixed_v_layer
+
+ layer = BertLayer(hidden_size, intermediate_size, hidden_act,
+ hidden_dropout_prob, num_attention_heads,
+ attention_probs_dropout_prob)
+ v_layer = BertLayer(v_hidden_size, v_intermediate_size, v_hidden_act,
+ v_hidden_dropout_prob, v_num_attention_heads,
+ v_attention_probs_dropout_prob)
+ a_layer = BertLayer(a_hidden_size, a_intermediate_size, a_hidden_act,
+ a_hidden_dropout_prob, a_num_attention_heads,
+ a_attention_probs_dropout_prob)
+ connect_layer = BertConnectionLayer(
+ hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size,
+ bi_num_attention_heads, attention_probs_dropout_prob,
+ v_attention_probs_dropout_prob, a_attention_probs_dropout_prob,
+ av_attention_probs_dropout_prob, at_attention_probs_dropout_prob,
+ intermediate_size, v_intermediate_size, a_intermediate_size,
+ hidden_act, v_hidden_act, a_hidden_act, hidden_dropout_prob,
+ v_hidden_dropout_prob, a_hidden_dropout_prob)
+
+ self.layer = nn.LayerList(
+ [copy.deepcopy(layer) for _ in range(num_hidden_layers)]) #12
+ self.v_layer = nn.LayerList(
+ [copy.deepcopy(v_layer) for _ in range(v_num_hidden_layers)]) #2
+ self.a_layer = nn.LayerList(
+ [copy.deepcopy(a_layer) for _ in range(a_num_hidden_layers)]) #3
+ self.c_layer = nn.LayerList([
+ copy.deepcopy(connect_layer) for _ in range(len(v_ent_attention_id))
+ ] #2 [0,1]
+ )
+
+ def forward(
+ self,
+ txt_embedding,
+ image_embedding,
+ action_embedding,
+ txt_attention_mask,
+ image_attention_mask,
+ action_attention_mask,
+ output_all_encoded_layers=True,
+ ):
+ v_start, a_start, t_start = 0, 0, 0
+ count = 0
+ all_encoder_layers_t = []
+ all_encoder_layers_v = []
+ all_encoder_layers_a = []
+
+ for v_layer_id, a_layer_id, t_layer_id in zip(self.v_ent_attention_id,
+ self.a_ent_attention_id,
+ self.t_ent_attention_id):
+ v_end = v_layer_id
+ a_end = a_layer_id
+ t_end = t_layer_id
+
+ assert self.fixed_t_layer <= t_end
+ assert self.fixed_v_layer <= v_end
+
+ ### region embedding
+ for idx in range(v_start,
+ self.fixed_v_layer): # front layers are kept frozen; with fixed_v_layer == 0 this loop is never entered
+ with paddle.no_grad():
+ image_embedding, image_attention_probs = self.v_layer[idx](
+ image_embedding, image_attention_mask)
+ v_start = self.fixed_v_layer
+ for idx in range(v_start, v_end):
+ image_embedding, image_attention_probs = self.v_layer[idx](
+ image_embedding, image_attention_mask)
+
+ ### action embedding
+ for idx in range(a_start, a_end):
+ action_embedding, action_attention_probs = self.a_layer[idx](
+ action_embedding, action_attention_mask)
+
+ ### text embedding
+ for idx in range(t_start, self.fixed_t_layer):
+ with paddle.no_grad():
+ txt_embedding, txt_attention_probs = self.layer[idx](
+ txt_embedding, txt_attention_mask)
+ t_start = self.fixed_t_layer
+ for idx in range(t_start, t_end):
+ txt_embedding, txt_attention_probs = self.layer[idx](
+ txt_embedding, txt_attention_mask)
+
+ image_embedding, txt_embedding, action_embedding = self.c_layer[
+ count](image_embedding, image_attention_mask, txt_embedding,
+ txt_attention_mask, action_embedding,
+ action_attention_mask)
+
+ v_start = v_end
+ t_start = t_end
+ a_start = a_end
+ count += 1
+
+ if output_all_encoded_layers:
+ all_encoder_layers_t.append(txt_embedding)
+ all_encoder_layers_v.append(image_embedding)
+ all_encoder_layers_a.append(action_embedding)
+
+ for idx in range(v_start, len(self.v_layer)): # 1
+ image_embedding, image_attention_probs = self.v_layer[idx](
+ image_embedding, image_attention_mask)
+
+ for idx in range(a_start, len(self.a_layer)):
+ action_embedding, action_attention_probs = self.a_layer[idx](
+ action_embedding, action_attention_mask)
+
+ for idx in range(t_start, len(self.layer)):
+ txt_embedding, txt_attention_probs = self.layer[idx](
+ txt_embedding, txt_attention_mask)
+
+ # collect the final outputs when intermediate layers were not requested.
+ if not output_all_encoded_layers:
+ all_encoder_layers_t.append(txt_embedding) #8, 36, 768
+ all_encoder_layers_v.append(image_embedding) #8, 37, 1024
+ all_encoder_layers_a.append(action_embedding) #8, 5, 768
+
+ return all_encoder_layers_t, all_encoder_layers_v, all_encoder_layers_a
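+ # One co-attention round: each stream runs its own BertLayers up to the
+ # next *_ent_attention_id, then a BertConnectionLayer exchanges
+ # information across the text / vision / action pathways; any remaining
+ # layers are applied after the last round.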
+
+
+class BertPooler(nn.Layer):
+ """ "Pool" the model by simply taking the hidden state corresponding
+ to the first token.
+ """
+ def __init__(self, hidden_size, bi_hidden_size):
+ super(BertPooler, self).__init__()
+ self.dense = nn.Linear(hidden_size, bi_hidden_size)
+ self.activation = nn.ReLU()
+
+ def forward(self, hidden_states):
+ first_token_tensor = hidden_states[:, 0] #8, 768
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+class BertModel(nn.Layer):
+ def __init__(
+ self,
+ vocab_size,
+ max_position_embeddings,
+ type_vocab_size,
+ v_feature_size,
+ a_feature_size,
+ num_hidden_layers,
+ v_num_hidden_layers,
+ a_num_hidden_layers,
+ v_ent_attention_id,
+ t_ent_attention_id,
+ a_ent_attention_id,
+ fixed_t_layer,
+ fixed_v_layer,
+ hidden_size,
+ v_hidden_size,
+ a_hidden_size,
+ bi_hidden_size,
+ intermediate_size,
+ v_intermediate_size,
+ a_intermediate_size,
+ hidden_act,
+ v_hidden_act,
+ a_hidden_act,
+ hidden_dropout_prob,
+ v_hidden_dropout_prob,
+ a_hidden_dropout_prob,
+ attention_probs_dropout_prob,
+ v_attention_probs_dropout_prob,
+ a_attention_probs_dropout_prob,
+ av_attention_probs_dropout_prob,
+ at_attention_probs_dropout_prob,
+ num_attention_heads,
+ v_num_attention_heads,
+ a_num_attention_heads,
+ bi_num_attention_heads,
+ ):
+ super(BertModel, self).__init__()
+ # initialize the word embedding
+ self.embeddings = BertEmbeddings(vocab_size, max_position_embeddings,
+ type_vocab_size, hidden_size,
+ hidden_dropout_prob)
+ # initialize the region embedding
+ self.v_embeddings = BertImageEmbeddings(v_feature_size, v_hidden_size,
+ v_hidden_dropout_prob)
+ # initialize the action embedding
+ self.a_embeddings = BertActionEmbeddings(a_feature_size, a_hidden_size,
+ a_hidden_dropout_prob)
+
+ self.encoder = BertEncoder(
+ v_ent_attention_id, t_ent_attention_id, a_ent_attention_id,
+ fixed_t_layer, fixed_v_layer, hidden_size, v_hidden_size,
+ a_hidden_size, bi_hidden_size, intermediate_size,
+ v_intermediate_size, a_intermediate_size, hidden_act, v_hidden_act,
+ a_hidden_act, hidden_dropout_prob, v_hidden_dropout_prob,
+ a_hidden_dropout_prob, attention_probs_dropout_prob,
+ v_attention_probs_dropout_prob, a_attention_probs_dropout_prob,
+ av_attention_probs_dropout_prob, at_attention_probs_dropout_prob,
+ num_attention_heads, v_num_attention_heads, a_num_attention_heads,
+ bi_num_attention_heads, num_hidden_layers, v_num_hidden_layers,
+ a_num_hidden_layers)
+
+ self.t_pooler = BertPooler(hidden_size, bi_hidden_size)
+ self.v_pooler = BertPooler(v_hidden_size, bi_hidden_size)
+ self.a_pooler = BertPooler(a_hidden_size, bi_hidden_size)
+
+ def forward(
+ self,
+ text_ids,
+ action_feat,
+ image_feat,
+ image_loc,
+ token_type_ids=None,
+ text_mask=None,
+ image_mask=None,
+ action_mask=None,
+ output_all_encoded_layers=False,
+ ):
+ """
+ text_ids: input text ids. Shape: [batch_size, sequence_length]
+ action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim]
+ image_feat: input image feature. Shape: [batch_size, region_length, image_feature_dim]
+ image_loc: input region location. Shape: [batch_size, region_length, region_location_dim]
+ token_type_ids: segment ids of each video clip. Shape: [batch_size, sequence_length]
+ text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, sequence_length]
+ image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length]
+ action_mask: action mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, action_length]
+ output_all_encoded_layers: whether to return the features of all encoder layers. Type: bool.
+ """
+ if text_mask is None:
+ text_mask = paddle.ones_like(text_ids)
+ if token_type_ids is None:
+ token_type_ids = paddle.zeros_like(text_ids)
+ if image_mask is None:
+ image_mask = paddle.ones(
+ [image_feat.shape[0], image_feat.shape[1]]).astype(text_ids.dtype)
+ if action_mask is None:
+ action_mask = paddle.ones(
+ [action_feat.shape[0], action_feat.shape[1]]).astype(text_ids.dtype)
+
+ # We create a 3D attention mask from a 2D tensor mask.
+ # Sizes are [batch_size, 1, 1, to_seq_length]
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length].
+ extended_text_mask = text_mask.unsqueeze(1).unsqueeze(2)
+ extended_image_mask = image_mask.unsqueeze(1).unsqueeze(2)
+ extended_action_mask = action_mask.unsqueeze(1).unsqueeze(2)
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and -10000.0 for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
+ def set_mask(extended_attention_mask):
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+ return extended_attention_mask
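+ # e.g. a text_mask row [1, 1, 1, 0] becomes [0., 0., 0., -10000.], so
+ # padded positions receive a negligible softmax weight in every layer.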
+
+ extended_text_mask = set_mask(extended_text_mask)
+ extended_image_mask = set_mask(extended_image_mask)
+ extended_action_mask = set_mask(extended_action_mask)
+
+ t_embedding_output = self.embeddings(text_ids, token_type_ids)
+ v_embedding_output = self.v_embeddings(image_feat, image_loc)
+ a_embedding_output = self.a_embeddings(action_feat)
+
+ # var = [t_embedding_output, v_embedding_output, a_embedding_output]
+ # import numpy as np
+ # for i, item in enumerate(var):
+ # np.save('tmp/' + str(i)+'.npy', item.numpy())
+
+ encoded_layers_t, encoded_layers_v, encoded_layers_a = self.encoder(
+ t_embedding_output,
+ v_embedding_output,
+ a_embedding_output,
+ extended_text_mask,
+ extended_image_mask,
+ extended_action_mask,
+ output_all_encoded_layers=output_all_encoded_layers,
+ )
+
+ sequence_output_t = encoded_layers_t[-1] #get item from list
+ sequence_output_v = encoded_layers_v[-1]
+ sequence_output_a = encoded_layers_a[-1]
+
+ pooled_output_t = self.t_pooler(sequence_output_t)
+ pooled_output_v = self.v_pooler(sequence_output_v)
+ pooled_output_a = self.a_pooler(sequence_output_a)
+
+ if not output_all_encoded_layers:
+ encoded_layers_t = encoded_layers_t[-1]
+ encoded_layers_v = encoded_layers_v[-1]
+ encoded_layers_a = encoded_layers_a[-1]
+
+ return encoded_layers_t, encoded_layers_v, encoded_layers_a, \
+ pooled_output_t, pooled_output_v, pooled_output_a
+
+
+# For Head
+class BertPredictionHeadTransform(nn.Layer):
+ def __init__(self, hidden_size, hidden_act):
+ super(BertPredictionHeadTransform, self).__init__()
+ self.dense = nn.Linear(hidden_size, hidden_size)
+ if isinstance(hidden_act, str):
+ self.transform_act_fn = ACT2FN[hidden_act]
+ else:
+ self.transform_act_fn = hidden_act
+ self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+ return hidden_states
+
+
+class BertLMPredictionHead(nn.Layer):
+ def __init__(self, hidden_size, hidden_act, bert_model_embedding_weights):
+ super(BertLMPredictionHead, self).__init__()
+ self.transform = BertPredictionHeadTransform(hidden_size, hidden_act)
+
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
+ assert bert_model_embedding_weights.shape[1] == hidden_size
+ vocab_size = bert_model_embedding_weights.shape[0]
+
+ # another implementation which would create another big params:
+ # self.decoder = nn.Linear(hidden_size, vocab_size) # NOTE bias default: constant 0.0
+ # self.decoder.weight = self.create_parameter(shape=[hidden_size, vocab_size],
+ # default_initializer=nn.initializer.Assign(
+ # bert_model_embedding_weights.t())) # transpose
+
+ self.decoder_weight = bert_model_embedding_weights
+ self.decoder_bias = self.create_parameter(
+ shape=[vocab_size],
+ dtype=bert_model_embedding_weights.dtype,
+ is_bias=True) # NOTE bias default: constant 0.0
+
+ def forward(self, hidden_states):
+ hidden_states = self.transform(hidden_states)
+ hidden_states = paddle.tensor.matmul(
+ hidden_states, self.decoder_weight,
+ transpose_y=True) + self.decoder_bias
+ return hidden_states
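+ # Weight-tying note: decoder_weight is the [vocab_size, hidden_size] word
+ # embedding matrix, so matmul(..., transpose_y=True) maps hidden states of
+ # shape [B, L, hidden_size] to vocabulary logits [B, L, vocab_size]
+ # without a separate output projection matrix.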
+
+
+class BertImageActionPredictionHead(nn.Layer):
+ def __init__(self, hidden_size, hidden_act, target_size):
+ super(BertImageActionPredictionHead, self).__init__()
+ self.transform = BertPredictionHeadTransform(hidden_size, hidden_act)
+
+ self.decoder = nn.Linear(hidden_size, target_size)
+
+ def forward(self, hidden_states):
+ hidden_states = self.transform(hidden_states)
+ hidden_states = self.decoder(hidden_states)
+ return hidden_states
+
+
+class BertPreTrainingHeads(nn.Layer):
+ def __init__(self, hidden_size, v_hidden_size, a_hidden_size,
+ bi_hidden_size, hidden_act, v_hidden_act, a_hidden_act,
+ v_target_size, a_target_size, fusion_method,
+ bert_model_embedding_weights):
+ super(BertPreTrainingHeads, self).__init__()
+ self.predictions = BertLMPredictionHead(hidden_size, hidden_act,
+ bert_model_embedding_weights)
+ self.seq_relationship = nn.Linear(bi_hidden_size, 2)
+ self.imagePredictions = BertImageActionPredictionHead(
+ v_hidden_size, v_hidden_act, v_target_size) # visual class number
+ self.actionPredictions = BertImageActionPredictionHead(
+ a_hidden_size, a_hidden_act, a_target_size) # action class number
+ self.fusion_method = fusion_method
+ self.dropout = nn.Dropout(0.1)
+
+ def forward(self, sequence_output_t, sequence_output_v, sequence_output_a,
+ pooled_output_t, pooled_output_v, pooled_output_a):
+
+ if self.fusion_method == 'sum':
+ pooled_output = self.dropout(pooled_output_t + pooled_output_v +
+ pooled_output_a)
+ elif self.fusion_method == 'mul':
+ pooled_output = self.dropout(pooled_output_t * pooled_output_v +
+ pooled_output_a)
+ else:
+ raise ValueError(
+ "unsupported fusion_method: {}".format(self.fusion_method))
+
+ prediction_scores_t = self.predictions(
+ sequence_output_t) # 8, 36 ,30522
+ seq_relationship_score = self.seq_relationship(pooled_output) # 8, 2
+ prediction_scores_v = self.imagePredictions(
+ sequence_output_v) # 8, 37, 1601
+ prediction_scores_a = self.actionPredictions(
+ sequence_output_a) # 8, 5, 401
+
+ return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score
+
+
+@BACKBONES.register()
+class BertForMultiModalPreTraining(nn.Layer):
+ """BERT model with multi modal pre-training heads.
+ """
+ def __init__(
+ self,
+ vocab_size=30522,
+ max_position_embeddings=512,
+ type_vocab_size=2,
+ v_target_size=1601,
+ a_target_size=700,
+ v_feature_size=2048,
+ a_feature_size=2048,
+ num_hidden_layers=12,
+ v_num_hidden_layers=2,
+ a_num_hidden_layers=3,
+ t_ent_attention_id=[10, 11],
+ v_ent_attention_id=[0, 1],
+ a_ent_attention_id=[0, 1],
+ fixed_t_layer=0,
+ fixed_v_layer=0,
+ hidden_size=768,
+ v_hidden_size=1024,
+ a_hidden_size=768,
+ bi_hidden_size=1024,
+ intermediate_size=3072,
+ v_intermediate_size=1024,
+ a_intermediate_size=3072,
+ hidden_act="gelu",
+ v_hidden_act="gelu",
+ a_hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ v_hidden_dropout_prob=0.1,
+ a_hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ v_attention_probs_dropout_prob=0.1,
+ a_attention_probs_dropout_prob=0.1,
+ av_attention_probs_dropout_prob=0.1,
+ at_attention_probs_dropout_prob=0.1,
+ num_attention_heads=12,
+ v_num_attention_heads=8,
+ a_num_attention_heads=12,
+ bi_num_attention_heads=8,
+ fusion_method="mul",
+ pretrained=None,
+ ):
+ """
+ vocab_size: vocabulary size. Default: 30522.
+ max_position_embeddings: max position id. Default: 512.
+ type_vocab_size: max segment id. Default: 2.
+ v_target_size: class number of visual word. Default: 1601.
+ a_target_size: class number of action word. Default: 700.
+ v_feature_size: input visual feature dimension. Default: 2048.
+ a_feature_size: input action feature dimension. Default: 2048.
+ num_hidden_layers: number of BertLayer in text transformer. Default: 12.
+ v_num_hidden_layers: number of BertLayer in visual transformer. Default: 2.
+ a_num_hidden_layers: number of BertLayer in action transformer. Default: 3.
+ t_ent_attention_id: index id of BertConnectionLayer in text transformer. Default: [10, 11].
+ v_ent_attention_id: index id of BertConnectionLayer in visual transformer. Default: [0, 1].
+ a_ent_attention_id: index id of BertConnectionLayer in action transformer. Default: [0, 1].
+ fixed_t_layer: index id of fixed BertLayer in text transformer. Default: 0.
+ fixed_v_layer: index id of fixed BertLayer in visual transformer. Default: 0.
+ hidden_size: hidden size in text BertLayer. Default: 768.
+ v_hidden_size: hidden size in visual BertLayer. Default: 1024.
+ a_hidden_size: hidden size in action BertLayer. Default: 768.
+ bi_hidden_size: hidden size in BertConnectionLayer. Default: 1024.
+ intermediate_size: intermediate size in text BertLayer. Default: 3072.
+ v_intermediate_size: intermediate size in visual BertLayer. Default: 1024.
+ a_intermediate_size: intermediate size in action BertLayer. Default: 3072.
+ hidden_act: hidden activation function in text BertLayer. Default: "gelu".
+ v_hidden_act: hidden activation function in visual BertLayer. Default: "gelu".
+ a_hidden_act: hidden activation function in action BertLayer. Default: "gelu".
+ hidden_dropout_prob: hidden dropout probability in text Embedding Layer. Default: 0.1
+ v_hidden_dropout_prob: hidden dropout probability in visual Embedding Layer. Default: 0.1
+ a_hidden_dropout_prob: hidden dropout probability in action Embedding Layer. Default: 0.1
+ attention_probs_dropout_prob: attention dropout probability in text BertLayer. Default: 0.1
+ v_attention_probs_dropout_prob: attention dropout probability in visual BertLayer. Default: 0.1
+ a_attention_probs_dropout_prob: attention dropout probability in action BertLayer. Default: 0.1
+ av_attention_probs_dropout_prob: attention dropout probability in action-visual BertConnectionLayer. Default: 0.1
+ at_attention_probs_dropout_prob: attention dropout probability in action-text BertConnectionLayer. Default: 0.1
+ num_attention_heads: number of heads in text BertLayer. Default: 12.
+ v_num_attention_heads: number of heads in visual BertLayer. Default: 8.
+ a_num_attention_heads: number of heads in action BertLayer. Default: 12.
+ bi_num_attention_heads: number of heads in BertConnectionLayer. Default: 8.
+ fusion_method: method used to fuse the pooled outputs of the 3 transformers. Default: "mul".
+ """
+ super(BertForMultiModalPreTraining, self).__init__()
+ self.pretrained = pretrained
+ self.vocab_size = vocab_size
+ self.a_target_size = a_target_size
+
+ self.bert = BertModel(
+ vocab_size,
+ max_position_embeddings,
+ type_vocab_size,
+ v_feature_size,
+ a_feature_size,
+ num_hidden_layers,
+ v_num_hidden_layers,
+ a_num_hidden_layers,
+ v_ent_attention_id,
+ t_ent_attention_id,
+ a_ent_attention_id,
+ fixed_t_layer,
+ fixed_v_layer,
+ hidden_size,
+ v_hidden_size,
+ a_hidden_size,
+ bi_hidden_size,
+ intermediate_size,
+ v_intermediate_size,
+ a_intermediate_size,
+ hidden_act,
+ v_hidden_act,
+ a_hidden_act,
+ hidden_dropout_prob,
+ v_hidden_dropout_prob,
+ a_hidden_dropout_prob,
+ attention_probs_dropout_prob,
+ v_attention_probs_dropout_prob,
+ a_attention_probs_dropout_prob,
+ av_attention_probs_dropout_prob,
+ at_attention_probs_dropout_prob,
+ num_attention_heads,
+ v_num_attention_heads,
+ a_num_attention_heads,
+ bi_num_attention_heads,
+ )
+ self.cls = BertPreTrainingHeads(
+ hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size,
+ hidden_act, v_hidden_act, a_hidden_act, v_target_size,
+ a_target_size, fusion_method,
+ self.bert.embeddings.word_embeddings.weight)
+
+ def init_weights(self):
+ """Initiate the parameters.
+ """
+ if isinstance(self.pretrained, str) and self.pretrained.strip() != "":
+ load_ckpt(self, self.pretrained)
+ elif self.pretrained is None or self.pretrained.strip() == "":
+ for layer in self.sublayers():
+ if isinstance(layer, (nn.Linear, nn.Embedding)):
+ weight_init_(layer, 'Normal', std=0.02)
+ elif isinstance(layer, nn.LayerNorm):
+ weight_init_(layer, 'Constant', value=1)
+
+ def forward(
+ self,
+ text_ids, #8,36
+ action_feat, #8,5,2048
+ image_feat, #8,37,2048
+ image_loc, #8,37,5
+ token_type_ids=None, #8,36
+ text_mask=None, #8,36
+ image_mask=None, #8,37
+ action_mask=None, #8,5
+ ):
+ """
+ text_ids: input text ids. Shape: [batch_size, sequence_length]
+ action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim]
+ image_feat: input image feature. Shape: [batch_size, region_length+1, image_feature_dim], add 1 for the image global feature.
+ image_loc: input region location. Shape: [batch_size, region_length+1, region_location_dim], add 1 for the image global feature location.
+ token_type_ids: segment ids of each video clip. Shape: [batch_size, sequence_length]
+ text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, sequence_length]
+ image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length]
+ action_mask: action mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, action_length]
+ """
+ sequence_output_t, sequence_output_v, sequence_output_a, \
+ pooled_output_t, pooled_output_v, pooled_output_a = self.bert(
+ text_ids,
+ action_feat,
+ image_feat,
+ image_loc,
+ token_type_ids,
+ text_mask,
+ image_mask,
+ action_mask,
+ output_all_encoded_layers=False,
+ )
+
+ prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = self.cls(
+ sequence_output_t, sequence_output_v, sequence_output_a,
+ pooled_output_t, pooled_output_v, pooled_output_a)
+
+ return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score
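+ # Illustrative call sketch (not part of the original file); dummy inputs
+ # follow the shapes documented above with batch_size=2:
+ # model = BertForMultiModalPreTraining()
+ # text_ids = paddle.randint(0, 30522, [2, 36])
+ # action_feat = paddle.randn([2, 5, 2048])
+ # image_feat = paddle.randn([2, 37, 2048])
+ # image_loc = paddle.randn([2, 37, 5])
+ # t_scores, v_scores, a_scores, seq_score = model(
+ # text_ids, action_feat, image_feat, image_loc)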
diff --git a/paddlevideo/modeling/backbones/adds.py b/paddlevideo/modeling/backbones/adds.py
new file mode 100644
index 0000000000000000000000000000000000000000..21cd212cb23a08fe8985d7ab1b15cb5f8f7596f7
--- /dev/null
+++ b/paddlevideo/modeling/backbones/adds.py
@@ -0,0 +1,1146 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from collections import OrderedDict
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import BatchNorm2D, Conv2D
+from paddle.nn.initializer import Constant, Normal
+from paddle.vision.models import ResNet
+
+from ...utils import load_ckpt
+from ..registry import BACKBONES
+from ..weight_init import kaiming_normal_, _calculate_fan_in_and_fan_out
+
+zeros_ = Constant(value=0.)
+ones_ = Constant(value=1.)
+normal_ = Normal(mean=0, std=1e-3)
+
+
+def disp_to_depth(disp, min_depth, max_depth):
+ """Convert network's sigmoid output into depth prediction
+ The formula for this conversion is given in the 'additional considerations'
+ section of the paper.
+ """
+ min_disp = 1 / max_depth
+ max_disp = 1 / min_depth
+ scaled_disp = min_disp + (max_disp - min_disp) * disp
+ depth = 1 / scaled_disp
+ return scaled_disp, depth
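+ # e.g. with min_depth=0.1 and max_depth=100: a sigmoid output of 0 maps to
+ # depth 100, an output of 1 maps to depth 0.1, and values in between
+ # interpolate linearly in inverse-depth (disparity) space.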
+
+
+def gram_matrix(y):
+ (b, ch, h, w) = y.shape
+ features = y.reshape([b, ch, w * h])
+ features_t = paddle.transpose(features, [0, 2, 1])
+ gram = features.bmm(features_t) / (ch * h * w)
+ return gram
+
+
+def convt_bn_relu(in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ padding=0,
+ output_padding=0,
+ bn=True,
+ relu=True):
+ bias = not bn
+ layers = []
+ layers.append(
+ nn.Conv2DTranspose(in_channels,
+ out_channels,
+ kernel_size,
+ stride,
+ padding,
+ output_padding,
+ bias_attr=bias))
+ if bn:
+ layers.append(nn.BatchNorm2D(out_channels))
+
+ if relu:
+ layers.append(nn.LeakyReLU(0.2))
+ layers = nn.Sequential(*layers)
+
+ # initialize the weights
+ for m in layers.sublayers(include_self=True):
+ if isinstance(m, nn.Conv2DTranspose):
+ normal_(m.weight)
+ if m.bias is not None:
+ zeros_(m.bias)
+ elif isinstance(m, nn.BatchNorm2D):
+ ones_(m.weight)
+ zeros_(m.bias)
+ return layers
+
+
+def transformation_from_parameters(axisangle, translation, invert=False):
+ """Convert the network's (axisangle, translation) output into a 4x4 matrix
+ """
+ R = rot_from_axisangle(axisangle)
+ t = translation.clone()
+
+ if invert:
+ R = R.transpose([0, 2, 1])
+ t *= -1
+
+ T = get_translation_matrix(t)
+
+ if invert:
+ M = paddle.matmul(R, T)
+ else:
+ M = paddle.matmul(T, R)
+
+ return M
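+ # The forward transform is T @ R (rotate, then translate). With
+ # invert=True the transposed rotation and negated translation are
+ # composed as R^T @ T(-t), which is the inverse of the original matrix.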
+
+
+def get_translation_matrix(translation_vector):
+ """Convert a translation vector into a 4x4 transformation matrix
+ """
+ t = translation_vector.reshape([-1, 3, 1])
+ gather_object = paddle.stack([
+ paddle.zeros([
+ translation_vector.shape[0],
+ ], paddle.float32),
+ paddle.ones([
+ translation_vector.shape[0],
+ ], paddle.float32),
+ paddle.squeeze(t[:, 0], axis=-1),
+ paddle.squeeze(t[:, 1], axis=-1),
+ paddle.squeeze(t[:, 2], axis=-1),
+ ])
+ gather_index = paddle.to_tensor([
+ [1],
+ [0],
+ [0],
+ [2],
+ [0],
+ [1],
+ [0],
+ [3],
+ [0],
+ [0],
+ [1],
+ [4],
+ [0],
+ [0],
+ [0],
+ [1],
+ ])
+ T = paddle.gather_nd(gather_object, gather_index)
+ T = T.reshape([4, 4, -1]).transpose((2, 0, 1))
+ return T
+
+
+def rot_from_axisangle(vec):
+ """Convert an axisangle rotation into a 4x4 transformation matrix
+ (adapted from https://github.com/Wallacoloo/printipi)
+ Input 'vec' has to be Bx1x3
+ """
+ angle = paddle.norm(vec, 2, 2, True)
+ axis = vec / (angle + 1e-7)
+
+ ca = paddle.cos(angle)
+ sa = paddle.sin(angle)
+ C = 1 - ca
+
+ x = axis[..., 0].unsqueeze(1)
+ y = axis[..., 1].unsqueeze(1)
+ z = axis[..., 2].unsqueeze(1)
+
+ xs = x * sa
+ ys = y * sa
+ zs = z * sa
+ xC = x * C
+ yC = y * C
+ zC = z * C
+ xyC = x * yC
+ yzC = y * zC
+ zxC = z * xC
+
+ gather_object = paddle.stack([
+ paddle.squeeze(x * xC + ca, axis=(-1, -2)),
+ paddle.squeeze(xyC - zs, axis=(-1, -2)),
+ paddle.squeeze(zxC + ys, axis=(-1, -2)),
+ paddle.squeeze(xyC + zs, axis=(-1, -2)),
+ paddle.squeeze(y * yC + ca, axis=(-1, -2)),
+ paddle.squeeze(yzC - xs, axis=(-1, -2)),
+ paddle.squeeze(zxC - ys, axis=(-1, -2)),
+ paddle.squeeze(yzC + xs, axis=(-1, -2)),
+ paddle.squeeze(z * zC + ca, axis=(-1, -2)),
+ paddle.ones([
+ vec.shape[0],
+ ], dtype=paddle.float32),
+ paddle.zeros([
+ vec.shape[0],
+ ], dtype=paddle.float32)
+ ])
+ gather_index = paddle.to_tensor([
+ [0],
+ [1],
+ [2],
+ [10],
+ [3],
+ [4],
+ [5],
+ [10],
+ [6],
+ [7],
+ [8],
+ [10],
+ [10],
+ [10],
+ [10],
+ [9],
+ ])
+ rot = paddle.gather_nd(gather_object, gather_index)
+ rot = rot.reshape([4, 4, -1]).transpose((2, 0, 1))
+ return rot
+
+
+def upsample(x):
+ """Upsample input tensor by a factor of 2
+ """
+ return F.interpolate(x, scale_factor=2, mode="nearest")
+
+
+def get_smooth_loss(disp, img):
+ """Computes the smoothness loss for a disparity image
+ The color image is used for edge-aware smoothness
+ """
+ grad_disp_x = paddle.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:])
+ grad_disp_y = paddle.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :])
+
+ grad_img_x = paddle.mean(paddle.abs(img[:, :, :, :-1] - img[:, :, :, 1:]),
+ 1,
+ keepdim=True)
+ grad_img_y = paddle.mean(paddle.abs(img[:, :, :-1, :] - img[:, :, 1:, :]),
+ 1,
+ keepdim=True)
+
+ grad_disp_x *= paddle.exp(-grad_img_x)
+ grad_disp_y *= paddle.exp(-grad_img_y)
+
+ return grad_disp_x.mean() + grad_disp_y.mean()
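+ # The exp(-|image gradient|) factors down-weight disparity gradients at
+ # image edges, so smoothness is mainly enforced in textureless regions.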
+
+
+def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
+ """3x3 convolution with padding"""
+ return nn.Conv2D(in_planes,
+ out_planes,
+ kernel_size=3,
+ stride=stride,
+ padding=dilation,
+ groups=groups,
+ bias_attr=False,
+ dilation=dilation)
+
+
+def conv1x1(in_planes, out_planes, stride=1):
+ """1x1 convolution"""
+ return nn.Conv2D(in_planes,
+ out_planes,
+ kernel_size=1,
+ stride=stride,
+ bias_attr=False)
+
+
+def resnet_multiimage_input(num_layers, num_input_images=1):
+ """Constructs a ResNet model.
+ Args:
+ num_layers (int): Number of resnet layers. Must be 18 or 50.
+ num_input_images (int): Number of frames stacked as input.
+ """
+ assert num_layers in [18, 50], "Can only run with 18 or 50 layer resnet"
+ blocks = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers]
+
+ block_type = {18: BasicBlock, 50: Bottleneck}[num_layers]
+
+ model = ResNetMultiImageInput(block_type,
+ num_layers,
+ blocks,
+ num_input_images=num_input_images)
+ model.init_weights()
+ return model
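+ # Hypothetical usage sketch: a pose encoder over a pair of stacked RGB
+ # frames, so conv1 expects 2 * 3 = 6 input channels:
+ # pose_encoder = resnet_multiimage_input(18, num_input_images=2)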
+
+
+class ConvBlock(nn.Layer):
+ """Layer to perform a convolution followed by ELU
+ """
+ def __init__(self, in_channels, out_channels):
+ super(ConvBlock, self).__init__()
+
+ self.conv = Conv3x3(in_channels, out_channels)
+ self.nonlin = nn.ELU()
+
+ def forward(self, x):
+ out = self.conv(x)
+ out = self.nonlin(out)
+ return out
+
+
+class Conv3x3(nn.Layer):
+ """Layer to pad and convolve input
+ """
+ def __init__(self, in_channels, out_channels, use_refl=True):
+ super(Conv3x3, self).__init__()
+
+ if use_refl:
+ self.pad = nn.Pad2D(1, mode='reflect')
+ else:
+ self.pad = nn.Pad2D(1)
+ self.conv = nn.Conv2D(int(in_channels), int(out_channels), 3)
+
+ def forward(self, x):
+ out = self.pad(x)
+ out = self.conv(out)
+ return out
+
+
+class BackprojectDepth(nn.Layer):
+ """Layer to transform a depth image into a point cloud
+ """
+ def __init__(self, batch_size, height, width):
+ super(BackprojectDepth, self).__init__()
+
+ self.batch_size = batch_size
+ self.height = height
+ self.width = width
+
+ meshgrid = np.meshgrid(range(self.width),
+ range(self.height),
+ indexing='xy')
+ id_coords = np.stack(meshgrid, axis=0).astype(np.float32)
+ self.id_coords = self.create_parameter(shape=list(id_coords.shape),
+ dtype=paddle.float32)
+ self.id_coords.set_value(id_coords)
+ self.add_parameter("id_coords", self.id_coords)
+ self.id_coords.stop_gradient = True
+
+ self.ones = self.create_parameter(
+ shape=[self.batch_size, 1, self.height * self.width],
+ default_initializer=ones_)
+ self.add_parameter("ones", self.ones)
+ self.ones.stop_gradient = True
+
+ pix_coords = paddle.unsqueeze(
+ paddle.stack([
+ self.id_coords[0].reshape([
+ -1,
+ ]), self.id_coords[1].reshape([
+ -1,
+ ])
+ ], 0), 0)
+ pix_coords = pix_coords.tile([batch_size, 1, 1])
+ pix_coords = paddle.concat([pix_coords, self.ones], 1)
+ self.pix_coords = self.create_parameter(shape=list(pix_coords.shape), )
+ self.pix_coords.set_value(pix_coords)
+ self.add_parameter("pix_coords", self.pix_coords)
+ self.pix_coords.stop_gradient = True
+
+ def forward(self, depth, inv_K):
+ cam_points = paddle.matmul(inv_K[:, :3, :3], self.pix_coords)
+ cam_points = depth.reshape([self.batch_size, 1, -1]) * cam_points
+ cam_points = paddle.concat([cam_points, self.ones], 1)
+
+ return cam_points
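+ # cam_points has shape [B, 4, H*W]: homogeneous 3D points computed as
+ # depth * K^-1 * pixel_coords, with a row of ones appended.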
+
+
+class Project3D(nn.Layer):
+ """Layer which projects 3D points into a camera with intrinsics K and at position T
+ """
+ def __init__(self, batch_size, height, width, eps=1e-7):
+ super(Project3D, self).__init__()
+
+ self.batch_size = batch_size
+ self.height = height
+ self.width = width
+ self.eps = eps
+
+ def forward(self, points, K, T):
+ P = paddle.matmul(K, T)[:, :3, :]
+
+ cam_points = paddle.matmul(P, points)
+
+ pix_coords = cam_points[:, :2, :] / (cam_points[:, 2, :].unsqueeze(1) +
+ self.eps)
+ pix_coords = pix_coords.reshape(
+ [self.batch_size, 2, self.height, self.width])
+ pix_coords = pix_coords.transpose([0, 2, 3, 1])
+ pix_coords[..., 0] /= self.width - 1
+ pix_coords[..., 1] /= self.height - 1
+ pix_coords = (pix_coords - 0.5) * 2
+ return pix_coords
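+ # The resulting pix_coords are normalised to [-1, 1] on both axes, the
+ # sampling-grid convention expected by F.grid_sample when warping images.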
+
+
+class SSIM(nn.Layer):
+ """Layer to compute the SSIM loss between a pair of images
+ """
+ def __init__(self):
+ super(SSIM, self).__init__()
+ self.mu_x_pool = nn.AvgPool2D(3, 1, exclusive=False)
+ self.mu_y_pool = nn.AvgPool2D(3, 1, exclusive=False)
+ self.sig_x_pool = nn.AvgPool2D(3, 1, exclusive=False)
+ self.sig_y_pool = nn.AvgPool2D(3, 1, exclusive=False)
+ self.sig_xy_pool = nn.AvgPool2D(3, 1, exclusive=False)
+
+ self.refl = nn.Pad2D(1, mode='reflect')
+
+ self.C1 = 0.01**2
+ self.C2 = 0.03**2
+
+ def forward(self, x, y):
+ x = self.refl(x)
+ y = self.refl(y)
+
+ mu_x = self.mu_x_pool(x)
+ mu_y = self.mu_y_pool(y)
+
+ sigma_x = self.sig_x_pool(x**2) - mu_x**2
+ sigma_y = self.sig_y_pool(y**2) - mu_y**2
+ sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y
+
+ SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2)
+ SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2)
+
+ return paddle.clip((1 - SSIM_n / SSIM_d) / 2, 0, 1)
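+
+    # A minimal usage sketch (image shapes are assumptions): both inputs are
+    # [N, C, H, W] tensors in [0, 1]; the output is a per-pixel SSIM-based
+    # dissimilarity map of the same spatial size.
+    #
+    #   ssim = SSIM()
+    #   pred = paddle.rand([2, 3, 96, 320])
+    #   target = paddle.rand([2, 3, 96, 320])
+    #   ssim_loss = ssim(pred, target).mean()   # scalar in [0, 1]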
+
+
+class ResNetMultiImageInput(ResNet):
+ """Constructs a resnet model with varying number of input images.
+    Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
+ """
+ def __init__(self, block, depth, layers, num_input_images=1):
+ super(ResNetMultiImageInput, self).__init__(block, depth)
+ self.inplanes = 64
+ self.conv1 = nn.Conv2D(num_input_images * 3,
+ 64,
+ kernel_size=7,
+ stride=2,
+ padding=3,
+ bias_attr=False)
+ self.bn1 = nn.BatchNorm2D(64)
+ self.relu = nn.ReLU()
+ self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
+ self.layer1 = self._make_layer(block, 64, layers[0])
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+
+ def init_weights(self):
+ for layer in self.sublayers(include_self=True):
+ if isinstance(layer, nn.Conv2D):
+ kaiming_normal_(layer.weight,
+ mode='fan_out',
+ nonlinearity='relu')
+ elif isinstance(layer, nn.BatchNorm2D):
+ ones_(layer.weight)
+ zeros_(layer.bias)
+
+
+class ConvBNLayer(nn.Layer):
+ """Conv2D and BatchNorm2D layer.
+
+ Args:
+ in_channels (int): Number of channels for the input.
+ out_channels (int): Number of channels for the output.
+ kernel_size (int): Kernel size.
+ stride (int): Stride in the Conv2D layer. Default: 1.
+        groups (int): Groups in the Conv2D. Default: 1.
+ act (str): Indicate activation after BatchNorm2D layer.
+ name (str): the name of an instance of ConvBNLayer.
+
+    Note: weight and bias initialization (both the initial values and the
+    names of the restored parameters) is explicitly declared in the
+    ``init_weights`` method.
+
+    """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ groups=1,
+ act=None,
+ name=None):
+ super(ConvBNLayer, self).__init__()
+ self._conv = Conv2D(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=(kernel_size - 1) // 2,
+ groups=groups,
+ bias_attr=False)
+
+ self._act = act
+
+ self._batch_norm = BatchNorm2D(out_channels)
+
+ def forward(self, inputs):
+ y = self._conv(inputs)
+ y = self._batch_norm(y)
+ if self._act:
+ y = getattr(paddle.nn.functional, self._act)(y)
+ return y
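+
+    # A minimal usage sketch (shapes are assumptions): a 3x3 stride-2 conv
+    # followed by BatchNorm and ReLU halves the spatial resolution.
+    #
+    #   layer = ConvBNLayer(in_channels=3, out_channels=16, kernel_size=3,
+    #                       stride=2, act='relu')
+    #   y = layer(paddle.rand([1, 3, 32, 32]))   # shape [1, 16, 16, 16]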
+
+
+class BasicBlock(nn.Layer):
+ expansion = 1
+
+ def __init__(self,
+ inplanes,
+ planes,
+ stride=1,
+ downsample=None,
+ groups=1,
+ base_width=64,
+ dilation=1,
+ norm_layer=None):
+ super(BasicBlock, self).__init__()
+ if norm_layer is None:
+ norm_layer = nn.BatchNorm2D
+ if groups != 1 or base_width != 64:
+ raise ValueError(
+ 'BasicBlock only supports groups=1 and base_width=64')
+ if dilation > 1:
+ raise NotImplementedError(
+ "Dilation > 1 not supported in BasicBlock")
+ # Both self.conv1 and self.downsample layers downsample the input when stride != 1
+ self.conv1 = conv3x3(inplanes, planes, stride)
+ self.bn1 = norm_layer(planes)
+ self.relu = nn.ReLU()
+ self.conv2 = conv3x3(planes, planes)
+ self.bn2 = norm_layer(planes)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ identity = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out += identity
+ out = self.relu(out)
+
+ return out
+
+
+class Bottleneck(nn.Layer):
+ # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
+ # while original implementation places the stride at the first 1x1 convolution(self.conv1)
+    # according to "Deep residual learning for image recognition" https://arxiv.org/abs/1512.03385.
+ # This variant is also known as ResNet V1.5 and improves accuracy according to
+ # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
+
+ expansion = 4
+
+ def __init__(self,
+ inplanes,
+ planes,
+ stride=1,
+ downsample=None,
+ groups=1,
+ base_width=64,
+ dilation=1,
+ norm_layer=None):
+ super(Bottleneck, self).__init__()
+ if norm_layer is None:
+ norm_layer = nn.BatchNorm2D
+ width = int(planes * (base_width / 64.)) * groups
+
+ self.conv1 = conv1x1(inplanes, width)
+ self.bn1 = norm_layer(width)
+ self.conv2 = conv3x3(width, width, stride, groups, dilation)
+ self.bn2 = norm_layer(width)
+ self.conv3 = conv1x1(width, planes * self.expansion)
+ self.bn3 = norm_layer(planes * self.expansion)
+ self.relu = nn.ReLU()
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ identity = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out += identity
+ out = self.relu(out)
+
+ return out
+
+
+class DepthDecoder(nn.Layer):
+ def __init__(self,
+ num_ch_enc,
+ scales=range(4),
+ num_output_channels=1,
+ use_skips=True):
+ super(DepthDecoder, self).__init__()
+
+ self.num_output_channels = num_output_channels
+ self.use_skips = use_skips
+ self.upsample_mode = 'nearest'
+ self.scales = scales
+
+ self.num_ch_enc = num_ch_enc
+ self.num_ch_dec = np.array([16, 32, 64, 128, 256])
+
+ # decoder
+ self.convs = OrderedDict()
+ for i in range(4, -1, -1):
+ # upconv_0
+ num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i +
+ 1]
+ num_ch_out = self.num_ch_dec[i]
+ self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out)
+
+ # upconv_1
+ num_ch_in = self.num_ch_dec[i]
+ if self.use_skips and i > 0:
+ num_ch_in += self.num_ch_enc[i - 1]
+ num_ch_out = self.num_ch_dec[i]
+ self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out)
+
+ for s in self.scales:
+ self.convs[("dispconv", s)] = Conv3x3(self.num_ch_dec[s],
+ self.num_output_channels)
+
+ self.decoder = nn.LayerList(list(self.convs.values()))
+ self.sigmoid = nn.Sigmoid()
+
+ def forward(self, input_features):
+ outputs = {}
+
+ # decoder
+ x = input_features[-1]
+ for i in range(4, -1, -1):
+ x = self.convs[("upconv", i, 0)](x)
+ x = [upsample(x)]
+ if self.use_skips and i > 0:
+ x += [input_features[i - 1]]
+ x = paddle.concat(x, 1)
+ x = self.convs[("upconv", i, 1)](x)
+ if i in self.scales:
+ outputs[("disp", i)] = self.sigmoid(self.convs[("dispconv",
+ i)](x))
+ return outputs
+
+
+class PoseDecoder(nn.Layer):
+ def __init__(self,
+ num_ch_enc,
+ num_input_features,
+ num_frames_to_predict_for=None,
+ stride=1):
+ super(PoseDecoder, self).__init__()
+
+ self.num_ch_enc = num_ch_enc
+ self.num_input_features = num_input_features
+
+ if num_frames_to_predict_for is None:
+ num_frames_to_predict_for = num_input_features - 1
+ self.num_frames_to_predict_for = num_frames_to_predict_for
+
+ self.convs = OrderedDict()
+ self.convs[("squeeze")] = nn.Conv2D(self.num_ch_enc[-1], 256, 1)
+ self.convs[("pose", 0)] = nn.Conv2D(num_input_features * 256, 256, 3,
+ stride, 1)
+ self.convs[("pose", 1)] = nn.Conv2D(256, 256, 3, stride, 1)
+ self.convs[("pose", 2)] = nn.Conv2D(256, 6 * num_frames_to_predict_for,
+ 1)
+
+ self.relu = nn.ReLU()
+
+ self.net = nn.LayerList(list(self.convs.values()))
+
+ def forward(self, input_features):
+ last_features = [f[-1] for f in input_features]
+
+ cat_features = [
+ self.relu(self.convs["squeeze"](f)) for f in last_features
+ ]
+ cat_features = paddle.concat(cat_features, 1)
+
+ out = cat_features
+ for i in range(3):
+ out = self.convs[("pose", i)](out)
+ if i != 2:
+ out = self.relu(out)
+
+ out = out.mean(3).mean(2)
+
+ out = 0.01 * out.reshape([-1, self.num_frames_to_predict_for, 1, 6])
+
+ axisangle = out[..., :3]
+ translation = out[..., 3:]
+
+ return axisangle, translation
+
+
+class ResnetEncoder(nn.Layer):
+    """Paddle module for a ResNet encoder.
+ """
+ def __init__(self, num_layers, pretrained=False, num_input_images=1):
+ super(ResnetEncoder, self).__init__()
+
+ self.num_ch_enc = np.array([64, 64, 128, 256, 512])
+
+ resnets = {
+ 18: paddle.vision.models.resnet18,
+ 34: paddle.vision.models.resnet34,
+ 50: paddle.vision.models.resnet50,
+ 101: paddle.vision.models.resnet101,
+ 152: paddle.vision.models.resnet152
+ }
+
+ if num_layers not in resnets:
+ raise ValueError(
+ "{} is not a valid number of resnet layers".format(num_layers))
+
+ if num_input_images > 1:
+ self.encoder = resnet_multiimage_input(num_layers, pretrained,
+ num_input_images)
+ else:
+ self.encoder = resnets[num_layers](pretrained)
+
+ if num_layers > 34:
+ self.num_ch_enc[1:] *= 4
+
+ ######################################
+        # first conv of the shared (public) branch, used for night input
+ ######################################
+ self.conv1 = nn.Conv2D(3,
+ 64,
+ kernel_size=7,
+ stride=2,
+ padding=3,
+ bias_attr=False)
+ self.bn1 = nn.BatchNorm2D(64)
+ self.relu = nn.ReLU() # NOTE
+
+ self.conv_shared = nn.Conv2D(512, 64, kernel_size=1)
+
+ ##########################################
+ # private source encoder, day
+ ##########################################
+ self.encoder_day = resnets[num_layers](pretrained)
+ self.conv_diff_day = nn.Conv2D(
+ 512, 64, kernel_size=1) # no bn after conv, so bias=true
+
+ ##########################################
+ # private target encoder, night
+ ##########################################
+ self.encoder_night = resnets[num_layers](pretrained)
+ self.conv_diff_night = nn.Conv2D(512, 64, kernel_size=1)
+
+ ######################################
+ # shared decoder (small decoder), use a simple de-conv to upsample the features with no skip connection
+ ######################################
+ self.convt5 = convt_bn_relu(in_channels=512,
+ out_channels=256,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ output_padding=1)
+ self.convt4 = convt_bn_relu(in_channels=256,
+ out_channels=128,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ output_padding=1)
+ self.convt3 = convt_bn_relu(in_channels=128,
+ out_channels=64,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ output_padding=1)
+ self.convt2 = convt_bn_relu(in_channels=64,
+ out_channels=64,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ output_padding=1)
+ self.convt1 = convt_bn_relu(in_channels=64,
+ out_channels=64,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ output_padding=1)
+ self.convtf = nn.Conv2D(64, 3, kernel_size=1, stride=1, padding=0)
+
+ def forward(self, input_image, is_night):
+ if self.training:
+ result = []
+ input_data = (input_image - 0.45) / 0.225
+ if is_night == 'day':
+ # source private encoder, day
+ private_feature = self.encoder_day.conv1(input_data)
+ private_feature = self.encoder_day.bn1(private_feature)
+ private_feature = self.encoder_day.relu(private_feature)
+ private_feature = self.encoder_day.maxpool(private_feature)
+ private_feature = self.encoder_day.layer1(private_feature)
+ private_feature = self.encoder_day.layer2(private_feature)
+ private_feature = self.encoder_day.layer3(private_feature)
+ private_feature = self.encoder_day.layer4(private_feature)
+ private_code = self.conv_diff_day(private_feature)
+ private_gram = gram_matrix(private_feature)
+ result.append(private_code)
+ result.append(private_gram)
+
+ elif is_night == 'night':
+ # target private encoder, night
+ private_feature = self.encoder_night.conv1(input_data)
+ private_feature = self.encoder_night.bn1(private_feature)
+ private_feature = self.encoder_night.relu(private_feature)
+ private_feature = self.encoder_night.maxpool(private_feature)
+ private_feature = self.encoder_night.layer1(private_feature)
+ private_feature = self.encoder_night.layer2(private_feature)
+ private_feature = self.encoder_night.layer3(private_feature)
+ private_feature = self.encoder_night.layer4(private_feature)
+ private_code = self.conv_diff_night(private_feature)
+
+ private_gram = gram_matrix(private_feature)
+ result.append(private_code)
+ result.append(private_gram)
+
+ # shared encoder
+ self.features = []
+ x = (input_image - 0.45) / 0.225
+ if is_night == 'day':
+ x = self.encoder.conv1(x)
+ x = self.encoder.bn1(x)
+ self.features.append(self.encoder.relu(x))
+ else:
+ x = self.conv1(x)
+ x = self.bn1(x)
+ self.features.append(self.relu(x))
+
+ self.features.append(
+ self.encoder.layer1(self.encoder.maxpool(self.features[-1])))
+ self.features.append(self.encoder.layer2(self.features[-1]))
+ self.features.append(self.encoder.layer3(self.features[-1]))
+ self.features.append(self.encoder.layer4(self.features[-1]))
+
+ if self.training:
+ shared_code = self.conv_shared(self.features[-1])
+ shared_gram = gram_matrix(self.features[-1])
+ result.append(shared_code) # use this to calculate loss of diff
+ result.append(shared_gram)
+ result.append(
+ self.features[-1]) # use this to calculate loss of similarity
+
+ union_code = private_feature + self.features[-1]
+ rec_code = self.convt5(union_code)
+ rec_code = self.convt4(rec_code)
+ rec_code = self.convt3(rec_code)
+ rec_code = self.convt2(rec_code)
+ rec_code = self.convt1(rec_code)
+ rec_code = self.convtf(rec_code)
+ result.append(rec_code)
+
+ return self.features, result
+ else:
+ return self.features
+
+
+class ResnetEncoder_pose(nn.Layer):
+    """Paddle module for a ResNet encoder.
+ """
+ def __init__(self, num_layers, pretrained=False, num_input_images=1):
+ super(ResnetEncoder_pose, self).__init__()
+
+ self.num_ch_enc = np.array([64, 64, 128, 256, 512])
+ resnets = {
+ 18: paddle.vision.models.resnet18,
+ 34: paddle.vision.models.resnet34,
+ 50: paddle.vision.models.resnet50,
+ 101: paddle.vision.models.resnet101,
+ 152: paddle.vision.models.resnet152
+ }
+
+ if num_layers not in resnets:
+ raise ValueError(
+ "{} is not a valid number of resnet layers".format(num_layers))
+
+ if num_input_images > 1:
+            self.encoder = resnet_multiimage_input(num_layers, pretrained,
+                                                   num_input_images)
+ else:
+ self.encoder = resnets[num_layers](pretrained)
+
+ if num_layers > 34:
+ self.num_ch_enc[1:] *= 4
+
+ def forward(self, input_image):
+ features = []
+ x = (input_image - 0.45) / 0.225
+ x = self.encoder.conv1(x)
+ x = self.encoder.bn1(x)
+ features.append(self.encoder.relu(x))
+ features.append(self.encoder.layer1(self.encoder.maxpool(features[-1])))
+ features.append(self.encoder.layer2(features[-1]))
+ features.append(self.encoder.layer3(features[-1]))
+ features.append(self.encoder.layer4(features[-1]))
+
+ return features
+
+
+@BACKBONES.register()
+class ADDS_DepthNet(nn.Layer):
+ def __init__(self,
+ num_layers=18,
+ frame_ids=[0, -1, 1],
+ height=256,
+ width=512,
+ batch_size=6,
+ pose_model_input="pairs",
+ use_stereo=False,
+ only_depth_encoder=False,
+ pretrained=None,
+ scales=[0, 1, 2, 3],
+ min_depth=0.1,
+ max_depth=100.0,
+ pose_model_type='separate_resnet',
+ v1_multiscale=False,
+ predictive_mask=False,
+ disable_automasking=False):
+ super(ADDS_DepthNet, self).__init__()
+ self.num_layers = num_layers
+ self.height = height
+ self.width = width
+ self.batch_size = batch_size
+ self.frame_ids = frame_ids
+ self.pose_model_input = pose_model_input
+ self.use_stereo = use_stereo
+ self.only_depth_encoder = only_depth_encoder
+ self.pretrained = pretrained
+ self.scales = scales
+ self.pose_model_type = pose_model_type
+ self.predictive_mask = predictive_mask
+ self.disable_automasking = disable_automasking
+ self.v1_multiscale = v1_multiscale
+ self.min_depth = min_depth
+ self.max_depth = max_depth
+
+ self.num_input_frames = len(self.frame_ids)
+ self.num_pose_frames = 2 if self.pose_model_input == "pairs" else self.num_input_frames
+
+ assert self.frame_ids[0] == 0, "frame_ids must start with 0"
+
+ self.use_pose_net = not (self.use_stereo and self.frame_ids == [0])
+
+ self.encoder = ResnetEncoder(self.num_layers)
+ if not self.only_depth_encoder:
+ self.depth = DepthDecoder(self.encoder.num_ch_enc, self.scales)
+ if self.use_pose_net and not self.only_depth_encoder:
+ if self.pose_model_type == "separate_resnet":
+ self.pose_encoder = ResnetEncoder_pose(
+ self.num_layers, num_input_images=self.num_pose_frames)
+ self.pose = PoseDecoder(self.pose_encoder.num_ch_enc,
+ num_input_features=1,
+ num_frames_to_predict_for=2)
+
+ self.backproject_depth = {}
+ self.project_3d = {}
+ for scale in self.scales:
+ h = self.height // (2**scale)
+ w = self.width // (2**scale)
+
+ self.backproject_depth[scale] = BackprojectDepth(
+ self.batch_size, h, w)
+ self.project_3d[scale] = Project3D(batch_size, h, w)
+
+ def init_weights(self):
+        """First, initialize the model's weights"""
+ for m in self.sublayers(include_self=True):
+ if isinstance(m, nn.Conv2D):
+ kaiming_normal_(m.weight, a=math.sqrt(5))
+ if m.bias is not None:
+ fan_in, _ = _calculate_fan_in_and_fan_out(m.weight)
+ bound = 1 / math.sqrt(fan_in)
+ uniform_ = paddle.nn.initializer.Uniform(-bound, bound)
+ uniform_(m.bias)
+        """Second, if a pretrained checkpoint is provided, load it"""
+ if self.pretrained: # load pretrained weights
+ load_ckpt(self, self.pretrained)
+
+ def forward(self, inputs, day_or_night='day'):
+ if self.training:
+ features, result = self.encoder(inputs["color_aug", 0, 0], 'day')
+ features_night, result_night = self.encoder(
+ inputs[("color_n_aug", 0, 0)], 'night')
+
+ outputs = self.depth(features)
+ outputs_night = self.depth(features_night)
+ if self.use_pose_net and not self.only_depth_encoder:
+ outputs.update(self.predict_poses(inputs, 'day'))
+ outputs_night.update(self.predict_poses(inputs, 'night'))
+
+ self.generate_images_pred(inputs, outputs, 'day')
+ self.generate_images_pred(inputs, outputs_night, 'night')
+
+ outputs['frame_ids'] = self.frame_ids
+ outputs['scales'] = self.scales
+ outputs['result'] = result
+ outputs['result_night'] = result_night
+ outputs_night['frame_ids'] = self.frame_ids
+ outputs_night['scales'] = self.scales
+ outputs['outputs_night'] = outputs_night
+ else:
+ if isinstance(inputs, dict):
+ input_color = inputs[("color", 0, 0)]
+ features = self.encoder(input_color, day_or_night[0])
+ outputs = self.depth(features)
+
+ pred_disp, _ = disp_to_depth(outputs[("disp", 0)],
+ self.min_depth, self.max_depth)
+
+ pred_disp = pred_disp[:, 0].numpy()
+
+ outputs['pred_disp'] = np.squeeze(pred_disp)
+
+ outputs['gt'] = np.squeeze(inputs['depth_gt'].numpy())
+ else:
+ input_color = inputs
+ features = self.encoder(input_color, day_or_night)
+ outputs = self.depth(features)
+
+ pred_disp, _ = disp_to_depth(outputs[("disp", 0)],
+ self.min_depth, self.max_depth)
+
+ pred_disp = pred_disp[:, 0]
+ outputs = paddle.squeeze(pred_disp)
+ return outputs
+
+ def predict_poses(self, inputs, is_night):
+ """Predict poses between input frames for monocular sequences.
+ """
+ outputs = {}
+ if self.num_pose_frames == 2:
+            if is_night == 'night':
+ pose_feats = {
+ f_i: inputs["color_n_aug", f_i, 0]
+ for f_i in self.frame_ids
+ }
+ else:
+ pose_feats = {
+ f_i: inputs["color_aug", f_i, 0]
+ for f_i in self.frame_ids
+ }
+
+ for f_i in self.frame_ids[1:]:
+ if f_i != "s":
+ if f_i < 0:
+ pose_inputs = [pose_feats[f_i], pose_feats[0]]
+ else:
+ pose_inputs = [pose_feats[0], pose_feats[f_i]]
+
+ if self.pose_model_type == "separate_resnet":
+ pose_inputs = [
+ self.pose_encoder(paddle.concat(pose_inputs,
+ axis=1))
+ ]
+
+ axisangle, translation = self.pose(pose_inputs)
+ outputs[("axisangle", 0, f_i)] = axisangle
+ outputs[("translation", 0, f_i)] = translation
+
+ # Invert the matrix if the frame id is negative
+ outputs[("cam_T_cam", 0,
+ f_i)] = transformation_from_parameters(
+ axisangle[:, 0],
+ translation[:, 0],
+ invert=(f_i < 0))
+ return outputs
+
+ def generate_images_pred(self, inputs, outputs, is_night):
+ """Generate the warped (reprojected) color images for a minibatch.
+ Generated images are saved into the `outputs` dictionary.
+ """
+ _, _, height, width = inputs['color', 0, 0].shape
+ for scale in self.scales:
+ disp = outputs[("disp", scale)]
+ if self.v1_multiscale:
+ source_scale = scale
+ else:
+ disp = F.interpolate(disp, [height, width],
+ mode="bilinear",
+ align_corners=False)
+ source_scale = 0
+
+ _, depth = disp_to_depth(disp, self.min_depth, self.max_depth)
+
+ outputs[("depth", 0, scale)] = depth
+ for i, frame_id in enumerate(self.frame_ids[1:]):
+
+ T = outputs[("cam_T_cam", 0, frame_id)]
+
+ cam_points = self.backproject_depth[source_scale](
+ depth, inputs[("inv_K", source_scale)])
+ pix_coords = self.project_3d[source_scale](
+ cam_points, inputs[("K", source_scale)], T)
+
+ outputs[("sample", frame_id, scale)] = pix_coords
+
+                if is_night == 'night':
+ inputs[("color_n", frame_id,
+ source_scale)].stop_gradient = False
+ outputs[("color", frame_id,
+ scale)] = paddle.nn.functional.grid_sample(
+ inputs[("color_n", frame_id, source_scale)],
+ outputs[("sample", frame_id, scale)],
+ padding_mode="border",
+ align_corners=False)
+
+ else:
+ inputs[("color", frame_id,
+ source_scale)].stop_gradient = False
+ outputs[("color", frame_id,
+ scale)] = paddle.nn.functional.grid_sample(
+ inputs[("color", frame_id, source_scale)],
+ outputs[("sample", frame_id, scale)],
+ padding_mode="border",
+ align_corners=False)
+
+ if not self.disable_automasking:
+                    if is_night == 'night':
+ outputs[("color_identity", frame_id, scale)] = \
+ inputs[("color_n", frame_id, source_scale)]
+ else:
+ outputs[("color_identity", frame_id, scale)] = \
+ inputs[("color", frame_id, source_scale)]
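+
+# A minimal inference sketch (shapes are assumptions, and the helpers referenced
+# above, e.g. disp_to_depth and upsample, are assumed to be defined earlier in
+# this module): in eval mode the backbone takes a plain image tensor and returns
+# the squeezed disparity map at the input resolution.
+#
+#   model = ADDS_DepthNet(num_layers=18, height=256, width=512, batch_size=1)
+#   model.eval()
+#   image = paddle.rand([1, 3, 256, 512])
+#   disp = model(image, day_or_night='day')   # shape [256, 512]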
diff --git a/paddlevideo/modeling/backbones/agcn.py b/paddlevideo/modeling/backbones/agcn.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f870c66b9bf0e14fbcf76c91ce14bd2e93e2685
--- /dev/null
+++ b/paddlevideo/modeling/backbones/agcn.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ..registry import BACKBONES
+
+
+class GCN(nn.Layer):
+ def __init__(self, in_channels, out_channels, vertex_nums=25, stride=1):
+ super(GCN, self).__init__()
+ self.conv1 = nn.Conv2D(in_channels=in_channels,
+ out_channels=3 * out_channels,
+ kernel_size=1,
+ stride=1)
+ self.conv2 = nn.Conv2D(in_channels=vertex_nums * 3,
+ out_channels=vertex_nums,
+ kernel_size=1)
+
+ def forward(self, x):
+ # x --- N,C,T,V
+ x = self.conv1(x) # N,3C,T,V
+ N, C, T, V = x.shape
+ x = paddle.reshape(x, [N, C // 3, 3, T, V]) # N,C,3,T,V
+ x = paddle.transpose(x, perm=[0, 1, 2, 4, 3]) # N,C,3,V,T
+ x = paddle.reshape(x, [N, C // 3, 3 * V, T]) # N,C,3V,T
+ x = paddle.transpose(x, perm=[0, 2, 1, 3]) # N,3V,C,T
+ x = self.conv2(x) # N,V,C,T
+ x = paddle.transpose(x, perm=[0, 2, 3, 1]) # N,C,T,V
+ return x
+
+
+class Block(paddle.nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ vertex_nums=25,
+ temporal_size=9,
+ stride=1,
+ residual=True):
+ super(Block, self).__init__()
+ self.residual = residual
+ self.out_channels = out_channels
+
+ self.bn_res = nn.BatchNorm2D(out_channels)
+ self.conv_res = nn.Conv2D(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ stride=(stride, 1))
+ self.gcn = GCN(in_channels=in_channels,
+ out_channels=out_channels,
+ vertex_nums=vertex_nums)
+ self.tcn = nn.Sequential(
+ nn.BatchNorm2D(out_channels),
+ nn.ReLU(),
+ nn.Conv2D(in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=(temporal_size, 1),
+ padding=((temporal_size - 1) // 2, 0),
+ stride=(stride, 1)),
+ nn.BatchNorm2D(out_channels),
+ )
+
+ def forward(self, x):
+ if self.residual:
+ y = self.conv_res(x)
+ y = self.bn_res(y)
+ x = self.gcn(x)
+ x = self.tcn(x)
+ out = x + y if self.residual else x
+ out = F.relu(out)
+ return out
+
+
+@BACKBONES.register()
+class AGCN(nn.Layer):
+ """
+ AGCN model improves the performance of ST-GCN using
+ Adaptive Graph Convolutional Networks.
+ Args:
+ in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 2.
+ """
+ def __init__(self, in_channels=2, **kwargs):
+ super(AGCN, self).__init__()
+
+ self.data_bn = nn.BatchNorm1D(25 * 2)
+ self.agcn = nn.Sequential(
+ Block(in_channels=in_channels,
+ out_channels=64,
+ residual=False,
+ **kwargs), Block(in_channels=64, out_channels=64, **kwargs),
+ Block(in_channels=64, out_channels=64, **kwargs),
+ Block(in_channels=64, out_channels=64, **kwargs),
+ Block(in_channels=64, out_channels=128, stride=2, **kwargs),
+ Block(in_channels=128, out_channels=128, **kwargs),
+ Block(in_channels=128, out_channels=128, **kwargs),
+ Block(in_channels=128, out_channels=256, stride=2, **kwargs),
+ Block(in_channels=256, out_channels=256, **kwargs),
+ Block(in_channels=256, out_channels=256, **kwargs))
+
+ self.pool = nn.AdaptiveAvgPool2D(output_size=(1, 1))
+
+ def forward(self, x):
+ # data normalization
+ N, C, T, V, M = x.shape
+
+ x = x.transpose((0, 4, 1, 2, 3)) # N, M, C, T, V
+ x = x.reshape((N * M, C, T, V))
+
+ x = self.agcn(x)
+
+ x = self.pool(x) # NM,C,T,V --> NM,C,1,1
+ C = x.shape[1]
+ x = paddle.reshape(x, (N, M, C, 1, 1)).mean(axis=1) # N,C,1,1
+
+ return x
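+
+# A minimal usage sketch (input shape is an assumption following the
+# N, C, T, V, M skeleton layout expected by forward()):
+#
+#   model = AGCN(in_channels=2)
+#   data = paddle.rand([4, 2, 350, 25, 1])   # N, C, T, V, M
+#   feature = model(data)                    # shape [4, 256, 1, 1]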
diff --git a/paddlevideo/modeling/backbones/asrf.py b/paddlevideo/modeling/backbones/asrf.py
new file mode 100644
index 0000000000000000000000000000000000000000..37437b3edc47b1fc745176fa3254865acfe5efb2
--- /dev/null
+++ b/paddlevideo/modeling/backbones/asrf.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# https://github.com/yabufarha/ms-tcn/blob/master/model.py
+# https://github.com/yiskw713/asrf/libs/models/tcn.py
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+import copy
+import random
+import math
+
+from paddle import ParamAttr
+from ..registry import BACKBONES
+from ..weight_init import weight_init_
+from .ms_tcn import DilatedResidualLayer
+from ..framework.segmenters.utils import init_bias, KaimingUniform_like_torch
+
+
+@BACKBONES.register()
+class ASRF(nn.Layer):
+
+ def __init__(self, in_channel, num_features, num_classes, num_stages,
+ num_layers):
+ super().__init__()
+ self.in_channel = in_channel
+ self.num_features = num_features
+ self.num_classes = num_classes
+ self.num_stages = num_stages
+ self.num_layers = num_layers
+
+ # define layers
+ self.conv_in = nn.Conv1D(self.in_channel, self.num_features, 1)
+
+ shared_layers = [
+ DilatedResidualLayer(2**i, self.num_features, self.num_features)
+ for i in range(self.num_layers)
+ ]
+ self.shared_layers = nn.LayerList(shared_layers)
+
+ self.init_weights()
+
+ def init_weights(self):
+ """
+ initialize model layers' weight
+ """
+ # init weight
+ for layer in self.sublayers():
+ if isinstance(layer, nn.Conv1D):
+ layer.weight.set_value(
+ KaimingUniform_like_torch(layer.weight).astype('float32'))
+ if layer.bias is not None:
+ layer.bias.set_value(
+ init_bias(layer.weight, layer.bias).astype('float32'))
+
+ def forward(self, x):
+ """ ASRF forward
+ """
+ out = self.conv_in(x)
+ for layer in self.shared_layers:
+ out = layer(out)
+ return out
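+
+# A minimal usage sketch (feature sizes are assumptions; DilatedResidualLayer
+# comes from the ms_tcn module imported above): the backbone maps frame-level
+# features of shape [N, in_channel, T] to [N, num_features, T].
+#
+#   backbone = ASRF(in_channel=2048, num_features=64, num_classes=19,
+#                   num_stages=4, num_layers=10)
+#   feats = paddle.rand([1, 2048, 300])
+#   out = backbone(feats)   # shape [1, 64, 300]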
diff --git a/paddlevideo/modeling/backbones/bmn.py b/paddlevideo/modeling/backbones/bmn.py
new file mode 100644
index 0000000000000000000000000000000000000000..200d1920a4afad43c25cb384ee0c6870022925f3
--- /dev/null
+++ b/paddlevideo/modeling/backbones/bmn.py
@@ -0,0 +1,290 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import numpy as np
+import paddle
+from paddle import ParamAttr
+from ..registry import BACKBONES
+
+
+def _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample,
+ num_sample_perbin):
+ """ generate sample mask for a boundary-matching pair """
+ plen = float(seg_xmax - seg_xmin)
+ plen_sample = plen / (num_sample * num_sample_perbin - 1.0)
+ total_samples = [
+ seg_xmin + plen_sample * ii
+ for ii in range(num_sample * num_sample_perbin)
+ ]
+ p_mask = []
+ for idx in range(num_sample):
+ bin_samples = total_samples[idx * num_sample_perbin:(idx + 1) *
+ num_sample_perbin]
+ bin_vector = np.zeros([tscale])
+ for sample in bin_samples:
+ sample_upper = math.ceil(sample)
+ sample_decimal, sample_down = math.modf(sample)
+ if (tscale - 1) >= int(sample_down) >= 0:
+ bin_vector[int(sample_down)] += 1 - sample_decimal
+ if (tscale - 1) >= int(sample_upper) >= 0:
+ bin_vector[int(sample_upper)] += sample_decimal
+ bin_vector = 1.0 / num_sample_perbin * bin_vector
+ p_mask.append(bin_vector)
+ p_mask = np.stack(p_mask, axis=1)
+ return p_mask
+
+
+def get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample,
+ num_sample_perbin):
+ """ generate sample mask for each point in Boundary-Matching Map """
+ mask_mat = []
+ for start_index in range(tscale):
+ mask_mat_vector = []
+ for duration_index in range(dscale):
+ if start_index + duration_index < tscale:
+ p_xmin = start_index
+ p_xmax = start_index + duration_index
+ center_len = float(p_xmax - p_xmin) + 1
+ sample_xmin = p_xmin - center_len * prop_boundary_ratio
+ sample_xmax = p_xmax + center_len * prop_boundary_ratio
+ p_mask = _get_interp1d_bin_mask(sample_xmin, sample_xmax,
+ tscale, num_sample,
+ num_sample_perbin)
+ else:
+ p_mask = np.zeros([tscale, num_sample])
+ mask_mat_vector.append(p_mask)
+ mask_mat_vector = np.stack(mask_mat_vector, axis=2)
+ mask_mat.append(mask_mat_vector)
+ mask_mat = np.stack(mask_mat, axis=3)
+ mask_mat = mask_mat.astype(np.float32)
+
+ sample_mask = np.reshape(mask_mat, [tscale, -1])
+ return sample_mask
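+
+# A small shape sketch (values follow the defaults documented in BMN below):
+# the returned mask has shape [tscale, num_sample * dscale * tscale] and is
+# matmul-ed with [N, C, tscale] features to sample every boundary-matching pair.
+#
+#   mask = get_interp1d_mask(tscale=100, dscale=100, prop_boundary_ratio=0.5,
+#                            num_sample=32, num_sample_perbin=3)
+#   # mask.shape == (100, 32 * 100 * 100)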
+
+
+def init_params(name, in_channels, kernel_size):
+ fan_in = in_channels * kernel_size * 1
+ k = 1. / math.sqrt(fan_in)
+ param_attr = ParamAttr(name=name,
+ initializer=paddle.nn.initializer.Uniform(low=-k,
+ high=k))
+ return param_attr
+
+
+@BACKBONES.register()
+class BMN(paddle.nn.Layer):
+ """BMN model from
+    """BMN model from the paper
+    "BMN: Boundary-Matching Network for Temporal Action Proposal Generation".
+ tscale (int): sequence length, default 100.
+ dscale (int): max duration length, default 100.
+ prop_boundary_ratio (float): ratio of expanded temporal region in proposal boundary, default 0.5.
+ num_sample (int): number of samples betweent starting boundary and ending boundary of each propoasl, default 32.
+        num_sample (int): number of samples between the starting and ending boundary of each proposal, default 32.
+ """
+
+ def __init__(
+ self,
+ tscale,
+ dscale,
+ prop_boundary_ratio,
+ num_sample,
+ num_sample_perbin,
+ feat_dim=400,
+ ):
+ super(BMN, self).__init__()
+
+ #init config
+ self.feat_dim = feat_dim
+ self.tscale = tscale
+ self.dscale = dscale
+ self.prop_boundary_ratio = prop_boundary_ratio
+ self.num_sample = num_sample
+ self.num_sample_perbin = num_sample_perbin
+
+ self.hidden_dim_1d = 256
+ self.hidden_dim_2d = 128
+ self.hidden_dim_3d = 512
+
+ # Base Module
+ self.b_conv1 = paddle.nn.Conv1D(
+ in_channels=self.feat_dim,
+ out_channels=self.hidden_dim_1d,
+ kernel_size=3,
+ padding=1,
+ groups=4,
+ weight_attr=init_params('Base_1_w', self.feat_dim, 3),
+ bias_attr=init_params('Base_1_b', self.feat_dim, 3))
+ self.b_conv1_act = paddle.nn.ReLU()
+
+ self.b_conv2 = paddle.nn.Conv1D(
+ in_channels=self.hidden_dim_1d,
+ out_channels=self.hidden_dim_1d,
+ kernel_size=3,
+ padding=1,
+ groups=4,
+ weight_attr=init_params('Base_2_w', self.hidden_dim_1d, 3),
+ bias_attr=init_params('Base_2_b', self.hidden_dim_1d, 3))
+ self.b_conv2_act = paddle.nn.ReLU()
+
+ # Temporal Evaluation Module
+ self.ts_conv1 = paddle.nn.Conv1D(
+ in_channels=self.hidden_dim_1d,
+ out_channels=self.hidden_dim_1d,
+ kernel_size=3,
+ padding=1,
+ groups=4,
+ weight_attr=init_params('TEM_s1_w', self.hidden_dim_1d, 3),
+ bias_attr=init_params('TEM_s1_b', self.hidden_dim_1d, 3))
+ self.ts_conv1_act = paddle.nn.ReLU()
+
+ self.ts_conv2 = paddle.nn.Conv1D(
+ in_channels=self.hidden_dim_1d,
+ out_channels=1,
+ kernel_size=1,
+ padding=0,
+ groups=1,
+ weight_attr=init_params('TEM_s2_w', self.hidden_dim_1d, 1),
+ bias_attr=init_params('TEM_s2_b', self.hidden_dim_1d, 1))
+ self.ts_conv2_act = paddle.nn.Sigmoid()
+
+ self.te_conv1 = paddle.nn.Conv1D(
+ in_channels=self.hidden_dim_1d,
+ out_channels=self.hidden_dim_1d,
+ kernel_size=3,
+ padding=1,
+ groups=4,
+ weight_attr=init_params('TEM_e1_w', self.hidden_dim_1d, 3),
+ bias_attr=init_params('TEM_e1_b', self.hidden_dim_1d, 3))
+ self.te_conv1_act = paddle.nn.ReLU()
+ self.te_conv2 = paddle.nn.Conv1D(
+ in_channels=self.hidden_dim_1d,
+ out_channels=1,
+ kernel_size=1,
+ padding=0,
+ groups=1,
+ weight_attr=init_params('TEM_e2_w', self.hidden_dim_1d, 1),
+ bias_attr=init_params('TEM_e2_b', self.hidden_dim_1d, 1))
+ self.te_conv2_act = paddle.nn.Sigmoid()
+
+ #Proposal Evaluation Module
+ self.p_conv1 = paddle.nn.Conv1D(
+ in_channels=self.hidden_dim_1d,
+ out_channels=self.hidden_dim_2d,
+ kernel_size=3,
+ padding=1,
+ groups=1,
+ weight_attr=init_params('PEM_1d_w', self.hidden_dim_1d, 3),
+ bias_attr=init_params('PEM_1d_b', self.hidden_dim_1d, 3))
+ self.p_conv1_act = paddle.nn.ReLU()
+
+ # init to speed up
+ sample_mask = get_interp1d_mask(self.tscale, self.dscale,
+ self.prop_boundary_ratio,
+ self.num_sample, self.num_sample_perbin)
+ self.sample_mask = paddle.to_tensor(sample_mask)
+ self.sample_mask.stop_gradient = True
+
+ self.p_conv3d1 = paddle.nn.Conv3D(
+ in_channels=128,
+ out_channels=self.hidden_dim_3d,
+ kernel_size=(self.num_sample, 1, 1),
+ stride=(self.num_sample, 1, 1),
+ padding=0,
+ weight_attr=ParamAttr(name="PEM_3d1_w"),
+ bias_attr=ParamAttr(name="PEM_3d1_b"))
+ self.p_conv3d1_act = paddle.nn.ReLU()
+
+ self.p_conv2d1 = paddle.nn.Conv2D(
+ in_channels=512,
+ out_channels=self.hidden_dim_2d,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ weight_attr=ParamAttr(name="PEM_2d1_w"),
+ bias_attr=ParamAttr(name="PEM_2d1_b"))
+ self.p_conv2d1_act = paddle.nn.ReLU()
+
+ self.p_conv2d2 = paddle.nn.Conv2D(
+ in_channels=128,
+ out_channels=self.hidden_dim_2d,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ weight_attr=ParamAttr(name="PEM_2d2_w"),
+ bias_attr=ParamAttr(name="PEM_2d2_b"))
+ self.p_conv2d2_act = paddle.nn.ReLU()
+
+ self.p_conv2d3 = paddle.nn.Conv2D(
+ in_channels=128,
+ out_channels=self.hidden_dim_2d,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ weight_attr=ParamAttr(name="PEM_2d3_w"),
+ bias_attr=ParamAttr(name="PEM_2d3_b"))
+ self.p_conv2d3_act = paddle.nn.ReLU()
+
+ self.p_conv2d4 = paddle.nn.Conv2D(
+ in_channels=128,
+ out_channels=2,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ weight_attr=ParamAttr(name="PEM_2d4_w"),
+ bias_attr=ParamAttr(name="PEM_2d4_b"))
+ self.p_conv2d4_act = paddle.nn.Sigmoid()
+
+ def init_weights(self):
+ pass
+
+ def forward(self, x):
+ #Base Module
+ x = self.b_conv1(x)
+ x = self.b_conv1_act(x)
+ x = self.b_conv2(x)
+ x = self.b_conv2_act(x)
+
+ #TEM
+ xs = self.ts_conv1(x)
+ xs = self.ts_conv1_act(xs)
+ xs = self.ts_conv2(xs)
+ xs = self.ts_conv2_act(xs)
+ xs = paddle.squeeze(xs, axis=[1])
+ xe = self.te_conv1(x)
+ xe = self.te_conv1_act(xe)
+ xe = self.te_conv2(xe)
+ xe = self.te_conv2_act(xe)
+ xe = paddle.squeeze(xe, axis=[1])
+
+ #PEM
+ xp = self.p_conv1(x)
+ xp = self.p_conv1_act(xp)
+ #BM layer
+ xp = paddle.matmul(xp, self.sample_mask)
+ xp = paddle.reshape(xp, shape=[0, 0, -1, self.dscale, self.tscale])
+
+ xp = self.p_conv3d1(xp)
+ xp = self.p_conv3d1_act(xp)
+ xp = paddle.squeeze(xp, axis=[2])
+ xp = self.p_conv2d1(xp)
+ xp = self.p_conv2d1_act(xp)
+ xp = self.p_conv2d2(xp)
+ xp = self.p_conv2d2_act(xp)
+ xp = self.p_conv2d3(xp)
+ xp = self.p_conv2d3_act(xp)
+ xp = self.p_conv2d4(xp)
+ xp = self.p_conv2d4_act(xp)
+ return xp, xs, xe
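+
+# A minimal usage sketch (batch size and feature length are assumptions; the
+# feature dimension matches the default feat_dim=400):
+#
+#   model = BMN(tscale=100, dscale=100, prop_boundary_ratio=0.5,
+#               num_sample=32, num_sample_perbin=3)
+#   video_feat = paddle.rand([4, 400, 100])   # N, feat_dim, tscale
+#   bm_map, start, end = model(video_feat)
+#   # bm_map: [4, 2, 100, 100]; start, end: [4, 100]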
diff --git a/paddlevideo/modeling/backbones/cfbi.py b/paddlevideo/modeling/backbones/cfbi.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fbf044b7fdf796ea5c91cbbf538dbb8d160a942
--- /dev/null
+++ b/paddlevideo/modeling/backbones/cfbi.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ..registry import BACKBONES
+from .deeplab import DeepLab
+
+
+class FPN(nn.Layer):
+ """FPN Layer"""
+ def __init__(self, in_dim_4x, in_dim_8x, in_dim_16x, out_dim):
+ super(FPN, self).__init__()
+ self.toplayer = self._make_layer(in_dim_16x, out_dim)
+ self.latlayer1 = self._make_layer(in_dim_8x, out_dim)
+ self.latlayer2 = self._make_layer(in_dim_4x, out_dim)
+
+ self.smooth1 = self._make_layer(out_dim,
+ out_dim,
+ kernel_size=3,
+ padding=1)
+ self.smooth2 = self._make_layer(out_dim,
+ out_dim,
+ kernel_size=3,
+ padding=1)
+
+ def _make_layer(self, in_dim, out_dim, kernel_size=1, padding=0):
+ return nn.Sequential(
+ nn.Conv2D(in_dim,
+ out_dim,
+ kernel_size=kernel_size,
+ stride=1,
+ padding=padding,
+ bias_attr=False),
+ nn.GroupNorm(num_groups=32, num_channels=out_dim))
+
+ def forward(self, x_4x, x_8x, x_16x):
+ """ forward function"""
+ x_16x = self.toplayer(x_16x)
+ x_8x = self.latlayer1(x_8x)
+ x_4x = self.latlayer2(x_4x)
+
+ x_8x = x_8x + F.interpolate(
+ x_16x, size=x_8x.shape[-2:], mode='bilinear', align_corners=True)
+ x_4x = x_4x + F.interpolate(
+ x_8x, size=x_4x.shape[-2:], mode='bilinear', align_corners=True)
+
+ x_8x = self.smooth1(x_8x)
+ x_4x = self.smooth2(x_4x)
+
+ return F.relu(x_4x), F.relu(x_8x), F.relu(x_16x)
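+
+    # A minimal usage sketch (channel and spatial sizes are assumptions that
+    # mirror how CFBI wires the FPN below): features at 1/4, 1/8 and 1/16
+    # resolution are fused top-down and returned with out_dim channels each.
+    #
+    #   fpn = FPN(in_dim_4x=256, in_dim_8x=512, in_dim_16x=256, out_dim=256)
+    #   x4 = paddle.rand([1, 256, 120, 120])
+    #   x8 = paddle.rand([1, 512, 60, 60])
+    #   x16 = paddle.rand([1, 256, 30, 30])
+    #   f4, f8, f16 = fpn(x4, x8, x16)   # all with 256 channels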
+
+
+@BACKBONES.register()
+class CFBI(nn.Layer):
+ """CFBI plus backbone"""
+ def __init__(self,
+ backbone='resnet',
+ freeze_bn=True,
+ model_aspp_outdim=256,
+ in_dim_8x=512,
+ model_semantic_embedding_dim=256): #,epsilon=1e-05):
+ super(CFBI, self).__init__()
+ #self.epsilon = epsilon
+ self.feature_extracter = DeepLab(backbone=backbone, freeze_bn=freeze_bn)
+ self.fpn = FPN(in_dim_4x=model_aspp_outdim,
+ in_dim_8x=in_dim_8x,
+ in_dim_16x=model_aspp_outdim,
+ out_dim=model_semantic_embedding_dim)
+
+ def forward(self, x):
+ """forward function"""
+ x, aspp_x, low_level, mid_level = self.feature_extracter(x, True)
+ x_4x, x_8x, x_16x = self.fpn(x, mid_level, aspp_x)
+ return x_4x, x_8x, x_16x, low_level
diff --git a/paddlevideo/modeling/backbones/ctrgcn.py b/paddlevideo/modeling/backbones/ctrgcn.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d645f4e98fb231eb09b2a3248884c4068acfd7f
--- /dev/null
+++ b/paddlevideo/modeling/backbones/ctrgcn.py
@@ -0,0 +1,514 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+from ..registry import BACKBONES
+from ..weight_init import weight_init_
+
+
+def conv_init(conv):
+ if conv.weight is not None:
+ weight_init_(conv.weight, 'kaiming_normal_', mode='fan_in')
+ if conv.bias is not None:
+ nn.initializer.Constant(value=0.0)(conv.bias)
+
+
+def bn_init(bn, scale):
+ nn.initializer.Constant(value=float(scale))(bn.weight)
+ nn.initializer.Constant(value=0.0)(bn.bias)
+
+
+def einsum(x1, x3):
+    """Equivalent of paddle.einsum('ncuv,nctv->nctu'), which is only
+    supported in dynamic graph mode.
+    x1 : n c u v
+    x3 : n c t v
+    """
+    n, c, u, v1 = x1.shape
+    n, c, t, v3 = x3.shape
+    assert v1 == v3, "The vertex dimensions of the einsum inputs do not match!"
+ x1 = paddle.transpose(x1, perm=[0, 1, 3, 2]) # n c v u
+ y = paddle.matmul(x3, x1)
+ # out: n c t u
+ return y
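+
+# A small shape sketch (sizes are illustrative): with x1 of shape [n, c, u, v]
+# and x3 of shape [n, c, t, v], the helper returns [n, c, t, u], matching
+# paddle.einsum('ncuv,nctv->nctu', x1, x3).
+#
+#   x1 = paddle.rand([1, 8, 25, 25])
+#   x3 = paddle.rand([1, 8, 64, 25])
+#   y = einsum(x1, x3)   # shape [1, 8, 64, 25]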
+
+
+class CTRGC(nn.Layer):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ rel_reduction=8,
+ mid_reduction=1):
+ super(CTRGC, self).__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ if in_channels == 3 or in_channels == 9:
+ self.rel_channels = 8
+ self.mid_channels = 16
+ else:
+ self.rel_channels = in_channels // rel_reduction
+ self.mid_channels = in_channels // mid_reduction
+ self.conv1 = nn.Conv2D(self.in_channels,
+ self.rel_channels,
+ kernel_size=1)
+ self.conv2 = nn.Conv2D(self.in_channels,
+ self.rel_channels,
+ kernel_size=1)
+ self.conv3 = nn.Conv2D(self.in_channels,
+ self.out_channels,
+ kernel_size=1)
+ self.conv4 = nn.Conv2D(self.rel_channels,
+ self.out_channels,
+ kernel_size=1)
+ self.tanh = nn.Tanh()
+
+ def init_weights(self):
+ """Initiate the parameters.
+ """
+ for m in self.sublayers():
+ if isinstance(m, nn.Conv2D):
+ conv_init(m)
+ elif isinstance(m, nn.BatchNorm2D):
+ bn_init(m, 1)
+
+ def forward(self, x, A=None, alpha=1):
+ x1, x2, x3 = self.conv1(x).mean(-2), self.conv2(x).mean(-2), self.conv3(
+ x)
+ x1 = self.tanh(x1.unsqueeze(-1) - x2.unsqueeze(-2))
+ x1 = self.conv4(x1) * alpha + (
+ A.unsqueeze(0).unsqueeze(0) if A is not None else 0) # N,C,V,V
+        # paddle.einsum() is only supported in dynamic graph mode, so the
+        # einsum() helper defined above is used here instead.
+ # x1 = paddle.einsum('ncuv,nctv->nctu', x1, x3)
+ x1 = einsum(x1, x3)
+ return x1
+
+
+class TemporalConv(nn.Layer):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ dilation=1):
+ super(TemporalConv, self).__init__()
+ pad = (kernel_size + (kernel_size - 1) * (dilation - 1) - 1) // 2
+ self.conv = nn.Conv2D(in_channels,
+ out_channels,
+ kernel_size=(kernel_size, 1),
+ padding=(pad, 0),
+ stride=(stride, 1),
+ dilation=(dilation, 1))
+
+ self.bn = nn.BatchNorm2D(out_channels)
+
+ def forward(self, x):
+ x = self.conv(x)
+ x = self.bn(x)
+ return x
+
+
+class MultiScale_TemporalConv(nn.Layer):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ dilations=[1, 2, 3, 4],
+ residual=True,
+ residual_kernel_size=1):
+
+ super(MultiScale_TemporalConv, self).__init__()
+ assert out_channels % (
+ len(dilations) +
+ 2) == 0, '# out channels should be multiples of # branches'
+
+ # Multiple branches of temporal convolution
+ self.num_branches = len(dilations) + 2
+ branch_channels = out_channels // self.num_branches
+ if type(kernel_size) == list:
+ assert len(kernel_size) == len(dilations)
+ else:
+ kernel_size = [kernel_size] * len(dilations)
+ # Temporal Convolution branches
+ self.branches = nn.LayerList([
+ nn.Sequential(
+ nn.Conv2D(in_channels,
+ branch_channels,
+ kernel_size=1,
+ padding=0),
+ nn.BatchNorm2D(branch_channels),
+ nn.ReLU(),
+ TemporalConv(branch_channels,
+ branch_channels,
+ kernel_size=ks,
+ stride=stride,
+ dilation=dilation),
+ ) for ks, dilation in zip(kernel_size, dilations)
+ ])
+
+ # Additional Max & 1x1 branch
+ self.branches.append(
+ nn.Sequential(
+ nn.Conv2D(in_channels,
+ branch_channels,
+ kernel_size=1,
+ padding=0), nn.BatchNorm2D(branch_channels),
+ nn.ReLU(),
+ nn.MaxPool2D(kernel_size=(3, 1),
+ stride=(stride, 1),
+ padding=(1, 0)), nn.BatchNorm2D(branch_channels)))
+
+ self.branches.append(
+ nn.Sequential(
+ nn.Conv2D(in_channels,
+ branch_channels,
+ kernel_size=1,
+ padding=0,
+ stride=(stride, 1)), nn.BatchNorm2D(branch_channels)))
+
+ # Residual connection
+ if not residual:
+ self.residual = lambda x: 0
+ elif (in_channels == out_channels) and (stride == 1):
+ self.residual = lambda x: x
+ else:
+ self.residual = TemporalConv(in_channels,
+ out_channels,
+ kernel_size=residual_kernel_size,
+ stride=stride)
+
+ def init_weights(self):
+ """Initiate the parameters.
+ """
+ # initialize
+ for m in self.sublayers():
+ if isinstance(m, nn.Conv2D):
+ conv_init(m)
+ elif isinstance(m, nn.BatchNorm2D):
+ weight_init_(m.weight, 'Normal', std=0.02, mean=1.0)
+ nn.initializer.Constant(value=0.0)(m.bias)
+
+ def forward(self, x):
+ # Input dim: (N,C,T,V)
+ res = self.residual(x)
+ branch_outs = []
+ for tempconv in self.branches:
+ out = tempconv(x)
+ branch_outs.append(out)
+
+ out = paddle.concat(branch_outs, axis=1)
+ out += res
+ return out
+
+
+class unit_tcn(nn.Layer):
+
+ def __init__(self, in_channels, out_channels, kernel_size=9, stride=1):
+ super(unit_tcn, self).__init__()
+ pad = int((kernel_size - 1) / 2)
+ self.conv = nn.Conv2D(in_channels,
+ out_channels,
+ kernel_size=(kernel_size, 1),
+ padding=(pad, 0),
+ stride=(stride, 1))
+
+ self.bn = nn.BatchNorm2D(out_channels)
+ self.relu = nn.ReLU()
+ conv_init(self.conv)
+ bn_init(self.bn, 1)
+
+ def forward(self, x):
+ x = self.bn(self.conv(x))
+ return x
+
+
+class unit_gcn(nn.Layer):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ A,
+ coff_embedding=4,
+ adaptive=True,
+ residual=True):
+ super(unit_gcn, self).__init__()
+ inter_channels = out_channels // coff_embedding
+ self.inter_c = inter_channels
+ self.out_c = out_channels
+ self.in_c = in_channels
+ self.adaptive = adaptive
+ self.num_subset = A.shape[0]
+ self.convs = nn.LayerList()
+
+ for i in range(self.num_subset):
+ self.convs.append(CTRGC(in_channels, out_channels))
+
+ if residual:
+ if in_channels != out_channels:
+ self.down = nn.Sequential(
+ nn.Conv2D(in_channels, out_channels, 1),
+ nn.BatchNorm2D(out_channels))
+ else:
+ self.down = lambda x: x
+ else:
+ self.down = lambda x: 0
+ if self.adaptive:
+ pa_param = paddle.ParamAttr(
+ initializer=paddle.nn.initializer.Assign(A.astype(np.float32)))
+ self.PA = paddle.create_parameter(shape=A.shape,
+ dtype='float32',
+ attr=pa_param)
+ else:
+ A_tensor = paddle.to_tensor(A, dtype="float32")
+ self.A = paddle.create_parameter(
+ shape=A_tensor.shape,
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Assign(A_tensor))
+ self.A.stop_gradient = True
+ alpha_tensor = paddle.to_tensor(np.zeros(1), dtype="float32")
+ self.alpha = paddle.create_parameter(
+ shape=alpha_tensor.shape,
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Assign(alpha_tensor))
+ self.bn = nn.BatchNorm2D(out_channels)
+ self.soft = nn.Softmax(-2)
+ self.relu = nn.ReLU()
+
+ def init_weights(self):
+ for m in self.sublayers():
+ if isinstance(m, nn.Conv2D):
+ conv_init(m)
+ elif isinstance(m, nn.BatchNorm2D):
+ bn_init(m, 1)
+ bn_init(self.bn, 1e-6)
+
+ def forward(self, x):
+ y = None
+ if self.adaptive:
+ A = self.PA
+ else:
+            A = self.A  # fixed (non-adaptive) adjacency matrix
+ for i in range(self.num_subset):
+ z = self.convs[i](x, A[i], self.alpha)
+ y = z + y if y is not None else z
+ y = self.bn(y)
+ y += self.down(x)
+ y = self.relu(y)
+ return y
+
+
+class TCN_GCN_unit(nn.Layer):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ A,
+ stride=1,
+ residual=True,
+ adaptive=True,
+ kernel_size=5,
+ dilations=[1, 2]):
+ super(TCN_GCN_unit, self).__init__()
+ self.gcn1 = unit_gcn(in_channels, out_channels, A, adaptive=adaptive)
+ self.tcn1 = MultiScale_TemporalConv(out_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ dilations=dilations,
+ residual=False)
+ self.relu = nn.ReLU()
+ if not residual:
+ self.residual = lambda x: 0
+
+ elif (in_channels == out_channels) and (stride == 1):
+ self.residual = lambda x: x
+
+ else:
+ self.residual = unit_tcn(in_channels,
+ out_channels,
+ kernel_size=1,
+ stride=stride)
+
+ def forward(self, x):
+ y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x))
+ return y
+
+
+class NTUDGraph:
+
+ def __init__(self, labeling_mode='spatial'):
+ num_node = 25
+ self_link = [(i, i) for i in range(num_node)]
+ inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5),
+ (7, 6), (8, 7), (9, 21), (10, 9), (11, 10),
+ (12, 11), (13, 1), (14, 13), (15, 14), (16, 15),
+ (17, 1), (18, 17), (19, 18), (20, 19), (22, 23),
+ (23, 8), (24, 25), (25, 12)]
+ inward = [(i - 1, j - 1) for (i, j) in inward_ori_index]
+ outward = [(j, i) for (i, j) in inward]
+ neighbor = inward + outward
+
+ self.num_node = num_node
+ self.self_link = self_link
+ self.inward = inward
+ self.outward = outward
+ self.neighbor = neighbor
+ self.A = self.get_adjacency_matrix(labeling_mode)
+
+ def edge2mat(self, link, num_node):
+ A = np.zeros((num_node, num_node))
+ for i, j in link:
+ A[j, i] = 1
+ return A
+
+ def normalize_digraph(self, A):
+ Dl = np.sum(A, 0)
+ h, w = A.shape
+ Dn = np.zeros((w, w))
+ for i in range(w):
+ if Dl[i] > 0:
+ Dn[i, i] = Dl[i]**(-1)
+ AD = np.dot(A, Dn)
+ return AD
+
+ def get_spatial_graph(self, num_node, self_link, inward, outward):
+ I = self.edge2mat(self_link, num_node)
+ In = self.normalize_digraph(self.edge2mat(inward, num_node))
+ Out = self.normalize_digraph(self.edge2mat(outward, num_node))
+ A = np.stack((I, In, Out))
+ return A
+
+ def get_adjacency_matrix(self, labeling_mode=None):
+ if labeling_mode is None:
+ return self.A
+ if labeling_mode == 'spatial':
+ A = self.get_spatial_graph(self.num_node, self.self_link,
+ self.inward, self.outward)
+ else:
+ raise ValueError()
+ return A
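+
+    # A small usage sketch: the spatial labeling mode yields three [25, 25]
+    # adjacency matrices (identity, normalized inward, normalized outward).
+    #
+    #   graph = NTUDGraph(labeling_mode='spatial')
+    #   print(graph.A.shape)   # (3, 25, 25)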
+
+
+@BACKBONES.register()
+class CTRGCN(nn.Layer):
+ """
+    CTR-GCN model from the paper
+    "Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition".
+    Args:
+        num_point: int, number of skeleton joints.
+        num_person: int, number of persons.
+        base_channel: int, the model's hidden dim.
+        graph: str, name of the skeleton adjacency graph.
+        graph_args: dict, args for the skeleton adjacency graph class.
+        in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 3.
+        adaptive: bool, whether the adjacency matrix is adaptive.
+ """
+
+ def __init__(self,
+ num_point=25,
+ num_person=2,
+ base_channel=64,
+ graph='ntu_rgb_d',
+ graph_args=dict(),
+ in_channels=3,
+ adaptive=True):
+ super(CTRGCN, self).__init__()
+
+ if graph == 'ntu_rgb_d':
+ self.graph = NTUDGraph(**graph_args)
+ else:
+ raise ValueError()
+
+ A = self.graph.A # 3,25,25
+
+ self.num_point = num_point
+ self.data_bn = nn.BatchNorm1D(num_person * in_channels * num_point)
+ self.base_channel = base_channel
+
+ self.l1 = TCN_GCN_unit(in_channels,
+ self.base_channel,
+ A,
+ residual=False,
+ adaptive=adaptive)
+ self.l2 = TCN_GCN_unit(self.base_channel,
+ self.base_channel,
+ A,
+ adaptive=adaptive)
+ self.l3 = TCN_GCN_unit(self.base_channel,
+ self.base_channel,
+ A,
+ adaptive=adaptive)
+ self.l4 = TCN_GCN_unit(self.base_channel,
+ self.base_channel,
+ A,
+ adaptive=adaptive)
+ self.l5 = TCN_GCN_unit(self.base_channel,
+ self.base_channel * 2,
+ A,
+ stride=2,
+ adaptive=adaptive)
+ self.l6 = TCN_GCN_unit(self.base_channel * 2,
+ self.base_channel * 2,
+ A,
+ adaptive=adaptive)
+ self.l7 = TCN_GCN_unit(self.base_channel * 2,
+ self.base_channel * 2,
+ A,
+ adaptive=adaptive)
+ self.l8 = TCN_GCN_unit(self.base_channel * 2,
+ self.base_channel * 4,
+ A,
+ stride=2,
+ adaptive=adaptive)
+ self.l9 = TCN_GCN_unit(self.base_channel * 4,
+ self.base_channel * 4,
+ A,
+ adaptive=adaptive)
+ self.l10 = TCN_GCN_unit(self.base_channel * 4,
+ self.base_channel * 4,
+ A,
+ adaptive=adaptive)
+
+ def init_weights(self):
+ bn_init(self.data_bn, 1)
+
+ def forward(self, x):
+ N, C, T, V, M = x.shape
+ x = paddle.transpose(x, perm=[0, 4, 3, 1, 2])
+ x = paddle.reshape(x, (N, M * V * C, T))
+
+ x = self.data_bn(x)
+
+ x = paddle.reshape(x, (N, M, V, C, T))
+ x = paddle.transpose(x, perm=(0, 1, 3, 4, 2))
+
+ x = paddle.reshape(x, (N * M, C, T, V))
+
+ x = self.l1(x)
+ x = self.l2(x)
+ x = self.l3(x)
+ x = self.l4(x)
+ x = self.l5(x)
+ x = self.l6(x)
+ x = self.l7(x)
+ x = self.l8(x)
+ x = self.l9(x)
+ x = self.l10(x)
+
+ return x, N, M
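+
+# A minimal usage sketch (input shape is an assumption following the
+# N, C, T, V, M layout expected by forward()):
+#
+#   model = CTRGCN(num_point=25, num_person=2, in_channels=3)
+#   data = paddle.rand([2, 3, 64, 25, 2])   # N, C, T, V, M
+#   feat, N, M = model(data)                # feat: [N * M, 256, 16, 25]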
diff --git a/paddlevideo/modeling/backbones/deeplab.py b/paddlevideo/modeling/backbones/deeplab.py
new file mode 100644
index 0000000000000000000000000000000000000000..c566205ac8bf12b35b5bbf0c4f77e13fbe4f4097
--- /dev/null
+++ b/paddlevideo/modeling/backbones/deeplab.py
@@ -0,0 +1,454 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import copy
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ..registry import BACKBONES
+
+
+class FrozenBatchNorm2D(nn.Layer):
+ """
+ BatchNorm2D where the batch statistics and the affine parameters
+ are fixed
+ """
+ def __init__(self, n, epsilon=1e-5):
+ super(FrozenBatchNorm2D, self).__init__()
+ x1 = paddle.ones([n])
+ x2 = paddle.zeros([n])
+ weight = self.create_parameter(
+ shape=x1.shape, default_initializer=nn.initializer.Assign(x1))
+ bias = self.create_parameter(
+ shape=x2.shape, default_initializer=nn.initializer.Assign(x2))
+ running_mean = self.create_parameter(
+ shape=x2.shape, default_initializer=nn.initializer.Assign(x2))
+ running_var = self.create_parameter(
+ shape=x1.shape, default_initializer=nn.initializer.Assign(x1))
+ self.add_parameter('weight', weight)
+ self.add_parameter('bias', bias)
+ self.add_parameter('running_mean', running_mean)
+ self.add_parameter('running_var', running_var)
+ self.epsilon = epsilon
+
+ def forward(self, x):
+ scale = self.weight * paddle.rsqrt((self.running_var + self.epsilon))
+ bias = self.bias - self.running_mean * scale
+ scale = paddle.reshape(scale, [1, -1, 1, 1])
+ bias = paddle.reshape(bias, [1, -1, 1, 1])
+ return x * scale + bias
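+
+    # A minimal usage sketch (shapes are assumptions): the layer applies the
+    # stored statistics and affine parameters without updating them.
+    #
+    #   frozen_bn = FrozenBatchNorm2D(64)
+    #   y = frozen_bn(paddle.rand([2, 64, 32, 32]))   # same shape as input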
+
+
+class Bottleneck(nn.Layer):
+ expansion = 4
+
+ def __init__(self,
+ inplanes,
+ planes,
+ stride=1,
+ dilation=1,
+ downsample=None,
+ BatchNorm=None):
+ super(Bottleneck, self).__init__()
+ self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)
+ self.bn1 = BatchNorm(planes)
+ self.conv2 = nn.Conv2D(planes,
+ planes,
+ kernel_size=3,
+ stride=stride,
+ dilation=dilation,
+ padding=dilation,
+ bias_attr=False)
+ self.bn2 = BatchNorm(planes)
+ self.conv3 = nn.Conv2D(planes,
+ planes * 4,
+ kernel_size=1,
+ bias_attr=False)
+ self.bn3 = BatchNorm(planes * 4)
+ self.relu = nn.ReLU()
+ self.downsample = downsample
+ self.stride = stride
+ self.dilation = dilation
+
+ def forward(self, x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class ResNet(nn.Layer):
+ def __init__(self,
+ block,
+ layers,
+ output_stride,
+ BatchNorm,
+ pretrained=False):
+ self.inplanes = 64
+ super(ResNet, self).__init__()
+ blocks = [1, 2, 4]
+ if output_stride == 16:
+ strides = [1, 2, 2, 1]
+ dilations = [1, 1, 1, 2]
+ elif output_stride == 8:
+ strides = [1, 2, 1, 1]
+ dilations = [1, 1, 2, 4]
+ else:
+ raise NotImplementedError
+
+ # Modules
+ self.conv1 = nn.Conv2D(3,
+ 64,
+ kernel_size=7,
+ stride=2,
+ padding=3,
+ bias_attr=False)
+ self.bn1 = BatchNorm(64)
+ self.relu = nn.ReLU()
+ self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
+
+ self.layer1 = self._make_layer(block,
+ 64,
+ layers[0],
+ stride=strides[0],
+ dilation=dilations[0],
+ BatchNorm=BatchNorm)
+ self.layer2 = self._make_layer(block,
+ 128,
+ layers[1],
+ stride=strides[1],
+ dilation=dilations[1],
+ BatchNorm=BatchNorm)
+ self.layer3 = self._make_layer(block,
+ 256,
+ layers[2],
+ stride=strides[2],
+ dilation=dilations[2],
+ BatchNorm=BatchNorm)
+ self.layer4 = self._make_MG_unit(block,
+ 512,
+ blocks=blocks,
+ stride=strides[3],
+ dilation=dilations[3],
+ BatchNorm=BatchNorm)
+ self._init_weight()
+
+ def _make_layer(self,
+ block,
+ planes,
+ blocks,
+ stride=1,
+ dilation=1,
+ BatchNorm=None):
+ downsample = None
+ if stride != 1 or self.inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2D(self.inplanes,
+ planes * block.expansion,
+ kernel_size=1,
+ stride=stride,
+ bias_attr=False),
+ BatchNorm(planes * block.expansion),
+ )
+
+ layers = []
+ layers.append(
+ block(self.inplanes, planes, stride, dilation, downsample,
+ BatchNorm))
+ self.inplanes = planes * block.expansion
+ for i in range(1, blocks):
+ layers.append(
+ block(self.inplanes,
+ planes,
+ dilation=dilation,
+ BatchNorm=BatchNorm))
+
+ return nn.Sequential(*layers)
+
+ def _make_MG_unit(self,
+ block,
+ planes,
+ blocks,
+ stride=1,
+ dilation=1,
+ BatchNorm=None):
+ downsample = None
+ if stride != 1 or self.inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2D(self.inplanes,
+ planes * block.expansion,
+ kernel_size=1,
+ stride=stride,
+ bias_attr=False),
+ BatchNorm(planes * block.expansion),
+ )
+
+ layers = []
+ layers.append(
+ block(self.inplanes,
+ planes,
+ stride,
+ dilation=blocks[0] * dilation,
+ downsample=downsample,
+ BatchNorm=BatchNorm))
+ self.inplanes = planes * block.expansion
+ for i in range(1, len(blocks)):
+ layers.append(
+ block(self.inplanes,
+ planes,
+ stride=1,
+ dilation=blocks[i] * dilation,
+ BatchNorm=BatchNorm))
+
+ return nn.Sequential(*layers)
+
+ def forward(self, input, return_mid_level=False):
+ x = self.conv1(input)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.maxpool(x)
+
+ x = self.layer1(x)
+ low_level_feat = x
+ x = self.layer2(x)
+ mid_level_feat = x
+ x = self.layer3(x)
+ x = self.layer4(x)
+ if return_mid_level:
+ return x, low_level_feat, mid_level_feat
+ else:
+ return x, low_level_feat
+
+    def _init_weight(self):
+        for m in self.sublayers():
+            if isinstance(m, nn.Conv2D):
+                # Apply the initializer to the parameter instead of only constructing it.
+                nn.initializer.KaimingNormal()(m.weight)
+            elif isinstance(m, nn.GroupNorm):
+                nn.initializer.Constant(1.0)(m.weight)
+                nn.initializer.Constant(0.0)(m.bias)
+
+
+class _ASPPModule(nn.Layer):
+ def __init__(self, inplanes, planes, kernel_size, padding, dilation,
+ BatchNorm):
+ super(_ASPPModule, self).__init__()
+ self.atrous_conv = nn.Conv2D(inplanes,
+ planes,
+ kernel_size=kernel_size,
+ stride=1,
+ padding=padding,
+ dilation=dilation,
+ bias_attr=False)
+ self.bn = BatchNorm(planes)
+ self.relu = nn.ReLU()
+
+ self._init_weight()
+
+ def forward(self, x):
+ x = self.atrous_conv(x)
+ x = self.bn(x)
+
+ return self.relu(x)
+
+    def _init_weight(self):
+        for m in self.sublayers():
+            if isinstance(m, nn.Conv2D):
+                nn.initializer.KaimingNormal()(m.weight)
+            elif isinstance(m, nn.BatchNorm2D):
+                nn.initializer.Constant(1.0)(m.weight)
+                nn.initializer.Constant(0.0)(m.bias)
+
+
+class ASPP(nn.Layer):
+ def __init__(self, backbone, output_stride, BatchNorm):
+ super(ASPP, self).__init__()
+ if backbone == 'drn':
+ inplanes = 512
+ elif backbone == 'mobilenet':
+ inplanes = 320
+ else:
+ inplanes = 2048
+ if output_stride == 16:
+ dilations = [1, 6, 12, 18]
+ elif output_stride == 8:
+ dilations = [1, 12, 24, 36]
+ else:
+ raise NotImplementedError
+
+ self.aspp1 = _ASPPModule(inplanes,
+ 256,
+ 1,
+ padding=0,
+ dilation=dilations[0],
+ BatchNorm=BatchNorm)
+ self.aspp2 = _ASPPModule(inplanes,
+ 256,
+ 3,
+ padding=dilations[1],
+ dilation=dilations[1],
+ BatchNorm=BatchNorm)
+ self.aspp3 = _ASPPModule(inplanes,
+ 256,
+ 3,
+ padding=dilations[2],
+ dilation=dilations[2],
+ BatchNorm=BatchNorm)
+ self.aspp4 = _ASPPModule(inplanes,
+ 256,
+ 3,
+ padding=dilations[3],
+ dilation=dilations[3],
+ BatchNorm=BatchNorm)
+
+ self.global_avg_pool = nn.Sequential(
+ nn.AdaptiveAvgPool2D((1, 1)),
+ nn.Conv2D(inplanes, 256, 1, stride=1, bias_attr=False),
+ BatchNorm(256), nn.ReLU())
+ self.conv1 = nn.Conv2D(1280, 256, 1, bias_attr=False)
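+        # 1280 = 5 branches x 256 channels (aspp1-4 plus global pooling), concatenated in forward().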
+ self.bn1 = BatchNorm(256)
+ self.relu = nn.ReLU()
+ self.dropout = nn.Dropout(0.1)
+ self._init_weight()
+
+ def forward(self, x):
+ x1 = self.aspp1(x)
+ x2 = self.aspp2(x)
+ x3 = self.aspp3(x)
+ x4 = self.aspp4(x)
+ x5 = self.global_avg_pool(x)
+ x5 = F.interpolate(x5,
+ size=x4.shape[2:],
+ mode='bilinear',
+ align_corners=True)
+ x = paddle.concat(x=[x1, x2, x3, x4, x5], axis=1)
+
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+
+ return self.dropout(x)
+
+    def _init_weight(self):
+        for m in self.sublayers():
+            if isinstance(m, nn.Conv2D):
+                nn.initializer.KaimingNormal()(m.weight)
+            elif isinstance(m, nn.GroupNorm):
+                nn.initializer.Constant(1.0)(m.weight)
+                nn.initializer.Constant(0.0)(m.bias)
+
+
+class Decoder(nn.Layer):
+ def __init__(self, backbone, BatchNorm):
+ super(Decoder, self).__init__()
+ if backbone == 'resnet':
+ low_level_inplanes = 256
+ elif backbone == 'mobilenet':
+ raise NotImplementedError
+ else:
+ raise NotImplementedError
+
+ self.conv1 = nn.Conv2D(low_level_inplanes, 48, 1, bias_attr=False)
+ self.bn1 = BatchNorm(48)
+ self.relu = nn.ReLU()
+
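+        # The empty nn.Sequential() entries in last_conv below are no-op placeholders (identity).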
+ self.last_conv = nn.Sequential(
+ nn.Conv2D(304,
+ 256,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias_attr=False), BatchNorm(256), nn.ReLU(),
+ nn.Sequential(),
+ nn.Conv2D(256,
+ 256,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias_attr=False), BatchNorm(256), nn.ReLU(),
+ nn.Sequential())
+
+ self._init_weight()
+
+ def forward(self, x, low_level_feat):
+ low_level_feat = self.conv1(low_level_feat)
+ low_level_feat = self.bn1(low_level_feat)
+ low_level_feat = self.relu(low_level_feat)
+
+ x = F.interpolate(x,
+ size=low_level_feat.shape[2:],
+ mode='bilinear',
+ align_corners=True)
+ x = paddle.concat(x=[x, low_level_feat], axis=1)
+ x = self.last_conv(x)
+
+ return x
+
+    def _init_weight(self):
+        for m in self.sublayers():
+            if isinstance(m, nn.Conv2D):
+                nn.initializer.KaimingNormal()(m.weight)
+            elif isinstance(m, nn.GroupNorm):
+                nn.initializer.Constant(1.0)(m.weight)
+                nn.initializer.Constant(0.0)(m.bias)
+
+
+class DeepLab(nn.Layer):
+ """DeepLab model for segmentation"""
+ def __init__(self, backbone='resnet', output_stride=16, freeze_bn=True):
+ super(DeepLab, self).__init__()
+
+        if freeze_bn:
+ print("Use frozen BN in DeepLab!")
+ BatchNorm = FrozenBatchNorm2D
+ else:
+ BatchNorm = nn.BatchNorm2D
+
+ self.backbone = ResNet(Bottleneck, [3, 4, 23, 3],
+ output_stride,
+ BatchNorm,
+ pretrained=True)
+ self.aspp = ASPP(backbone, output_stride, BatchNorm)
+ self.decoder = Decoder(backbone, BatchNorm)
+
+ def forward(self, input, return_aspp=False):
+ """forward function"""
+ if return_aspp:
+ x, low_level_feat, mid_level_feat = self.backbone(input, True)
+ else:
+ x, low_level_feat = self.backbone(input)
+ aspp_x = self.aspp(x)
+ x = self.decoder(aspp_x, low_level_feat)
+
+ if return_aspp:
+ return x, aspp_x, low_level_feat, mid_level_feat
+ else:
+ return x, low_level_feat
diff --git a/paddlevideo/modeling/backbones/movinet.py b/paddlevideo/modeling/backbones/movinet.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb6d4fddfa997d7cdc0095c04322f51656323c72
--- /dev/null
+++ b/paddlevideo/modeling/backbones/movinet.py
@@ -0,0 +1,574 @@
+import collections.abc
+from itertools import repeat
+from typing import Any, Callable, Optional, Tuple, Union
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.layer import Identity
+
+from ..registry import BACKBONES
+from collections import OrderedDict
+
+container_abcs = collections.abc
+"""Model Config
+"""
+
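+# Each 'b{stage}_l{layer}' entry follows BasicBneck's signature:
+# [in_channels, out_channels, expanded_channels, kernel_size, stride, padding, padding_avg];
+# 'conv1'/'conv7' entries follow [in_channels, out_channels, kernel_size, stride, padding].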
+A0 = {'block_num': [0, 1, 3, 3, 4, 4]}
+A0['conv1'] = [3, 8, (1, 3, 3), (1, 2, 2), (0, 1, 1)]
+A0['b2_l0'] = [8, 8, 24, (1, 5, 5), (1, 2, 2), (0, 2, 2), (0, 1, 1)]
+A0['b3_l0'] = [8, 32, 80, (3, 3, 3), (1, 2, 2), (1, 0, 0), (0, 0, 0)]
+A0['b3_l1'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
+A0['b3_l2'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
+A0['b4_l0'] = [32, 56, 184, (5, 3, 3), (1, 2, 2), (2, 0, 0), (0, 0, 0)]
+A0['b4_l1'] = [56, 56, 112, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
+A0['b4_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
+A0['b5_l0'] = [56, 56, 184, (5, 3, 3), (1, 1, 1), (2, 1, 1), (0, 1, 1)]
+A0['b5_l1'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
+A0['b5_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
+A0['b5_l3'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
+A0['b6_l0'] = [56, 104, 384, (5, 3, 3), (1, 2, 2), (2, 1, 1), (0, 1, 1)]
+A0['b6_l1'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)]
+A0['b6_l2'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)]
+A0['b6_l3'] = [104, 104, 344, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)]
+A0['conv7'] = [104, 480, (1, 1, 1), (1, 1, 1), (0, 0, 0)]
+
+MODEL_CONFIG = {'A0': A0}
+
+
+def _ntuple(n):
+ def parse(x):
+ if isinstance(x, container_abcs.Iterable):
+ return x
+ return tuple(repeat(x, n))
+
+ return parse
+
+
+def _make_divisible(v: float,
+ divisor: int,
+ min_value: Optional[int] = None) -> int:
+ """
+ This function is taken from the original tf repo.
+ It ensures that all layers have a channel number that is divisible by 8.
+ It can be seen here:
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+ """
+ if min_value is None:
+ min_value = divisor
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+ # Make sure that round down does not go down by more than 10%.
+ if new_v < 0.9 * v:
+ new_v += divisor
+ return new_v
+
+
+_single = _ntuple(1)
+_pair = _ntuple(2)
+_triple = _ntuple(3)
+_quadruple = _ntuple(4)
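+# e.g. _triple(3) -> (3, 3, 3); an iterable such as (1, 2, 2) is returned unchanged.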
+
+
+class CausalModule(nn.Layer):
+ def __init__(self) -> None:
+ super().__init__()
+ self.activation = None
+
+ def reset_activation(self) -> None:
+ self.activation = None
+
+
+class Conv2dBNActivation(nn.Sequential):
+ def __init__(
+ self,
+ in_planes: int,
+ out_planes: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ padding: Union[int, Tuple[int, int]],
+ stride: Union[int, Tuple[int, int]] = 1,
+ groups: int = 1,
+ norm_layer: Optional[Callable[..., nn.Layer]] = None,
+ activation_layer: Optional[Callable[..., nn.Layer]] = None,
+ **kwargs: Any,
+ ) -> None:
+ kernel_size = _pair(kernel_size)
+ stride = _pair(stride)
+ padding = _pair(padding)
+ if norm_layer is None:
+ norm_layer = Identity
+ if activation_layer is None:
+ activation_layer = Identity
+ self.kernel_size = kernel_size
+ self.stride = stride
+ dict_layers = (nn.Conv2D(in_planes,
+ out_planes,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ groups=groups,
+ **kwargs), norm_layer(out_planes,
+ momentum=0.1),
+ activation_layer())
+
+ self.out_channels = out_planes
+ super(Conv2dBNActivation, self).__init__(dict_layers[0], dict_layers[1],
+ dict_layers[2])
+
+
+class Conv3DBNActivation(nn.Sequential):
+ def __init__(
+ self,
+ in_planes: int,
+ out_planes: int,
+ kernel_size: Union[int, Tuple[int, int, int]],
+ padding: Union[int, Tuple[int, int, int]],
+ stride: Union[int, Tuple[int, int, int]] = 1,
+ groups: int = 1,
+ norm_layer: Optional[Callable[..., nn.Layer]] = None,
+ activation_layer: Optional[Callable[..., nn.Layer]] = None,
+ **kwargs: Any,
+ ) -> None:
+ kernel_size = _triple(kernel_size)
+ stride = _triple(stride)
+ padding = _triple(padding)
+ if norm_layer is None:
+ norm_layer = Identity
+ if activation_layer is None:
+ activation_layer = Identity
+ self.kernel_size = kernel_size
+ self.stride = stride
+
+ dict_layers = (nn.Conv3D(in_planes,
+ out_planes,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ groups=groups,
+ **kwargs), norm_layer(out_planes,
+ momentum=0.1),
+ activation_layer())
+ self.out_channels = out_planes
+ super(Conv3DBNActivation, self).__init__(dict_layers[0], dict_layers[1],
+ dict_layers[2])
+
+
+class ConvBlock3D(CausalModule):
+ def __init__(
+ self,
+ in_planes: int,
+ out_planes: int,
+ kernel_size: Union[int, Tuple[int, int, int]],
+ causal: bool,
+ conv_type: str,
+ padding: Union[int, Tuple[int, int, int]] = 0,
+ stride: Union[int, Tuple[int, int, int]] = 1,
+ norm_layer: Optional[Callable[..., nn.Layer]] = None,
+ activation_layer: Optional[Callable[..., nn.Layer]] = None,
+ bias_attr: bool = False,
+ **kwargs: Any,
+ ) -> None:
+ super().__init__()
+ kernel_size = _triple(kernel_size)
+ stride = _triple(stride)
+ padding = _triple(padding)
+ self.conv_2 = None
+
+ if causal is True:
+ padding = (0, padding[1], padding[2])
+ if conv_type != "2plus1d" and conv_type != "3d":
+ raise ValueError("only 2plus2d or 3d are " +
+ "allowed as 3d convolutions")
+
+ if conv_type == "2plus1d":
+ self.conv_1 = Conv2dBNActivation(in_planes,
+ out_planes,
+ kernel_size=(kernel_size[1],
+ kernel_size[2]),
+ padding=(padding[1], padding[2]),
+ stride=(stride[1], stride[2]),
+ activation_layer=activation_layer,
+ norm_layer=norm_layer,
+ bias_attr=bias_attr,
+ **kwargs)
+ if kernel_size[0] > 1:
+ self.conv_2 = Conv2dBNActivation(
+ in_planes,
+ out_planes,
+ kernel_size=(kernel_size[0], 1),
+ padding=(padding[0], 0),
+ stride=(stride[0], 1),
+ activation_layer=activation_layer,
+ norm_layer=norm_layer,
+ bias_attr=bias_attr,
+ **kwargs)
+ elif conv_type == "3d":
+ self.conv_1 = Conv3DBNActivation(in_planes,
+ out_planes,
+ kernel_size=kernel_size,
+ padding=padding,
+ activation_layer=activation_layer,
+ norm_layer=norm_layer,
+ stride=stride,
+ bias_attr=bias_attr,
+ **kwargs)
+ self.padding = padding
+ self.kernel_size = kernel_size
+ self.dim_pad = self.kernel_size[0] - 1
+ self.stride = stride
+ self.causal = causal
+ self.conv_type = conv_type
+
+ def _forward(self, x: paddle.Tensor) -> paddle.Tensor:
+ if self.dim_pad > 0 and self.conv_2 is None and self.causal is True:
+ x = self._cat_stream_buffer(x)
+ b, c, t, h, w = x.shape
+ if self.conv_type == "2plus1d":
+ x = paddle.transpose(x, (0, 2, 1, 3, 4)) # bcthw --> btchw
+ x = paddle.reshape_(x, (-1, c, h, w)) # btchw --> bt,c,h,w
+ x = self.conv_1(x)
+ if self.conv_type == "2plus1d":
+ b, c, h, w = x.shape
+ x = paddle.reshape_(x, (-1, t, c, h, w)) # bt,c,h,w --> b,t,c,h,w
+ x = paddle.transpose(x, (0, 2, 1, 3, 4)) # b,t,c,h,w --> b,c,t,h,w
+ if self.conv_2 is not None:
+ if self.dim_pad > 0 and self.causal is True:
+ x = self._cat_stream_buffer(x)
+ b, c, t, h, w = x.shape
+ x = paddle.reshape_(x, (b, c, t, h * w))
+ x = self.conv_2(x)
+ b, c, t, _ = x.shape
+ x = paddle.reshape_(x, (b, c, t, h, w))
+ return x
+
+ def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+ x = self._forward(x)
+ return x
+
+ def _cat_stream_buffer(self, x: paddle.Tensor) -> paddle.Tensor:
+ if self.activation is None:
+ self._setup_activation(x.shape)
+ x = paddle.concat((self.activation, x), 2)
+ self._save_in_activation(x)
+ return x
+
+ def _save_in_activation(self, x: paddle.Tensor) -> None:
+ assert self.dim_pad > 0
+ self.activation = paddle.to_tensor(x.numpy()[:, :, -self.dim_pad:,
+ ...]).clone().detach()
+
+ def _setup_activation(self, input_shape: Tuple[float, ...]) -> None:
+ assert self.dim_pad > 0
+ self.activation = paddle.zeros(shape=[
+ *input_shape[:2], # type: ignore
+ self.dim_pad,
+ *input_shape[3:]
+ ])
+
+
+class TemporalCGAvgPool3D(CausalModule):
+ def __init__(self, ) -> None:
+ super().__init__()
+ self.n_cumulated_values = 0
+ self.register_forward_post_hook(self._detach_activation)
+
+ def forward(self, x: paddle.Tensor) -> paddle.Tensor:
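+        # Causal (cumulative) global average over time: each frame is averaged over all
+        # frames seen so far, including those from previous calls, tracked via
+        # self.activation and self.n_cumulated_values.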
+ input_shape = x.shape
+ cumulative_sum = paddle.cumsum(x, axis=2)
+ if self.activation is None:
+ self.activation = cumulative_sum[:, :, -1:].clone()
+ else:
+ cumulative_sum += self.activation
+ self.activation = cumulative_sum[:, :, -1:].clone()
+
+ noe = paddle.arange(1, input_shape[2] + 1)
+ axis = paddle.to_tensor([0, 1, 3, 4])
+ noe = paddle.unsqueeze(noe, axis=axis)
+ divisor = noe.expand(x.shape)
+ x = cumulative_sum / (self.n_cumulated_values + divisor)
+ self.n_cumulated_values += input_shape[2]
+ return x
+
+ @staticmethod
+ def _detach_activation(module: CausalModule, inputs: paddle.Tensor,
+ output: paddle.Tensor) -> None:
+ module.activation.detach()
+
+ def reset_activation(self) -> None:
+ super().reset_activation()
+ self.n_cumulated_values = 0
+
+
+class SqueezeExcitation(nn.Layer):
+ def __init__(self,
+ input_channels: int,
+ activation_2: nn.Layer,
+ activation_1: nn.Layer,
+ conv_type: str,
+ causal: bool,
+ squeeze_factor: int = 4,
+ bias_attr: bool = True) -> None:
+ super().__init__()
+ self.causal = causal
+ se_multiplier = 2 if causal else 1
+ squeeze_channels = _make_divisible(
+ input_channels // squeeze_factor * se_multiplier, 8)
+        self.temporal_cumulative_GAvg3D = TemporalCGAvgPool3D()
+ self.fc1 = ConvBlock3D(input_channels * se_multiplier,
+ squeeze_channels,
+ kernel_size=(1, 1, 1),
+ padding=0,
+ causal=causal,
+ conv_type=conv_type,
+ bias_attr=bias_attr)
+ self.activation_1 = activation_1()
+ self.activation_2 = activation_2()
+ self.fc2 = ConvBlock3D(squeeze_channels,
+ input_channels,
+ kernel_size=(1, 1, 1),
+ padding=0,
+ causal=causal,
+ conv_type=conv_type,
+ bias_attr=bias_attr)
+
+ def _scale(self, inputs: paddle.Tensor) -> paddle.Tensor:
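+        # In causal mode, the running temporal average is concatenated with the current
+        # spatial average, which is why fc1 expects input_channels * 2 channels.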
+ if self.causal:
+ x_space = paddle.mean(inputs, axis=[3, 4], keepdim=True)
+            scale = self.temporal_cumulative_GAvg3D(x_space)
+ scale = paddle.concat((scale, x_space), axis=1)
+ else:
+ scale = F.adaptive_avg_pool3d(inputs, 1)
+ scale = self.fc1(scale)
+ scale = self.activation_1(scale)
+ scale = self.fc2(scale)
+ return self.activation_2(scale)
+
+ def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:
+ scale = self._scale(inputs)
+ return scale * inputs
+
+
+class BasicBneck(nn.Layer):
+ def __init__(
+ self,
+ input_channels,
+ out_channels,
+ expanded_channels,
+ kernel_size,
+ stride,
+ padding,
+ padding_avg,
+ causal: bool,
+ conv_type: str,
+ norm_layer: Optional[Callable[..., nn.Layer]] = None,
+ activation_layer: Optional[Callable[..., nn.Layer]] = None,
+ ) -> None:
+ super().__init__()
+
+ assert type(stride) is tuple
+
+ if (not stride[0] == 1 or not (1 <= stride[1] <= 2)
+ or not (1 <= stride[2] <= 2)):
+ raise ValueError('illegal stride value')
+
+        self.res = None
+        # self.expand is only created when channel expansion is needed; default it to
+        # None so the check in forward() does not raise AttributeError.
+        self.expand = None
+
+ layers = []
+ if expanded_channels != out_channels:
+ # expand
+ self.expand = ConvBlock3D(in_planes=input_channels,
+ out_planes=expanded_channels,
+ kernel_size=(1, 1, 1),
+ padding=(0, 0, 0),
+ causal=causal,
+ conv_type=conv_type,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer)
+        # depthwise
+ self.deep = ConvBlock3D(in_planes=expanded_channels,
+ out_planes=expanded_channels,
+ kernel_size=kernel_size,
+ padding=padding,
+ stride=stride,
+ groups=expanded_channels,
+ causal=causal,
+ conv_type=conv_type,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer)
+
+ # SE
+ self.se = SqueezeExcitation(
+ expanded_channels,
+ causal=causal,
+ activation_1=activation_layer,
+ activation_2=(nn.Sigmoid if conv_type == "3d" else nn.Hardsigmoid),
+ conv_type=conv_type)
+ # project
+ self.project = ConvBlock3D(expanded_channels,
+ out_channels,
+ kernel_size=(1, 1, 1),
+ padding=(0, 0, 0),
+ causal=causal,
+ conv_type=conv_type,
+ norm_layer=norm_layer,
+ activation_layer=Identity)
+
+ if not (stride == (1, 1, 1) and input_channels == out_channels):
+ if stride != (1, 1, 1):
+ layers.append(
+ nn.AvgPool3D((1, 3, 3), stride=stride, padding=padding_avg))
+ layers.append(
+ ConvBlock3D(
+ in_planes=input_channels,
+ out_planes=out_channels,
+ kernel_size=(1, 1, 1),
+ padding=(0, 0, 0),
+ norm_layer=norm_layer,
+ activation_layer=Identity,
+ causal=causal,
+ conv_type=conv_type,
+ ))
+ self.res = nn.Sequential(*layers)
+ self.alpha = self.create_parameter(shape=[1], dtype="float32")
+
+ def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:
+ if self.res is not None:
+ residual = self.res(inputs)
+ else:
+ residual = inputs
+ if self.expand is not None:
+ x = self.expand(inputs)
+ else:
+ x = inputs
+
+ x = self.deep(x)
+ x = self.se(x)
+ x = self.project(x)
+ result = residual + self.alpha * x
+ return result
+
+
+@BACKBONES.register()
+class MoViNet(nn.Layer):
+ def __init__(
+ self,
+ model_type: str = 'A0',
+ hidden_dim: int = 2048,
+ causal: bool = True,
+ num_classes: int = 400,
+ conv_type: str = "3d",
+ ) -> None:
+ super().__init__()
+ """
+ causal: causal mode
+        num_classes: number of classes for classification
+ conv_type: type of convolution either 3d or 2plus1d
+ """
+ blocks_dic = OrderedDict()
+ cfg = MODEL_CONFIG[model_type]
+
+ norm_layer = nn.BatchNorm3D if conv_type == "3d" else nn.BatchNorm2D
+ activation_layer = nn.Swish if conv_type == "3d" else nn.Hardswish
+
+ # conv1
+ self.conv1 = ConvBlock3D(in_planes=cfg['conv1'][0],
+ out_planes=cfg['conv1'][1],
+ kernel_size=cfg['conv1'][2],
+ stride=cfg['conv1'][3],
+ padding=cfg['conv1'][4],
+ causal=causal,
+ conv_type=conv_type,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer)
+ # blocks
+ for i in range(2, len(cfg['block_num']) + 1):
+ for j in range(cfg['block_num'][i - 1]):
+ blocks_dic[f'b{i}_l{j}'] = BasicBneck(
+ cfg[f'b{i}_l{j}'][0],
+ cfg[f'b{i}_l{j}'][1],
+ cfg[f'b{i}_l{j}'][2],
+ cfg[f'b{i}_l{j}'][3],
+ cfg[f'b{i}_l{j}'][4],
+ cfg[f'b{i}_l{j}'][5],
+ cfg[f'b{i}_l{j}'][6],
+ causal=causal,
+ conv_type=conv_type,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer)
+ self.blocks = nn.Sequential(*(blocks_dic.values()))
+
+ # conv7
+ self.conv7 = ConvBlock3D(in_planes=cfg['conv7'][0],
+ out_planes=cfg['conv7'][1],
+ kernel_size=cfg['conv7'][2],
+ stride=cfg['conv7'][3],
+ padding=cfg['conv7'][4],
+ causal=causal,
+ conv_type=conv_type,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer)
+ # pool
+ self.classifier = nn.Sequential(
+ # dense9
+ ConvBlock3D(in_planes=cfg['conv7'][1],
+ out_planes=hidden_dim,
+ kernel_size=(1, 1, 1),
+ causal=causal,
+ conv_type=conv_type,
+ bias_attr=True),
+ nn.Swish(),
+ nn.Dropout(p=0.2),
+ # dense10d
+ ConvBlock3D(in_planes=hidden_dim,
+ out_planes=num_classes,
+ kernel_size=(1, 1, 1),
+ causal=causal,
+ conv_type=conv_type,
+ bias_attr=True),
+ )
+ if causal:
+ self.cgap = TemporalCGAvgPool3D()
+ self.apply(self._weight_init)
+ self.causal = causal
+
+ def avg(self, x: paddle.Tensor) -> paddle.Tensor:
+ if self.causal:
+ avg = F.adaptive_avg_pool3d(x, (x.shape[2], 1, 1))
+ avg = self.cgap(avg)[:, :, -1:]
+ else:
+ avg = F.adaptive_avg_pool3d(x, 1)
+ return avg
+
+ @staticmethod
+ def _weight_init(m):
+ if isinstance(m, nn.Conv3D):
+            nn.initializer.KaimingNormal()(m.weight)
+ if m.bias is not None:
+ nn.initializer.Constant(0.0)(m.bias)
+ elif isinstance(m, (nn.BatchNorm3D, nn.BatchNorm2D, nn.GroupNorm)):
+ nn.initializer.Constant(1.0)(m.weight)
+ nn.initializer.Constant(0.0)(m.bias)
+ elif isinstance(m, nn.Linear):
+            nn.initializer.Normal(mean=0.0, std=0.01)(m.weight)
+ nn.initializer.Constant(0.0)(m.bias)
+
+ def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+ x = self.conv1(x)
+ x = self.blocks(x)
+ x = self.conv7(x)
+ x = self.avg(x)
+ x = self.classifier(x)
+ x = x.flatten(1)
+ return x
+
+ @staticmethod
+ def _clean_activation_buffers(m):
+ if issubclass(type(m), CausalModule):
+ m.reset_activation()
+
+ def clean_activation_buffers(self) -> None:
+ self.apply(self._clean_activation_buffers)
+
+
+if __name__ == '__main__':
+ net = MoViNet(causal=False, conv_type='3d')
+ paddle.summary(net, input_size=(1, 3, 8, 224, 224))
diff --git a/paddlevideo/modeling/backbones/ms_tcn.py b/paddlevideo/modeling/backbones/ms_tcn.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb49b9c808c1707616364a6912bb9ab289771adf
--- /dev/null
+++ b/paddlevideo/modeling/backbones/ms_tcn.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+import copy
+import random
+import math
+
+from paddle import ParamAttr
+from ..registry import BACKBONES
+from ..weight_init import weight_init_
+
+
+def _calculate_fan_in_and_fan_out(tensor):
+ dimensions = len(tensor.shape)
+ if dimensions < 2:
+ raise ValueError("Fan in and fan out can not be computed \
+ for tensor with fewer than 2 dimensions")
+
+ if dimensions == 2: # Linear
+ fan_in = tensor.shape[1]
+ fan_out = tensor.shape[0]
+ else:
+ num_input_fmaps = tensor.shape[1]
+ num_output_fmaps = tensor.shape[0]
+ receptive_field_size = 1
+ if tensor.dim() > 2:
+ receptive_field_size = tensor[0][0].numel()
+ fan_in = num_input_fmaps * receptive_field_size
+ fan_out = num_output_fmaps * receptive_field_size
+
+ return fan_in, fan_out
+
+
+def calculate_gain(nonlinearity=None, a=None):
+ if nonlinearity == 'tanh':
+ return 5.0 / 3
+ elif nonlinearity == 'relu':
+ return math.sqrt(2.0)
+ elif nonlinearity == 'leaky_relu':
+        if a is not None:
+ return math.sqrt(2.0 / (1 + a**2))
+ else:
+ return math.sqrt(2.0 / (1 + 0.01**2))
+ elif nonlinearity == 'selu':
+ return 3.0 / 4
+ else:
+ return 1
+
+
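+# Mimics PyTorch's default Conv weight init (kaiming_uniform_ with a=sqrt(5)),
+# presumably so weights start from the same distribution as the reference implementation.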
+def KaimingUniform_like_torch(weight_npy,
+ mode='fan_in',
+ nonlinearity='leaky_relu'):
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy)
+ if mode == 'fan_in':
+ fan_mode = fan_in
+ else:
+ fan_mode = fan_out
+ a = math.sqrt(5.0)
+ gain = calculate_gain(nonlinearity=nonlinearity, a=a)
+ std = gain / math.sqrt(fan_mode)
+ bound = math.sqrt(3.0) * std
+ return np.random.uniform(-bound, bound, weight_npy.shape)
+
+
+def init_bias(weight_npy, bias_npy):
+    # Note: the bound is computed from the weight's fan-in, not from the bias tensor.
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy)
+ bound = 1.0 / math.sqrt(fan_in)
+ return np.random.uniform(-bound, bound, bias_npy.shape)
+
+
+class SingleStageModel(nn.Layer):
+
+ def __init__(self, num_layers, num_f_maps, dim, num_classes):
+ super(SingleStageModel, self).__init__()
+ self.conv_in = nn.Conv1D(dim, num_f_maps, 1)
+ self.layers = nn.LayerList([
+ copy.deepcopy(DilatedResidualLayer(2**i, num_f_maps, num_f_maps))
+ for i in range(num_layers)
+ ])
+ self.conv_out = nn.Conv1D(num_f_maps, num_classes, 1)
+
+ def forward(self, x):
+ out = self.conv_in(x)
+ for layer in self.layers:
+ out = layer(out)
+ out = self.conv_out(out)
+ return out
+
+
+class DilatedResidualLayer(nn.Layer):
+
+ def __init__(self, dilation, in_channels, out_channels):
+ super(DilatedResidualLayer, self).__init__()
+ self.conv_dilated = nn.Conv1D(in_channels,
+ out_channels,
+ 3,
+ padding=dilation,
+ dilation=dilation)
+ self.conv_in = nn.Conv1D(out_channels, out_channels, 1)
+ self.dropout = nn.Dropout()
+
+ def forward(self, x):
+ out = F.relu(self.conv_dilated(x))
+ out = self.conv_in(out)
+ out = self.dropout(out)
+ return (x + out)
+
+
+@BACKBONES.register()
+class MSTCN(nn.Layer):
+
+ def __init__(self, num_stages, num_layers, num_f_maps, dim, num_classes):
+ super().__init__()
+ self.stage1 = SingleStageModel(num_layers, num_f_maps, dim, num_classes)
+ self.stages = nn.LayerList([
+ copy.deepcopy(
+ SingleStageModel(num_layers, num_f_maps, num_classes,
+ num_classes)) for s in range(num_stages - 1)
+ ])
+
+ def forward(self, x):
+ """ MSTCN forward
+ """
+ out = self.stage1(x)
+ outputs = out.unsqueeze(0)
+ for s in self.stages:
+ out = s(F.softmax(out, axis=1))
+ outputs = paddle.concat((outputs, out.unsqueeze(0)), axis=0)
+ return outputs
+
+ def init_weights(self):
+ for layer in self.sublayers():
+ if isinstance(layer, nn.Conv1D):
+ layer.weight.set_value(
+ KaimingUniform_like_torch(layer.weight).astype('float32'))
+ if layer.bias is not None:
+ layer.bias.set_value(
+ init_bias(layer.weight, layer.bias).astype('float32'))
diff --git a/paddlevideo/modeling/backbones/resnet.py b/paddlevideo/modeling/backbones/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f07991a2349e9cecc3b17d46bf3079a1d255695
--- /dev/null
+++ b/paddlevideo/modeling/backbones/resnet.py
@@ -0,0 +1,283 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import math
+
+import paddle
+import paddle.nn as nn
+from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,
+ AvgPool2D)
+from paddle import ParamAttr
+import paddle.nn.functional as F
+
+from ..registry import BACKBONES
+from ..weight_init import weight_init_
+from ...utils import load_ckpt
+
+
+class ConvBNLayer(nn.Layer):
+ """Conv2D and BatchNorm2D layer.
+
+ Args:
+ in_channels (int): Number of channels for the input.
+ out_channels (int): Number of channels for the output.
+ kernel_size (int): Kernel size.
+ stride (int): Stride in the Conv2D layer. Default: 1.
+ groups (int): Groups in the Conv2D, Default: 1.
+ act (str): Indicate activation after BatchNorm2D layer.
+ name (str): the name of an instance of ConvBNLayer.
+
+ Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method.
+
+ """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ groups=1,
+ act=None,
+ name=None):
+ super(ConvBNLayer, self).__init__()
+ self._conv = Conv2D(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=(kernel_size - 1) // 2,
+ groups=groups,
+ weight_attr=ParamAttr(name=name + "_weights"),
+ bias_attr=False)
+ if name == "conv1":
+ bn_name = "bn_" + name
+ else:
+ bn_name = "bn" + name[3:]
+
+ self._act = act
+
+ self._batch_norm = BatchNorm2D(out_channels,
+ weight_attr=ParamAttr(name=bn_name +
+ "_scale"),
+ bias_attr=ParamAttr(bn_name + "_offset"))
+
+ def forward(self, inputs):
+ y = self._conv(inputs)
+ y = self._batch_norm(y)
+ if self._act:
+ y = getattr(paddle.nn.functional, self._act)(y)
+ return y
+
+
+class BottleneckBlock(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride,
+ shortcut=True,
+ name=None):
+ super(BottleneckBlock, self).__init__()
+ self.conv0 = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ act="relu",
+ name=name + "_branch2a")
+ self.conv1 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=3,
+ stride=stride,
+ act="relu",
+ name=name + "_branch2b")
+
+ self.conv2 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels * 4,
+ kernel_size=1,
+ act=None,
+ name=name + "_branch2c")
+
+ if not shortcut:
+ self.short = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels * 4,
+ kernel_size=1,
+ stride=stride,
+ name=name + "_branch1")
+
+ self.shortcut = shortcut
+
+ def forward(self, inputs):
+ y = self.conv0(inputs)
+ conv1 = self.conv1(y)
+ conv2 = self.conv2(conv1)
+ if self.shortcut:
+ short = inputs
+ else:
+ short = self.short(inputs)
+ y = paddle.add(x=short, y=conv2)
+ return F.relu(y)
+
+
+class BasicBlock(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride,
+ shortcut=True,
+ name=None):
+ super(BasicBlock, self).__init__()
+ self.stride = stride
+        self.conv0 = ConvBNLayer(in_channels=in_channels,
+                                 out_channels=out_channels,
+                                 kernel_size=3,
+                                 stride=stride,
+                                 act="relu",
+                                 name=name + "_branch2a")
+        self.conv1 = ConvBNLayer(in_channels=out_channels,
+                                 out_channels=out_channels,
+                                 kernel_size=3,
+                                 act=None,
+                                 name=name + "_branch2b")
+
+ if not shortcut:
+            self.short = ConvBNLayer(in_channels=in_channels,
+                                     out_channels=out_channels,
+                                     kernel_size=1,
+                                     stride=stride,
+                                     name=name + "_branch1")
+
+ self.shortcut = shortcut
+
+ def forward(self, inputs):
+ y = self.conv0(inputs)
+ conv1 = self.conv1(y)
+
+ if self.shortcut:
+ short = inputs
+ else:
+ short = self.short(inputs)
+ y = paddle.add(short, conv1)
+ y = F.relu(y)
+ return y
+
+
+@BACKBONES.register()
+class ResNet(nn.Layer):
+ """ResNet backbone.
+
+ Args:
+ depth (int): Depth of resnet model.
+ pretrained (str): pretrained model. Default: None.
+ """
+ def __init__(self, depth, pretrained=None):
+ super(ResNet, self).__init__()
+ self.pretrained = pretrained
+ self.layers = depth
+
+ supported_layers = [18, 34, 50, 101, 152]
+ assert self.layers in supported_layers, \
+ "supported layers are {} but input layer is {}".format(
+ supported_layers, self.layers)
+
+ if self.layers == 18:
+ depth = [2, 2, 2, 2]
+ elif self.layers == 34 or self.layers == 50:
+ depth = [3, 4, 6, 3]
+ elif self.layers == 101:
+ depth = [3, 4, 23, 3]
+ elif self.layers == 152:
+ depth = [3, 8, 36, 3]
+
+ in_channels = [64, 256, 512, 1024]
+ out_channels = [64, 128, 256, 512]
+
+ self.conv = ConvBNLayer(in_channels=3,
+ out_channels=64,
+ kernel_size=7,
+ stride=2,
+ act="relu",
+ name="conv1")
+ self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)
+
+ self.block_list = []
+ if self.layers >= 50:
+ for block in range(len(depth)):
+ shortcut = False
+ for i in range(depth[block]):
+ if self.layers in [101, 152] and block == 2:
+ if i == 0:
+ conv_name = "res" + str(block + 2) + "a"
+ else:
+ conv_name = "res" + str(block + 2) + "b" + str(i)
+ else:
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ bottleneck_block = self.add_sublayer(
+ conv_name,
+ BottleneckBlock(
+ # NOTE: Be careful! Here is different from TSM model.
+ in_channels=in_channels[block]
+ if i == 0 else out_channels[block] * 4,
+ out_channels=out_channels[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ shortcut=shortcut,
+ name=conv_name))
+
+ self.block_list.append(bottleneck_block)
+ shortcut = True
+ else:
+ for block in range(len(depth)):
+ shortcut = False
+ for i in range(depth[block]):
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ basic_block = self.add_sublayer(
+ conv_name,
+ BasicBlock(in_channels=in_channels[block]
+ if i == 0 else out_channels[block],
+ out_channels=out_channels[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ shortcut=shortcut,
+ name=conv_name))
+ self.block_list.append(basic_block)
+ shortcut = True
+
+ def init_weights(self):
+ """Initiate the parameters.
+ Note:
+ 1. when indicate pretrained loading path, will load it to initiate backbone.
+ 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.
+ Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html
+ """
+ #XXX: check bias!!! check pretrained!!!
+
+ if isinstance(self.pretrained, str) and self.pretrained.strip() != "":
+ load_ckpt(self, self.pretrained)
+ elif self.pretrained is None or self.pretrained.strip() == "":
+ for layer in self.sublayers():
+ if isinstance(layer, nn.Conv2D):
+ #XXX: no bias
+ weight_init_(layer, 'KaimingNormal')
+ elif isinstance(layer, nn.BatchNorm2D):
+ weight_init_(layer, 'Constant', value=1)
+
+ def forward(self, inputs):
+ """Define how the backbone is going to run.
+
+ """
+        #NOTE: Batches (axis 0) and segments (axis 1) have already been merged before the feature extraction phase,
+ # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27
+ #y = paddle.reshape(
+ # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])
+
+ y = self.conv(inputs)
+ y = self.pool2D_max(y)
+ for block in self.block_list:
+ y = block(y)
+ return y
diff --git a/paddlevideo/modeling/backbones/resnet_slowfast.py b/paddlevideo/modeling/backbones/resnet_slowfast.py
new file mode 100644
index 0000000000000000000000000000000000000000..a67915946b014172f39621bfdb0c62a65e0c8cd9
--- /dev/null
+++ b/paddlevideo/modeling/backbones/resnet_slowfast.py
@@ -0,0 +1,795 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+from paddle.nn.initializer import KaimingNormal
+from ..registry import BACKBONES
+from paddlevideo.utils.multigrid import get_norm
+import sys
+import numpy as np
+import paddle.distributed as dist
+
+# set the global random seed
+paddle.framework.seed(0)
+
+
+# get init parameters for conv layer
+def get_conv_init(fan_out):
+ return KaimingNormal(fan_in=fan_out)
+
+
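+# BatchNorm scale/offset use a constant initializer and, with the default coeff=0.0,
+# no L2 weight decay.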
+def get_bn_param_attr(bn_weight=1.0, coeff=0.0):
+ param_attr = paddle.ParamAttr(
+ initializer=paddle.nn.initializer.Constant(bn_weight),
+ regularizer=paddle.regularizer.L2Decay(coeff))
+ return param_attr
+
+
+"""Video models."""
+
+
+class BottleneckTransform(paddle.nn.Layer):
+ """
+ Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of
+ temporal kernel.
+ """
+ def __init__(self,
+ dim_in,
+ dim_out,
+ temp_kernel_size,
+ stride,
+ dim_inner,
+ num_groups,
+ stride_1x1=False,
+ inplace_relu=True,
+ eps=1e-5,
+ dilation=1,
+ norm_module=paddle.nn.BatchNorm3D):
+ """
+ Args:
+ dim_in (int): the channel dimensions of the input.
+ dim_out (int): the channel dimension of the output.
+ temp_kernel_size (int): the temporal kernel sizes of the middle
+ convolution in the bottleneck.
+ stride (int): the stride of the bottleneck.
+ dim_inner (int): the inner dimension of the block.
+ num_groups (int): number of groups for the convolution. num_groups=1
+ is for standard ResNet like networks, and num_groups>1 is for
+ ResNeXt like networks.
+ stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise
+ apply stride to the 3x3 conv.
+ inplace_relu (bool): if True, calculate the relu on the original
+ input without allocating new memory.
+ eps (float): epsilon for batch norm.
+ dilation (int): size of dilation.
+ """
+ super(BottleneckTransform, self).__init__()
+ self.temp_kernel_size = temp_kernel_size
+ self._inplace_relu = inplace_relu
+ self._eps = eps
+ self._stride_1x1 = stride_1x1
+ self.norm_module = norm_module
+ self._construct(dim_in, dim_out, stride, dim_inner, num_groups,
+ dilation)
+
+ def _construct(self, dim_in, dim_out, stride, dim_inner, num_groups,
+ dilation):
+ str1x1, str3x3 = (stride, 1) if self._stride_1x1 else (1, stride)
+
+ fan = (dim_inner) * (self.temp_kernel_size * 1 * 1)
+ initializer_tmp = get_conv_init(fan)
+
+ self.a = paddle.nn.Conv3D(
+ in_channels=dim_in,
+ out_channels=dim_inner,
+ kernel_size=[self.temp_kernel_size, 1, 1],
+ stride=[1, str1x1, str1x1],
+ padding=[int(self.temp_kernel_size // 2), 0, 0],
+ weight_attr=paddle.ParamAttr(initializer=initializer_tmp),
+ bias_attr=False)
+ self.a_bn = self.norm_module(num_features=dim_inner,
+ epsilon=self._eps,
+ weight_attr=get_bn_param_attr(),
+ bias_attr=get_bn_param_attr(bn_weight=0.0))
+
+ # 1x3x3, BN, ReLU.
+ fan = (dim_inner) * (1 * 3 * 3)
+ initializer_tmp = get_conv_init(fan)
+
+ self.b = paddle.nn.Conv3D(
+ in_channels=dim_inner,
+ out_channels=dim_inner,
+ kernel_size=[1, 3, 3],
+ stride=[1, str3x3, str3x3],
+ padding=[0, dilation, dilation],
+ groups=num_groups,
+ dilation=[1, dilation, dilation],
+ weight_attr=paddle.ParamAttr(initializer=initializer_tmp),
+ bias_attr=False)
+ self.b_bn = self.norm_module(num_features=dim_inner,
+ epsilon=self._eps,
+ weight_attr=get_bn_param_attr(),
+ bias_attr=get_bn_param_attr(bn_weight=0.0))
+
+ # 1x1x1, BN.
+ fan = (dim_out) * (1 * 1 * 1)
+ initializer_tmp = get_conv_init(fan)
+
+ self.c = paddle.nn.Conv3D(
+ in_channels=dim_inner,
+ out_channels=dim_out,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ weight_attr=paddle.ParamAttr(initializer=initializer_tmp),
+ bias_attr=False)
+ self.c_bn = self.norm_module(
+ num_features=dim_out,
+ epsilon=self._eps,
+ weight_attr=get_bn_param_attr(bn_weight=0.0),
+ bias_attr=get_bn_param_attr(bn_weight=0.0))
+
+ def forward(self, x):
+ # Branch2a.
+ x = self.a(x)
+ x = self.a_bn(x)
+ x = F.relu(x)
+
+ # Branch2b.
+ x = self.b(x)
+ x = self.b_bn(x)
+ x = F.relu(x)
+
+ # Branch2c
+ x = self.c(x)
+ x = self.c_bn(x)
+ return x
+
+
+class ResBlock(paddle.nn.Layer):
+ """
+ Residual block.
+ """
+ def __init__(self,
+ dim_in,
+ dim_out,
+ temp_kernel_size,
+ stride,
+ dim_inner,
+ num_groups=1,
+ stride_1x1=False,
+ inplace_relu=True,
+ eps=1e-5,
+ dilation=1,
+ norm_module=paddle.nn.BatchNorm3D):
+ """
+        ResBlock class constructs residual blocks. More details can be found in:
+ Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun.
+ "Deep residual learning for image recognition."
+ https://arxiv.org/abs/1512.03385
+ Args:
+ dim_in (int): the channel dimensions of the input.
+ dim_out (int): the channel dimension of the output.
+ temp_kernel_size (int): the temporal kernel sizes of the middle
+ convolution in the bottleneck.
+ stride (int): the stride of the bottleneck.
+ trans_func (string): transform function to be used to construct the
+ bottleneck.
+ dim_inner (int): the inner dimension of the block.
+ num_groups (int): number of groups for the convolution. num_groups=1
+ is for standard ResNet like networks, and num_groups>1 is for
+ ResNeXt like networks.
+ stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise
+ apply stride to the 3x3 conv.
+ inplace_relu (bool): calculate the relu on the original input
+ without allocating new memory.
+ eps (float): epsilon for batch norm.
+ dilation (int): size of dilation.
+ """
+ super(ResBlock, self).__init__()
+ self._inplace_relu = inplace_relu
+ self._eps = eps
+ self.norm_module = norm_module
+ self._construct(
+ dim_in,
+ dim_out,
+ temp_kernel_size,
+ stride,
+ dim_inner,
+ num_groups,
+ stride_1x1,
+ inplace_relu,
+ dilation,
+ )
+
+ def _construct(
+ self,
+ dim_in,
+ dim_out,
+ temp_kernel_size,
+ stride,
+ dim_inner,
+ num_groups,
+ stride_1x1,
+ inplace_relu,
+ dilation,
+ ):
+ # Use skip connection with projection if dim or res change.
+ if (dim_in != dim_out) or (stride != 1):
+ fan = (dim_out) * (1 * 1 * 1)
+ initializer_tmp = get_conv_init(fan)
+ self.branch1 = paddle.nn.Conv3D(
+ in_channels=dim_in,
+ out_channels=dim_out,
+ kernel_size=1,
+ stride=[1, stride, stride],
+ padding=0,
+ weight_attr=paddle.ParamAttr(initializer=initializer_tmp),
+ bias_attr=False,
+ dilation=1)
+ self.branch1_bn = self.norm_module(
+ num_features=dim_out,
+ epsilon=self._eps,
+ weight_attr=get_bn_param_attr(),
+ bias_attr=get_bn_param_attr(bn_weight=0.0))
+
+ self.branch2 = BottleneckTransform(dim_in,
+ dim_out,
+ temp_kernel_size,
+ stride,
+ dim_inner,
+ num_groups,
+ stride_1x1=stride_1x1,
+ inplace_relu=inplace_relu,
+ dilation=dilation,
+ norm_module=self.norm_module)
+
+ def forward(self, x):
+ if hasattr(self, "branch1"):
+ x1 = self.branch1(x)
+ x1 = self.branch1_bn(x1)
+ x2 = self.branch2(x)
+ x = paddle.add(x=x1, y=x2)
+ else:
+ x2 = self.branch2(x)
+ x = paddle.add(x=x, y=x2)
+
+ x = F.relu(x)
+ return x
+
+
+class ResStage(paddle.nn.Layer):
+ """
+ Stage of 3D ResNet. It expects to have one or more tensors as input for
+ multi-pathway (SlowFast) cases. More details can be found here:
+
+ Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.
+ "Slowfast networks for video recognition."
+ https://arxiv.org/pdf/1812.03982.pdf
+ """
+ def __init__(self,
+ dim_in,
+ dim_out,
+ stride,
+ temp_kernel_sizes,
+ num_blocks,
+ dim_inner,
+ num_groups,
+ num_block_temp_kernel,
+ dilation,
+ stride_1x1=False,
+ inplace_relu=True,
+ norm_module=paddle.nn.BatchNorm3D):
+ """
+ The `__init__` method of any subclass should also contain these arguments.
+ ResStage builds p streams, where p can be greater or equal to one.
+ Args:
+ dim_in (list): list of p the channel dimensions of the input.
+ Different channel dimensions control the input dimension of
+ different pathways.
+ dim_out (list): list of p the channel dimensions of the output.
+ Different channel dimensions control the input dimension of
+ different pathways.
+ temp_kernel_sizes (list): list of the p temporal kernel sizes of the
+ convolution in the bottleneck. Different temp_kernel_sizes
+ control different pathway.
+ stride (list): list of the p strides of the bottleneck. Different
+ stride control different pathway.
+ num_blocks (list): list of p numbers of blocks for each of the
+ pathway.
+ dim_inner (list): list of the p inner channel dimensions of the
+ input. Different channel dimensions control the input dimension
+ of different pathways.
+ num_groups (list): list of number of p groups for the convolution.
+ num_groups=1 is for standard ResNet like networks, and
+ num_groups>1 is for ResNeXt like networks.
+            num_block_temp_kernel (list): extend the temp_kernel_sizes to
+                num_block_temp_kernel blocks, then fill a temporal kernel size
+                of 1 for the rest of the layers.
+ dilation (list): size of dilation for each pathway.
+ """
+ super(ResStage, self).__init__()
+ assert all((num_block_temp_kernel[i] <= num_blocks[i]
+ for i in range(len(temp_kernel_sizes))))
+ self.num_blocks = num_blocks
+ self.temp_kernel_sizes = [
+ (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]] +
+ [1] * (num_blocks[i] - num_block_temp_kernel[i])
+ for i in range(len(temp_kernel_sizes))
+ ]
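+        # e.g. temp_kernel_sizes=[[3]], num_blocks=[6], num_block_temp_kernel=[4]
+        # yields [[3, 3, 3, 3, 1, 1]]: kernel 3 for the first 4 blocks, 1 afterwards.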
+ assert (len({
+ len(dim_in),
+ len(dim_out),
+ len(temp_kernel_sizes),
+ len(stride),
+ len(num_blocks),
+ len(dim_inner),
+ len(num_groups),
+ len(num_block_temp_kernel),
+ }) == 1)
+ self.num_pathways = len(self.num_blocks)
+ self.norm_module = norm_module
+ self._construct(
+ dim_in,
+ dim_out,
+ stride,
+ dim_inner,
+ num_groups,
+ stride_1x1,
+ inplace_relu,
+ dilation,
+ )
+
+ def _construct(
+ self,
+ dim_in,
+ dim_out,
+ stride,
+ dim_inner,
+ num_groups,
+ stride_1x1,
+ inplace_relu,
+ dilation,
+ ):
+
+ for pathway in range(self.num_pathways):
+ for i in range(self.num_blocks[pathway]):
+ res_block = ResBlock(
+ dim_in[pathway] if i == 0 else dim_out[pathway],
+ dim_out[pathway],
+ self.temp_kernel_sizes[pathway][i],
+ stride[pathway] if i == 0 else 1,
+ dim_inner[pathway],
+ num_groups[pathway],
+ stride_1x1=stride_1x1,
+ inplace_relu=inplace_relu,
+ dilation=dilation[pathway],
+ norm_module=self.norm_module)
+ self.add_sublayer("pathway{}_res{}".format(pathway, i),
+ res_block)
+
+ def forward(self, inputs):
+ output = []
+ for pathway in range(self.num_pathways):
+ x = inputs[pathway]
+
+ for i in range(self.num_blocks[pathway]):
+ m = getattr(self, "pathway{}_res{}".format(pathway, i))
+ x = m(x)
+ output.append(x)
+
+ return output
+
+
+class ResNetBasicStem(paddle.nn.Layer):
+ """
+ ResNe(X)t 3D stem module.
+    Performs spatiotemporal Convolution, BN, and Relu followed by a
+ spatiotemporal pooling.
+ """
+ def __init__(self,
+ dim_in,
+ dim_out,
+ kernel,
+ stride,
+ padding,
+ eps=1e-5,
+ norm_module=paddle.nn.BatchNorm3D):
+ super(ResNetBasicStem, self).__init__()
+ self.kernel = kernel
+ self.stride = stride
+ self.padding = padding
+ self.eps = eps
+ self.norm_module = norm_module
+ self._construct_stem(dim_in, dim_out)
+
+ def _construct_stem(self, dim_in, dim_out):
+ fan = (dim_out) * (self.kernel[0] * self.kernel[1] * self.kernel[2])
+ initializer_tmp = get_conv_init(fan)
+
+ self._conv = paddle.nn.Conv3D(
+ in_channels=dim_in,
+ out_channels=dim_out,
+ kernel_size=self.kernel,
+ stride=self.stride,
+ padding=self.padding,
+ weight_attr=paddle.ParamAttr(initializer=initializer_tmp),
+ bias_attr=False)
+ self._bn = self.norm_module(num_features=dim_out,
+ epsilon=self.eps,
+ weight_attr=get_bn_param_attr(),
+ bias_attr=get_bn_param_attr(bn_weight=0.0))
+
+ def forward(self, x):
+ x = self._conv(x)
+ x = self._bn(x)
+ x = F.relu(x)
+
+ x = F.max_pool3d(x=x,
+ kernel_size=[1, 3, 3],
+ stride=[1, 2, 2],
+ padding=[0, 1, 1],
+ data_format="NCDHW")
+ return x
+
+
+class VideoModelStem(paddle.nn.Layer):
+ """
+ Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool
+ on input data tensor for slow and fast pathways.
+ """
+ def __init__(self,
+ dim_in,
+ dim_out,
+ kernel,
+ stride,
+ padding,
+ eps=1e-5,
+ norm_module=paddle.nn.BatchNorm3D):
+ """
+ Args:
+ dim_in (list): the list of channel dimensions of the inputs.
+ dim_out (list): the output dimension of the convolution in the stem
+ layer.
+ kernel (list): the kernels' size of the convolutions in the stem
+ layers. Temporal kernel size, height kernel size, width kernel
+ size in order.
+ stride (list): the stride sizes of the convolutions in the stem
+ layer. Temporal kernel stride, height kernel size, width kernel
+ size in order.
+ padding (list): the paddings' sizes of the convolutions in the stem
+ layer. Temporal padding size, height padding size, width padding
+ size in order.
+ eps (float): epsilon for batch norm.
+ """
+ super(VideoModelStem, self).__init__()
+
+ assert (len({
+ len(dim_in),
+ len(dim_out),
+ len(kernel),
+ len(stride),
+ len(padding),
+ }) == 1), "Input pathway dimensions are not consistent."
+ self.num_pathways = len(dim_in)
+ self.kernel = kernel
+ self.stride = stride
+ self.padding = padding
+ self.eps = eps
+ self.norm_module = norm_module
+ self._construct_stem(dim_in, dim_out)
+
+ def _construct_stem(self, dim_in, dim_out):
+ for pathway in range(len(dim_in)):
+ stem = ResNetBasicStem(dim_in[pathway], dim_out[pathway],
+ self.kernel[pathway], self.stride[pathway],
+ self.padding[pathway], self.eps,
+ self.norm_module)
+ self.add_sublayer("pathway{}_stem".format(pathway), stem)
+
+ def forward(self, x):
+ assert (len(x) == self.num_pathways
+ ), "Input tensor does not contain {} pathway".format(
+ self.num_pathways)
+
+ for pathway in range(len(x)):
+ m = getattr(self, "pathway{}_stem".format(pathway))
+ x[pathway] = m(x[pathway])
+
+ return x
+
+
+class FuseFastToSlow(paddle.nn.Layer):
+ """
+ Fuses the information from the Fast pathway to the Slow pathway. Given the
+ tensors from Slow pathway and Fast pathway, fuse information from Fast to
+ Slow, then return the fused tensors from Slow and Fast pathway in order.
+ """
+ def __init__(self,
+ dim_in,
+ fusion_conv_channel_ratio,
+ fusion_kernel,
+ alpha,
+ fuse_bn_relu=1,
+ eps=1e-5,
+ norm_module=paddle.nn.BatchNorm3D):
+ """
+ Args:
+ dim_in (int): the channel dimension of the input.
+ fusion_conv_channel_ratio (int): channel ratio for the convolution
+ used to fuse from Fast pathway to Slow pathway.
+ fusion_kernel (int): kernel size of the convolution used to fuse
+ from Fast pathway to Slow pathway.
+ alpha (int): the frame rate ratio between the Fast and Slow pathway.
+ eps (float): epsilon for batch norm.
+ """
+ super(FuseFastToSlow, self).__init__()
+ self.fuse_bn_relu = fuse_bn_relu
+ fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1)
+ initializer_tmp = get_conv_init(fan)
+
+ self._conv_f2s = paddle.nn.Conv3D(
+ in_channels=dim_in,
+ out_channels=dim_in * fusion_conv_channel_ratio,
+ kernel_size=[fusion_kernel, 1, 1],
+ stride=[alpha, 1, 1],
+ padding=[fusion_kernel // 2, 0, 0],
+ weight_attr=paddle.ParamAttr(initializer=initializer_tmp),
+ bias_attr=False)
+ self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio,
+ epsilon=eps,
+ weight_attr=get_bn_param_attr(),
+ bias_attr=get_bn_param_attr(bn_weight=0.0))
+
+ def forward(self, x):
+ x_s = x[0]
+ x_f = x[1]
+ fuse = self._conv_f2s(x_f)
+        # TODO: For AVA, set fuse_bn_relu=1 and check whether mAP improves.
+ if self.fuse_bn_relu:
+ fuse = self._bn(fuse)
+ fuse = F.relu(fuse)
+ x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None)
+
+ return [x_s_fuse, x_f]
+
+
+@BACKBONES.register()
+class ResNetSlowFast(paddle.nn.Layer):
+ """
+ SlowFast model builder for SlowFast network.
+
+ Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.
+ "Slowfast networks for video recognition."
+ https://arxiv.org/pdf/1812.03982.pdf
+ """
+ def __init__(
+ self,
+ alpha,
+ beta,
+ bn_norm_type="batchnorm",
+ bn_num_splits=1,
+ num_pathways=2,
+ depth=50,
+ num_groups=1,
+ input_channel_num=[3, 3],
+ width_per_group=64,
+ fusion_conv_channel_ratio=2,
+ fusion_kernel_sz=7, #5?
+ pool_size_ratio=[[1, 1, 1], [1, 1, 1]],
+        fuse_bn_relu=1,
+        spatial_strides=[[1, 1], [2, 2], [2, 2], [2, 2]],
+        use_pool_af_s2=1,
+ ):
+ """
+        Args:
+            alpha (int): frame rate ratio between the Fast and Slow pathways.
+            beta (int): channel ratio between the Slow and Fast pathways; the Fast
+                pathway uses width_per_group // beta channels.
+ """
+ super(ResNetSlowFast, self).__init__()
+
+ self.alpha = alpha #8
+ self.beta = beta #8
+ self.norm_module = get_norm(bn_norm_type, bn_num_splits)
+ self.num_pathways = num_pathways
+ self.depth = depth
+ self.num_groups = num_groups
+ self.input_channel_num = input_channel_num
+ self.width_per_group = width_per_group
+ self.fusion_conv_channel_ratio = fusion_conv_channel_ratio
+        self.fusion_kernel_sz = fusion_kernel_sz  # NOTE: 7 for the 8x8 setting, 5 in the old implementation
+ self.pool_size_ratio = pool_size_ratio
+ self.fuse_bn_relu = fuse_bn_relu
+ self.spatial_strides = spatial_strides
+ self.use_pool_af_s2 = use_pool_af_s2
+ self._construct_network()
+
+ def _construct_network(self):
+ """
+ Builds a SlowFast model.
+ The first pathway is the Slow pathway
+ and the second pathway is the Fast pathway.
+ """
+ temp_kernel = [
+ [[1], [5]], # conv1 temporal kernel for slow and fast pathway.
+ [[1], [3]], # res2 temporal kernel for slow and fast pathway.
+ [[1], [3]], # res3 temporal kernel for slow and fast pathway.
+ [[3], [3]], # res4 temporal kernel for slow and fast pathway.
+ [[3], [3]],
+ ] # res5 temporal kernel for slow and fast pathway.
+
+ self.s1 = VideoModelStem(
+ dim_in=self.input_channel_num,
+ dim_out=[self.width_per_group, self.width_per_group // self.beta],
+ kernel=[temp_kernel[0][0] + [7, 7], temp_kernel[0][1] + [7, 7]],
+ stride=[[1, 2, 2]] * 2,
+ padding=[
+ [temp_kernel[0][0][0] // 2, 3, 3],
+ [temp_kernel[0][1][0] // 2, 3, 3],
+ ],
+ norm_module=self.norm_module)
+ self.s1_fuse = FuseFastToSlow(
+ dim_in=self.width_per_group // self.beta,
+ fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,
+ fusion_kernel=self.fusion_kernel_sz,
+ alpha=self.alpha,
+ norm_module=self.norm_module,
+ fuse_bn_relu=self.fuse_bn_relu)
+
+ # ResNet backbone
+ MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3)}
+ (d2, d3, d4, d5) = MODEL_STAGE_DEPTH[self.depth]
+
+ num_block_temp_kernel = [[3, 3], [4, 4], [6, 6], [3, 3]]
+ spatial_dilations = [[1, 1], [1, 1], [1, 1], [1, 1]]
+ spatial_strides = self.spatial_strides
+ #spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]]
+        #spatial_strides = [[1, 1], [2, 2], [2, 2], [1, 1]]  # TODO: check which value matches FAIR's implementation
+
+ out_dim_ratio = self.beta // self.fusion_conv_channel_ratio #4
+ dim_inner = self.width_per_group * self.num_groups #64
+
+ self.s2 = ResStage(dim_in=[
+ self.width_per_group + self.width_per_group // out_dim_ratio,
+ self.width_per_group // self.beta,
+ ],
+ dim_out=[
+ self.width_per_group * 4,
+ self.width_per_group * 4 // self.beta,
+ ],
+ dim_inner=[dim_inner, dim_inner // self.beta],
+ temp_kernel_sizes=temp_kernel[1],
+ stride=spatial_strides[0],
+ num_blocks=[d2] * 2,
+ num_groups=[self.num_groups] * 2,
+ num_block_temp_kernel=num_block_temp_kernel[0],
+ dilation=spatial_dilations[0],
+ norm_module=self.norm_module)
+
+ self.s2_fuse = FuseFastToSlow(
+ dim_in=self.width_per_group * 4 // self.beta,
+ fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,
+ fusion_kernel=self.fusion_kernel_sz,
+ alpha=self.alpha,
+ norm_module=self.norm_module,
+ fuse_bn_relu=self.fuse_bn_relu,
+ )
+
+ self.s3 = ResStage(
+ dim_in=[
+ self.width_per_group * 4 +
+ self.width_per_group * 4 // out_dim_ratio,
+ self.width_per_group * 4 // self.beta,
+ ],
+ dim_out=[
+ self.width_per_group * 8,
+ self.width_per_group * 8 // self.beta,
+ ],
+ dim_inner=[dim_inner * 2, dim_inner * 2 // self.beta],
+ temp_kernel_sizes=temp_kernel[2],
+ stride=spatial_strides[1],
+ num_blocks=[d3] * 2,
+ num_groups=[self.num_groups] * 2,
+ num_block_temp_kernel=num_block_temp_kernel[1],
+ dilation=spatial_dilations[1],
+ norm_module=self.norm_module,
+ )
+
+ self.s3_fuse = FuseFastToSlow(
+ dim_in=self.width_per_group * 8 // self.beta,
+ fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,
+ fusion_kernel=self.fusion_kernel_sz,
+ alpha=self.alpha,
+ norm_module=self.norm_module,
+ fuse_bn_relu=self.fuse_bn_relu,
+ )
+
+ self.s4 = ResStage(
+ dim_in=[
+ self.width_per_group * 8 +
+ self.width_per_group * 8 // out_dim_ratio,
+ self.width_per_group * 8 // self.beta,
+ ],
+ dim_out=[
+ self.width_per_group * 16,
+ self.width_per_group * 16 // self.beta,
+ ],
+ dim_inner=[dim_inner * 4, dim_inner * 4 // self.beta],
+ temp_kernel_sizes=temp_kernel[3],
+ stride=spatial_strides[2],
+ num_blocks=[d4] * 2,
+ num_groups=[self.num_groups] * 2,
+ num_block_temp_kernel=num_block_temp_kernel[2],
+ dilation=spatial_dilations[2],
+ norm_module=self.norm_module,
+ )
+
+ self.s4_fuse = FuseFastToSlow(
+ dim_in=self.width_per_group * 16 // self.beta,
+ fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,
+ fusion_kernel=self.fusion_kernel_sz,
+ alpha=self.alpha,
+ norm_module=self.norm_module,
+ fuse_bn_relu=self.fuse_bn_relu,
+ )
+
+ self.s5 = ResStage(
+ dim_in=[
+ self.width_per_group * 16 +
+ self.width_per_group * 16 // out_dim_ratio,
+ self.width_per_group * 16 // self.beta,
+ ],
+ dim_out=[
+ self.width_per_group * 32,
+ self.width_per_group * 32 // self.beta,
+ ],
+ dim_inner=[dim_inner * 8, dim_inner * 8 // self.beta],
+ temp_kernel_sizes=temp_kernel[4],
+ stride=spatial_strides[3],
+ num_blocks=[d5] * 2,
+ num_groups=[self.num_groups] * 2,
+ num_block_temp_kernel=num_block_temp_kernel[3],
+ dilation=spatial_dilations[3],
+ norm_module=self.norm_module,
+ )
+
+ def init_weights(self):
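+        # Parameters are already initialized by the per-layer weight_attr /
+        # bias_attr initializers above, so nothing else is needed here.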
+ pass
+
+ def forward(self, x):
+ x = self.s1(x) #VideoModelStem
+ x = self.s1_fuse(x) #FuseFastToSlow
+ x = self.s2(x) #ResStage
+ x = self.s2_fuse(x)
+
+        # TODO: For AVA, set use_pool_af_s2=1 and check whether mAP improves.
+ if self.use_pool_af_s2:
+ for pathway in range(self.num_pathways):
+ x[pathway] = F.max_pool3d(x=x[pathway],
+ kernel_size=self.pool_size_ratio[pathway],
+ stride=self.pool_size_ratio[pathway],
+ padding=[0, 0, 0],
+ data_format="NCDHW")
+
+ x = self.s3(x)
+ x = self.s3_fuse(x)
+ x = self.s4(x)
+ x = self.s4_fuse(x)
+ x = self.s5(x)
+ return x
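A minimal usage sketch of the backbone above (not part of the patch; the import path and the quoted output shapes are assumptions based on the default configuration):

    import paddle
    from paddlevideo.modeling.backbones.resnet_slowfast import ResNetSlowFast  # assumed path

    backbone = ResNetSlowFast(alpha=8, beta=8)
    backbone.init_weights()
    # The Fast pathway sees alpha times more frames than the Slow pathway.
    slow = paddle.randn([2, 3, 4, 224, 224])      # [N, C, T, H, W]
    fast = paddle.randn([2, 3, 4 * 8, 224, 224])
    slow_feat, fast_feat = backbone([slow, fast])
    # With the defaults this should give roughly [2, 2048, 4, 7, 7] and [2, 256, 32, 7, 7].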
diff --git a/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py b/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py
new file mode 100644
index 0000000000000000000000000000000000000000..d348d45cf20186c9d89e666d0dcc5ac93cf66363
--- /dev/null
+++ b/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py
@@ -0,0 +1,796 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+from paddle.nn.initializer import KaimingNormal
+from ..registry import BACKBONES
+from paddlevideo.utils.multigrid import get_norm
+import sys
+import numpy as np
+import paddle.distributed as dist
+
+# fix the global random seed for reproducibility
+paddle.framework.seed(0)
+
+
+# Kaiming-normal initializer for conv layers; the fan_out value is passed via the fan_in argument
+def get_conv_init(fan_out):
+ return KaimingNormal(fan_in=fan_out)
+
+
+def get_bn_param_attr(bn_weight=1.0, coeff=0.0):
+ param_attr = paddle.ParamAttr(
+ initializer=paddle.nn.initializer.Constant(bn_weight),
+ regularizer=paddle.regularizer.L2Decay(coeff))
+ return param_attr
+
+
+"""Video models."""
+
+
+class BottleneckTransform(paddle.nn.Layer):
+ """
+    Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of
+    the temporal kernel.
+ """
+ def __init__(self,
+ dim_in,
+ dim_out,
+ temp_kernel_size,
+ stride,
+ dim_inner,
+ num_groups,
+ stride_1x1=False,
+ inplace_relu=True,
+ eps=1e-5,
+ dilation=1,
+ norm_module=paddle.nn.BatchNorm3D):
+ """
+ Args:
+ dim_in (int): the channel dimensions of the input.
+ dim_out (int): the channel dimension of the output.
+ temp_kernel_size (int): the temporal kernel sizes of the middle
+ convolution in the bottleneck.
+ stride (int): the stride of the bottleneck.
+ dim_inner (int): the inner dimension of the block.
+ num_groups (int): number of groups for the convolution. num_groups=1
+ is for standard ResNet like networks, and num_groups>1 is for
+ ResNeXt like networks.
+ stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise
+ apply stride to the 3x3 conv.
+ inplace_relu (bool): if True, calculate the relu on the original
+ input without allocating new memory.
+ eps (float): epsilon for batch norm.
+ dilation (int): size of dilation.
+ """
+ super(BottleneckTransform, self).__init__()
+ self.temp_kernel_size = temp_kernel_size
+ self._inplace_relu = inplace_relu
+ self._eps = eps
+ self._stride_1x1 = stride_1x1
+ self.norm_module = norm_module
+ self._construct(dim_in, dim_out, stride, dim_inner, num_groups,
+ dilation)
+
+ def _construct(self, dim_in, dim_out, stride, dim_inner, num_groups,
+ dilation):
+ str1x1, str3x3 = (stride, 1) if self._stride_1x1 else (1, stride)
+
+ fan = (dim_inner) * (self.temp_kernel_size * 1 * 1)
+ initializer_tmp = get_conv_init(fan)
+
+ self.a = paddle.nn.Conv3D(
+ in_channels=dim_in,
+ out_channels=dim_inner,
+ kernel_size=[self.temp_kernel_size, 1, 1],
+ stride=[1, str1x1, str1x1],
+ padding=[int(self.temp_kernel_size // 2), 0, 0],
+ weight_attr=paddle.ParamAttr(initializer=initializer_tmp),
+ bias_attr=False)
+ self.a_bn = self.norm_module(num_features=dim_inner,
+ epsilon=self._eps,
+ weight_attr=get_bn_param_attr(),
+ bias_attr=get_bn_param_attr(bn_weight=0.0))
+
+ # 1x3x3, BN, ReLU.
+ fan = (dim_inner) * (1 * 3 * 3)
+ initializer_tmp = get_conv_init(fan)
+
+ self.b = paddle.nn.Conv3D(
+ in_channels=dim_inner,
+ out_channels=dim_inner,
+ kernel_size=[1, 3, 3],
+ stride=[1, str3x3, str3x3],
+ padding=[0, dilation, dilation],
+ groups=num_groups,
+ dilation=[1, dilation, dilation],
+ weight_attr=paddle.ParamAttr(initializer=initializer_tmp),
+ bias_attr=False)
+ self.b_bn = self.norm_module(num_features=dim_inner,
+ epsilon=self._eps,
+ weight_attr=get_bn_param_attr(),
+ bias_attr=get_bn_param_attr(bn_weight=0.0))
+
+ # 1x1x1, BN.
+ fan = (dim_out) * (1 * 1 * 1)
+ initializer_tmp = get_conv_init(fan)
+
+ self.c = paddle.nn.Conv3D(
+ in_channels=dim_inner,
+ out_channels=dim_out,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ weight_attr=paddle.ParamAttr(initializer=initializer_tmp),
+ bias_attr=False)
+ self.c_bn = self.norm_module(
+ num_features=dim_out,
+ epsilon=self._eps,
+ weight_attr=get_bn_param_attr(bn_weight=0.0),
+ bias_attr=get_bn_param_attr(bn_weight=0.0))
+
+ def forward(self, x):
+ # Branch2a.
+ x = self.a(x)
+ x = self.a_bn(x)
+ x = F.relu(x)
+
+ # Branch2b.
+ x = self.b(x)
+ x = self.b_bn(x)
+ x = F.relu(x)
+
+ # Branch2c
+ x = self.c(x)
+ x = self.c_bn(x)
+ return x
+
+
+class ResBlock(paddle.nn.Layer):
+ """
+ Residual block.
+ """
+ def __init__(self,
+ dim_in,
+ dim_out,
+ temp_kernel_size,
+ stride,
+ dim_inner,
+ num_groups=1,
+ stride_1x1=False,
+ inplace_relu=True,
+ eps=1e-5,
+ dilation=1,
+ norm_module=paddle.nn.BatchNorm3D):
+ """
+        ResBlock class constructs residual blocks. More details can be found in:
+ Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun.
+ "Deep residual learning for image recognition."
+ https://arxiv.org/abs/1512.03385
+ Args:
+ dim_in (int): the channel dimensions of the input.
+ dim_out (int): the channel dimension of the output.
+ temp_kernel_size (int): the temporal kernel sizes of the middle
+ convolution in the bottleneck.
+ stride (int): the stride of the bottleneck.
+ dim_inner (int): the inner dimension of the block.
+ num_groups (int): number of groups for the convolution. num_groups=1
+ is for standard ResNet like networks, and num_groups>1 is for
+ ResNeXt like networks.
+ stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise
+ apply stride to the 3x3 conv.
+ inplace_relu (bool): calculate the relu on the original input
+ without allocating new memory.
+ eps (float): epsilon for batch norm.
+ dilation (int): size of dilation.
+ """
+ super(ResBlock, self).__init__()
+ self._inplace_relu = inplace_relu
+ self._eps = eps
+ self.norm_module = norm_module
+ self._construct(
+ dim_in,
+ dim_out,
+ temp_kernel_size,
+ stride,
+ dim_inner,
+ num_groups,
+ stride_1x1,
+ inplace_relu,
+ dilation,
+ )
+
+ def _construct(
+ self,
+ dim_in,
+ dim_out,
+ temp_kernel_size,
+ stride,
+ dim_inner,
+ num_groups,
+ stride_1x1,
+ inplace_relu,
+ dilation,
+ ):
+ # Use skip connection with projection if dim or res change.
+ if (dim_in != dim_out) or (stride != 1):
+ fan = (dim_out) * (1 * 1 * 1)
+ initializer_tmp = get_conv_init(fan)
+ self.branch1 = paddle.nn.Conv3D(
+ in_channels=dim_in,
+ out_channels=dim_out,
+ kernel_size=1,
+ stride=[1, stride, stride],
+ padding=0,
+ weight_attr=paddle.ParamAttr(initializer=initializer_tmp),
+ bias_attr=False,
+ dilation=1)
+ self.branch1_bn = self.norm_module(
+ num_features=dim_out,
+ epsilon=self._eps,
+ weight_attr=get_bn_param_attr(),
+ bias_attr=get_bn_param_attr(bn_weight=0.0))
+
+ self.branch2 = BottleneckTransform(dim_in,
+ dim_out,
+ temp_kernel_size,
+ stride,
+ dim_inner,
+ num_groups,
+ stride_1x1=stride_1x1,
+ inplace_relu=inplace_relu,
+ dilation=dilation,
+ norm_module=self.norm_module)
+
+ def forward(self, x):
+ if hasattr(self, "branch1"):
+ x1 = self.branch1(x)
+ x1 = self.branch1_bn(x1)
+ x2 = self.branch2(x)
+ x = paddle.add(x=x1, y=x2)
+ else:
+ x2 = self.branch2(x)
+ x = paddle.add(x=x, y=x2)
+
+ x = F.relu(x)
+ return x
+
+
+class ResStage(paddle.nn.Layer):
+ """
+ Stage of 3D ResNet. It expects to have one or more tensors as input for
+ multi-pathway (SlowFast) cases. More details can be found here:
+
+ Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.
+ "Slowfast networks for video recognition."
+ https://arxiv.org/pdf/1812.03982.pdf
+ """
+ def __init__(self,
+ dim_in,
+ dim_out,
+ stride,
+ temp_kernel_sizes,
+ num_blocks,
+ dim_inner,
+ num_groups,
+ num_block_temp_kernel,
+ dilation,
+ stride_1x1=False,
+ inplace_relu=True,
+ norm_module=paddle.nn.BatchNorm3D):
+ """
+ The `__init__` method of any subclass should also contain these arguments.
+        ResStage builds p streams, where p can be greater than or equal to one.
+ Args:
+            dim_in (list): list of the p input channel dimensions, one per
+                pathway.
+            dim_out (list): list of the p output channel dimensions, one per
+                pathway.
+            temp_kernel_sizes (list): list of the p temporal kernel sizes of the
+                convolution in the bottleneck, one per pathway.
+            stride (list): list of the p strides of the bottleneck, one per
+                pathway.
+            num_blocks (list): list of the p numbers of blocks, one per pathway.
+            dim_inner (list): list of the p inner channel dimensions, one per
+                pathway.
+            num_groups (list): list of the p numbers of groups for the
+                convolution. num_groups=1 is for standard ResNet-like networks,
+                and num_groups>1 is for ResNeXt-like networks.
+            num_block_temp_kernel (list): extend temp_kernel_sizes to the first
+                num_block_temp_kernel blocks, then fill a temporal kernel size
+                of 1 for the remaining blocks.
+            dilation (list): size of dilation for each pathway.
+ """
+ super(ResStage, self).__init__()
+ assert all((num_block_temp_kernel[i] <= num_blocks[i]
+ for i in range(len(temp_kernel_sizes))))
+ self.num_blocks = num_blocks
+ self.temp_kernel_sizes = [
+ (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]] +
+ [1] * (num_blocks[i] - num_block_temp_kernel[i])
+ for i in range(len(temp_kernel_sizes))
+ ]
+ assert (len({
+ len(dim_in),
+ len(dim_out),
+ len(temp_kernel_sizes),
+ len(stride),
+ len(num_blocks),
+ len(dim_inner),
+ len(num_groups),
+ len(num_block_temp_kernel),
+ }) == 1)
+ self.num_pathways = len(self.num_blocks)
+ self.norm_module = norm_module
+ self._construct(
+ dim_in,
+ dim_out,
+ stride,
+ dim_inner,
+ num_groups,
+ stride_1x1,
+ inplace_relu,
+ dilation,
+ )
+
+ def _construct(
+ self,
+ dim_in,
+ dim_out,
+ stride,
+ dim_inner,
+ num_groups,
+ stride_1x1,
+ inplace_relu,
+ dilation,
+ ):
+
+ for pathway in range(self.num_pathways):
+ for i in range(self.num_blocks[pathway]):
+ res_block = ResBlock(
+ dim_in[pathway] if i == 0 else dim_out[pathway],
+ dim_out[pathway],
+ self.temp_kernel_sizes[pathway][i],
+ stride[pathway] if i == 0 else 1,
+ dim_inner[pathway],
+ num_groups[pathway],
+ stride_1x1=stride_1x1,
+ inplace_relu=inplace_relu,
+ dilation=dilation[pathway],
+ norm_module=self.norm_module)
+ self.add_sublayer("pathway{}_res{}".format(pathway, i),
+ res_block)
+
+ def forward(self, inputs):
+ output = []
+ for pathway in range(self.num_pathways):
+ x = inputs[pathway]
+
+ for i in range(self.num_blocks[pathway]):
+ m = getattr(self, "pathway{}_res{}".format(pathway, i))
+ x = m(x)
+ output.append(x)
+
+ return output
+
+
+class ResNetBasicStem(paddle.nn.Layer):
+ """
+ ResNe(X)t 3D stem module.
+    Performs a spatiotemporal convolution, BN, and ReLU followed by
+    spatiotemporal pooling.
+ """
+ def __init__(self,
+ dim_in,
+ dim_out,
+ kernel,
+ stride,
+ padding,
+ eps=1e-5,
+ norm_module=paddle.nn.BatchNorm3D):
+ super(ResNetBasicStem, self).__init__()
+ self.kernel = kernel
+ self.stride = stride
+ self.padding = padding
+ self.eps = eps
+ self.norm_module = norm_module
+ self._construct_stem(dim_in, dim_out)
+
+ def _construct_stem(self, dim_in, dim_out):
+ fan = (dim_out) * (self.kernel[0] * self.kernel[1] * self.kernel[2])
+ initializer_tmp = get_conv_init(fan)
+
+ self._conv = paddle.nn.Conv3D(
+ in_channels=dim_in,
+ out_channels=dim_out,
+ kernel_size=self.kernel,
+ stride=self.stride,
+ padding=self.padding,
+ weight_attr=paddle.ParamAttr(initializer=initializer_tmp),
+ bias_attr=False)
+ self._bn = self.norm_module(num_features=dim_out,
+ epsilon=self.eps,
+ weight_attr=get_bn_param_attr(),
+ bias_attr=get_bn_param_attr(bn_weight=0.0))
+
+ def forward(self, x):
+ x = self._conv(x)
+ x = self._bn(x)
+ x = F.relu(x)
+
+ x = F.max_pool3d(x=x,
+ kernel_size=[1, 3, 3],
+ stride=[1, 2, 2],
+ padding=[0, 1, 1],
+ data_format="NCDHW")
+ return x
+
+
+class VideoModelStem(paddle.nn.Layer):
+ """
+ Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool
+ on input data tensor for slow and fast pathways.
+ """
+ def __init__(self,
+ dim_in,
+ dim_out,
+ kernel,
+ stride,
+ padding,
+ eps=1e-5,
+ norm_module=paddle.nn.BatchNorm3D):
+ """
+ Args:
+ dim_in (list): the list of channel dimensions of the inputs.
+ dim_out (list): the output dimension of the convolution in the stem
+ layer.
+ kernel (list): the kernels' size of the convolutions in the stem
+ layers. Temporal kernel size, height kernel size, width kernel
+ size in order.
+            stride (list): the stride sizes of the convolutions in the stem
+                layer. Temporal stride, height stride, width stride in
+                order.
+ padding (list): the paddings' sizes of the convolutions in the stem
+ layer. Temporal padding size, height padding size, width padding
+ size in order.
+ eps (float): epsilon for batch norm.
+ """
+ super(VideoModelStem, self).__init__()
+
+ assert (len({
+ len(dim_in),
+ len(dim_out),
+ len(kernel),
+ len(stride),
+ len(padding),
+ }) == 1), "Input pathway dimensions are not consistent."
+ self.num_pathways = len(dim_in)
+ self.kernel = kernel
+ self.stride = stride
+ self.padding = padding
+ self.eps = eps
+ self.norm_module = norm_module
+ self._construct_stem(dim_in, dim_out)
+
+ def _construct_stem(self, dim_in, dim_out):
+ for pathway in range(len(dim_in)):
+ stem = ResNetBasicStem(dim_in[pathway], dim_out[pathway],
+ self.kernel[pathway], self.stride[pathway],
+ self.padding[pathway], self.eps,
+ self.norm_module)
+ self.add_sublayer("pathway{}_stem".format(pathway), stem)
+
+ def forward(self, x):
+ assert (len(x) == self.num_pathways
+ ), "Input tensor does not contain {} pathway".format(
+ self.num_pathways)
+
+ for pathway in range(len(x)):
+ m = getattr(self, "pathway{}_stem".format(pathway))
+ x[pathway] = m(x[pathway])
+
+ return x
+
+
+class FuseFastToSlow(paddle.nn.Layer):
+ """
+ Fuses the information from the Fast pathway to the Slow pathway. Given the
+ tensors from Slow pathway and Fast pathway, fuse information from Fast to
+ Slow, then return the fused tensors from Slow and Fast pathway in order.
+ """
+ def __init__(self,
+ dim_in,
+ fusion_conv_channel_ratio,
+ fusion_kernel,
+ alpha,
+ fuse_bn_relu=1,
+ eps=1e-5,
+ norm_module=paddle.nn.BatchNorm3D):
+ """
+ Args:
+ dim_in (int): the channel dimension of the input.
+ fusion_conv_channel_ratio (int): channel ratio for the convolution
+ used to fuse from Fast pathway to Slow pathway.
+ fusion_kernel (int): kernel size of the convolution used to fuse
+ from Fast pathway to Slow pathway.
+ alpha (int): the frame rate ratio between the Fast and Slow pathway.
+ eps (float): epsilon for batch norm.
+ """
+ super(FuseFastToSlow, self).__init__()
+ self.fuse_bn_relu = fuse_bn_relu
+ fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1)
+ initializer_tmp = get_conv_init(fan)
+
+ self._conv_f2s = paddle.nn.Conv3D(
+ in_channels=dim_in,
+ out_channels=dim_in * fusion_conv_channel_ratio,
+ kernel_size=[fusion_kernel, 1, 1],
+ stride=[alpha, 1, 1],
+ padding=[fusion_kernel // 2, 0, 0],
+ weight_attr=paddle.ParamAttr(initializer=initializer_tmp),
+ bias_attr=False)
+ self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio,
+ epsilon=eps,
+ weight_attr=get_bn_param_attr(),
+ bias_attr=get_bn_param_attr(bn_weight=0.0))
+
+ def forward(self, x):
+ x_s = x[0]
+ x_f = x[1]
+ fuse = self._conv_f2s(x_f)
+        # TODO: For AVA, set fuse_bn_relu=1 and check whether mAP improves.
+ if self.fuse_bn_relu:
+ fuse = self._bn(fuse)
+ fuse = F.relu(fuse)
+ x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None)
+
+ return [x_s_fuse, x_f]
+
+
+@BACKBONES.register()
+class ResNetSlowFast_MRI(paddle.nn.Layer):
+ """
+ SlowFast model builder for SlowFast network.
+
+ Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.
+ "Slowfast networks for video recognition."
+ https://arxiv.org/pdf/1812.03982.pdf
+ """
+ def __init__(
+ self,
+ alpha,
+ beta,
+ bn_norm_type="batchnorm",
+ bn_num_splits=1,
+ num_pathways=2,
+ depth=50,
+ num_groups=1,
+ input_channel_num=[1, 1],
+ width_per_group=64,
+ fusion_conv_channel_ratio=2,
+        fusion_kernel_sz=7,  # 7 for the 8x8 setting; the old implementation used 5
+ pool_size_ratio=[[1, 1, 1], [1, 1, 1]],
+ fuse_bn_relu=1,
+ spatial_strides=[[1, 1], [2, 2], [2, 2], [2, 2]],
+ use_pool_af_s2=1,
+ ):
+ """
+        Args:
+            alpha (int): frame rate ratio between the Fast and Slow pathways.
+            beta (int): channel reduction ratio of the Fast pathway relative
+                to the Slow pathway.
+            The remaining arguments configure normalization, network depth,
+                grouped convolutions, pathway fusion and pooling; this MRI
+                variant defaults to single-channel input (input_channel_num=[1, 1]).
+ """
+ super(ResNetSlowFast_MRI, self).__init__()
+
+        self.alpha = alpha  # frame rate ratio between Fast and Slow pathways, e.g. 8
+        self.beta = beta  # channel reduction ratio of the Fast pathway, e.g. 8
+ self.norm_module = get_norm(bn_norm_type, bn_num_splits)
+ self.num_pathways = num_pathways
+ self.depth = depth
+ self.num_groups = num_groups
+ self.input_channel_num = input_channel_num
+ self.width_per_group = width_per_group
+ self.fusion_conv_channel_ratio = fusion_conv_channel_ratio
+        self.fusion_kernel_sz = fusion_kernel_sz  # NOTE: 7 for the 8x8 setting; the old implementation used 5
+ self.pool_size_ratio = pool_size_ratio
+ self.fuse_bn_relu = fuse_bn_relu
+ self.spatial_strides = spatial_strides
+ self.use_pool_af_s2 = use_pool_af_s2
+ self._construct_network()
+
+ def _construct_network(self):
+ """
+ Builds a SlowFast model.
+ The first pathway is the Slow pathway
+ and the second pathway is the Fast pathway.
+ """
+ temp_kernel = [
+ [[1], [5]], # conv1 temporal kernel for slow and fast pathway.
+ [[1], [3]], # res2 temporal kernel for slow and fast pathway.
+ [[1], [3]], # res3 temporal kernel for slow and fast pathway.
+ [[3], [3]], # res4 temporal kernel for slow and fast pathway.
+ [[3], [3]],
+ ] # res5 temporal kernel for slow and fast pathway.
+
+ self.s1 = VideoModelStem(
+ dim_in=self.input_channel_num,
+ dim_out=[self.width_per_group, self.width_per_group // self.beta],
+ kernel=[temp_kernel[0][0] + [7, 7], temp_kernel[0][1] + [7, 7]],
+ stride=[[1, 2, 2]] * 2,
+ padding=[
+ [temp_kernel[0][0][0] // 2, 3, 3],
+ [temp_kernel[0][1][0] // 2, 3, 3],
+ ],
+ norm_module=self.norm_module)
+ self.s1_fuse = FuseFastToSlow(
+ dim_in=self.width_per_group // self.beta,
+ fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,
+ fusion_kernel=self.fusion_kernel_sz,
+ alpha=self.alpha,
+ norm_module=self.norm_module,
+ fuse_bn_relu=self.fuse_bn_relu)
+
+ # ResNet backbone
+ MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3)}
+ (d2, d3, d4, d5) = MODEL_STAGE_DEPTH[self.depth]
+
+ num_block_temp_kernel = [[3, 3], [4, 4], [6, 6], [3, 3]]
+ spatial_dilations = [[1, 1], [1, 1], [1, 1], [1, 1]]
+ spatial_strides = self.spatial_strides
+ #spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]]
+        #spatial_strides = [[1, 1], [2, 2], [2, 2], [1, 1]]  # TODO: check which value matches FAIR's implementation
+
+ out_dim_ratio = self.beta // self.fusion_conv_channel_ratio #4
+ dim_inner = self.width_per_group * self.num_groups #64
+
+ self.s2 = ResStage(dim_in=[
+ self.width_per_group + self.width_per_group // out_dim_ratio,
+ self.width_per_group // self.beta,
+ ],
+ dim_out=[
+ self.width_per_group * 4,
+ self.width_per_group * 4 // self.beta,
+ ],
+ dim_inner=[dim_inner, dim_inner // self.beta],
+ temp_kernel_sizes=temp_kernel[1],
+ stride=spatial_strides[0],
+ num_blocks=[d2] * 2,
+ num_groups=[self.num_groups] * 2,
+ num_block_temp_kernel=num_block_temp_kernel[0],
+ dilation=spatial_dilations[0],
+ norm_module=self.norm_module)
+
+ self.s2_fuse = FuseFastToSlow(
+ dim_in=self.width_per_group * 4 // self.beta,
+ fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,
+ fusion_kernel=self.fusion_kernel_sz,
+ alpha=self.alpha,
+ norm_module=self.norm_module,
+ fuse_bn_relu=self.fuse_bn_relu,
+ )
+
+ self.s3 = ResStage(
+ dim_in=[
+ self.width_per_group * 4 +
+ self.width_per_group * 4 // out_dim_ratio,
+ self.width_per_group * 4 // self.beta,
+ ],
+ dim_out=[
+ self.width_per_group * 8,
+ self.width_per_group * 8 // self.beta,
+ ],
+ dim_inner=[dim_inner * 2, dim_inner * 2 // self.beta],
+ temp_kernel_sizes=temp_kernel[2],
+ stride=spatial_strides[1],
+ num_blocks=[d3] * 2,
+ num_groups=[self.num_groups] * 2,
+ num_block_temp_kernel=num_block_temp_kernel[1],
+ dilation=spatial_dilations[1],
+ norm_module=self.norm_module,
+ )
+
+ self.s3_fuse = FuseFastToSlow(
+ dim_in=self.width_per_group * 8 // self.beta,
+ fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,
+ fusion_kernel=self.fusion_kernel_sz,
+ alpha=self.alpha,
+ norm_module=self.norm_module,
+ fuse_bn_relu=self.fuse_bn_relu,
+ )
+
+ self.s4 = ResStage(
+ dim_in=[
+ self.width_per_group * 8 +
+ self.width_per_group * 8 // out_dim_ratio,
+ self.width_per_group * 8 // self.beta,
+ ],
+ dim_out=[
+ self.width_per_group * 16,
+ self.width_per_group * 16 // self.beta,
+ ],
+ dim_inner=[dim_inner * 4, dim_inner * 4 // self.beta],
+ temp_kernel_sizes=temp_kernel[3],
+ stride=spatial_strides[2],
+ num_blocks=[d4] * 2,
+ num_groups=[self.num_groups] * 2,
+ num_block_temp_kernel=num_block_temp_kernel[2],
+ dilation=spatial_dilations[2],
+ norm_module=self.norm_module,
+ )
+
+ self.s4_fuse = FuseFastToSlow(
+ dim_in=self.width_per_group * 16 // self.beta,
+ fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,
+ fusion_kernel=self.fusion_kernel_sz,
+ alpha=self.alpha,
+ norm_module=self.norm_module,
+ fuse_bn_relu=self.fuse_bn_relu,
+ )
+
+ self.s5 = ResStage(
+ dim_in=[
+ self.width_per_group * 16 +
+ self.width_per_group * 16 // out_dim_ratio,
+ self.width_per_group * 16 // self.beta,
+ ],
+ dim_out=[
+ self.width_per_group * 32,
+ self.width_per_group * 32 // self.beta,
+ ],
+ dim_inner=[dim_inner * 8, dim_inner * 8 // self.beta],
+ temp_kernel_sizes=temp_kernel[4],
+ stride=spatial_strides[3],
+ num_blocks=[d5] * 2,
+ num_groups=[self.num_groups] * 2,
+ num_block_temp_kernel=num_block_temp_kernel[3],
+ dilation=spatial_dilations[3],
+ norm_module=self.norm_module,
+ )
+
+ def init_weights(self):
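+        # Parameters are already initialized by the per-layer weight_attr /
+        # bias_attr initializers above, so nothing else is needed here.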
+ pass
+
+ def forward(self, x):
+ x = self.s1(x) #VideoModelStem
+ x = self.s1_fuse(x) #FuseFastToSlow
+ x = self.s2(x) #ResStage
+ x = self.s2_fuse(x)
+
+        # TODO: For AVA, set use_pool_af_s2=1 and check whether mAP improves.
+ if self.use_pool_af_s2:
+ for pathway in range(self.num_pathways):
+ x[pathway] = F.max_pool3d(
+ x=x[pathway],
+ kernel_size=self.pool_size_ratio[pathway],
+ stride=self.pool_size_ratio[pathway],
+ padding=[0, 0, 0],
+ data_format="NCDHW")
+
+ x = self.s3(x)
+ x = self.s3_fuse(x)
+ x = self.s4(x)
+ x = self.s4_fuse(x)
+ x = self.s5(x)
+ return x
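A small arithmetic sketch (not part of the patch) of the fuse-and-concat channel bookkeeping used by the stages above, for the default configuration:

    width_per_group = 64
    beta = 8                              # Fast pathway carries 1/beta of the Slow channels
    fusion_conv_channel_ratio = 2
    out_dim_ratio = beta // fusion_conv_channel_ratio      # 4

    fast_stem = width_per_group // beta                    # 8 channels out of the Fast stem
    fused = fast_stem * fusion_conv_channel_ratio          # 16 channels appended to the Slow pathway
    slow_after_s1_fuse = width_per_group + fused           # 64 + 16 = 80

    # This matches the Slow dim_in that s2 declares:
    assert slow_after_s1_fuse == width_per_group + width_per_group // out_dim_ratio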
diff --git a/paddlevideo/modeling/backbones/resnet_tsm.py b/paddlevideo/modeling/backbones/resnet_tsm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fa5093e8021076153ed288a4a9e537911c8738a
--- /dev/null
+++ b/paddlevideo/modeling/backbones/resnet_tsm.py
@@ -0,0 +1,340 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,
+ AvgPool2D)
+from paddle import ParamAttr
+import paddle.nn.functional as F
+from paddle.regularizer import L2Decay
+from ..registry import BACKBONES
+from ..weight_init import weight_init_
+from ...utils import load_ckpt
+
+
+class ConvBNLayer(nn.Layer):
+ """Conv2D and BatchNorm2D layer.
+
+ Args:
+ in_channels (int): Number of channels for the input.
+ out_channels (int): Number of channels for the output.
+ kernel_size (int): Kernel size.
+ stride (int): Stride in the Conv2D layer. Default: 1.
+ groups (int): Groups in the Conv2D, Default: 1.
+        act (str): activation applied after the BatchNorm2D layer.
+        name (str): name of this ConvBNLayer instance.
+        Note: weight and bias initialization covers both the initial values and the names used to restore parameters; the value initialization is declared explicitly in the ```init_weights``` method.
+
+ """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ groups=1,
+ act=None,
+ name=None,
+ data_format="NCHW"):
+ super(ConvBNLayer, self).__init__()
+ self._conv = Conv2D(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=(kernel_size - 1) // 2,
+ groups=groups,
+ weight_attr=ParamAttr(name=name + "_weights"),
+ bias_attr=False,
+ data_format=data_format)
+ if name == "conv1":
+ bn_name = "bn_" + name
+ else:
+ bn_name = "bn" + name[3:]
+
+ self._act = act
+
+ self._batch_norm = BatchNorm2D(
+ out_channels,
+ weight_attr=ParamAttr(name=bn_name + "_scale",
+ regularizer=L2Decay(0.0)),
+ bias_attr=ParamAttr(name=bn_name + "_offset",
+ regularizer=L2Decay(0.0)),
+ data_format=data_format)
+
+ def forward(self, inputs):
+ y = self._conv(inputs)
+ y = self._batch_norm(y)
+ if self._act:
+ y = getattr(paddle.nn.functional, self._act)(y)
+ return y
+
+
+class BottleneckBlock(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride,
+ shortcut=True,
+ num_seg=8,
+ name=None,
+ data_format="NCHW"):
+ super(BottleneckBlock, self).__init__()
+ self.data_format = data_format
+ self.conv0 = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ act="relu",
+ name=name + "_branch2a",
+ data_format=data_format)
+ self.conv1 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=3,
+ stride=stride,
+ act="relu",
+ name=name + "_branch2b",
+ data_format=data_format)
+
+ self.conv2 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels * 4,
+ kernel_size=1,
+ act=None,
+ name=name + "_branch2c",
+ data_format=data_format)
+
+ if not shortcut:
+ self.short = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels * 4,
+ kernel_size=1,
+ stride=stride,
+ name=name + "_branch1",
+ data_format=data_format)
+
+ self.shortcut = shortcut
+ self.num_seg = num_seg
+
+ def forward(self, inputs):
+ if paddle.fluid.core.is_compiled_with_npu():
+ x = inputs
+ seg_num = self.num_seg
+ shift_ratio = 1.0 / self.num_seg
+
+ shape = x.shape #[N*T, C, H, W]
+ reshape_x = x.reshape((-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W]
+ pad_x = paddle.fluid.layers.pad(reshape_x, [0,0,1,1,0,0,0,0,0,0,]) #[N, T+2, C, H, W]
+ c1 = int(shape[1] * shift_ratio)
+ c2 = int(shape[1] * 2 * shift_ratio)
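+            # TSM shift: channels [0, c1) take their features from the previous frame,
+            # channels [c1, c2) from the next frame, and the remaining channels stay unshifted.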
+ slice1 = pad_x[:, :seg_num, :c1, :, :]
+ slice2 = pad_x[:, 2:seg_num+2, c1:c2, :, :]
+ slice3 = pad_x[:, 1:seg_num+1, c2:, :, :]
+ concat_x = paddle.concat([slice1, slice2, slice3], axis=2) #[N, T, C, H, W]
+ shifts = concat_x.reshape(shape)
+ else:
+ shifts = F.temporal_shift(inputs,
+ self.num_seg,
+ 1.0 / self.num_seg,
+ data_format=self.data_format)
+
+ y = self.conv0(shifts)
+ conv1 = self.conv1(y)
+ conv2 = self.conv2(conv1)
+ if self.shortcut:
+ short = inputs
+ else:
+ short = self.short(inputs)
+ y = paddle.add(x=short, y=conv2)
+ return F.relu(y)
+
+
+class BasicBlock(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride,
+ shortcut=True,
+ name=None,
+ data_format="NCHW"):
+ super(BasicBlock, self).__init__()
+ self.stride = stride
+ self.conv0 = ConvBNLayer(
+ in_channels=in_channels,
+ out_channels=out_channels,
+            kernel_size=3,
+ stride=stride,
+ act="relu",
+ name=name + "_branch2a",
+ data_format=data_format,
+ )
+ self.conv1 = ConvBNLayer(
+ in_channels=out_channels,
+ out_channels=out_channels,
+            kernel_size=3,
+ act=None,
+ name=name + "_branch2b",
+ data_format=data_format,
+ )
+
+ if not shortcut:
+ self.short = ConvBNLayer(
+ in_channels=in_channels,
+ out_channels=out_channels,
+                kernel_size=1,
+ stride=stride,
+ name=name + "_branch1",
+ data_format=data_format,
+ )
+
+ self.shortcut = shortcut
+
+ def forward(self, inputs):
+ y = self.conv0(inputs)
+ conv1 = self.conv1(y)
+
+ if self.shortcut:
+ short = inputs
+ else:
+ short = self.short(inputs)
+ y = paddle.add(short, conv1)
+ y = F.relu(y)
+ return y
+
+
+@BACKBONES.register()
+class ResNetTSM(nn.Layer):
+ """ResNet TSM backbone.
+
+ Args:
+ depth (int): Depth of resnet model.
+ pretrained (str): pretrained model. Default: None.
+ """
+ def __init__(self, depth, num_seg=8, data_format="NCHW", pretrained=None):
+ super(ResNetTSM, self).__init__()
+ self.pretrained = pretrained
+ self.layers = depth
+ self.num_seg = num_seg
+ self.data_format = data_format
+
+ supported_layers = [18, 34, 50, 101, 152]
+ assert self.layers in supported_layers, \
+ "supported layers are {} but input layer is {}".format(
+ supported_layers, self.layers)
+
+ if self.layers == 18:
+ depth = [2, 2, 2, 2]
+ elif self.layers == 34 or self.layers == 50:
+ depth = [3, 4, 6, 3]
+ elif self.layers == 101:
+ depth = [3, 4, 23, 3]
+ elif self.layers == 152:
+ depth = [3, 8, 36, 3]
+
+ in_channels = 64
+ out_channels = [64, 128, 256, 512]
+
+ self.conv = ConvBNLayer(in_channels=3,
+ out_channels=64,
+ kernel_size=7,
+ stride=2,
+ act="relu",
+ name="conv1",
+ data_format=self.data_format)
+ self.pool2D_max = MaxPool2D(
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ data_format=self.data_format,
+ )
+
+ self.block_list = []
+ if self.layers >= 50:
+ for block in range(len(depth)):
+ shortcut = False
+ for i in range(depth[block]):
+ if self.layers in [101, 152] and block == 2:
+ if i == 0:
+ conv_name = "res" + str(block + 2) + "a"
+ else:
+ conv_name = "res" + str(block + 2) + "b" + str(i)
+ else:
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ bottleneck_block = self.add_sublayer(
+ conv_name,
+ BottleneckBlock(
+ in_channels=in_channels
+ if i == 0 else out_channels[block] * 4,
+ out_channels=out_channels[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ num_seg=self.num_seg,
+ shortcut=shortcut,
+ name=conv_name,
+ data_format=self.data_format))
+ in_channels = out_channels[block] * 4
+ self.block_list.append(bottleneck_block)
+ shortcut = True
+ else:
+ for block in range(len(depth)):
+ shortcut = False
+ for i in range(depth[block]):
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ basic_block = self.add_sublayer(
+ conv_name,
+ BasicBlock(
+                            in_channels=in_channels
+                            if i == 0 else out_channels[block],
+ out_channels=out_channels[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ shortcut=shortcut,
+ name=conv_name,
+ data_format=self.data_format,
+                        ))
+                    # BasicBlock has no channel expansion
+                    in_channels = out_channels[block]
+                    self.block_list.append(basic_block)
+ shortcut = True
+
+ def init_weights(self):
+ """Initiate the parameters.
+ Note:
+ 1. when indicate pretrained loading path, will load it to initiate backbone.
+ 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.
+ Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html
+ """
+ #XXX: check bias!!! check pretrained!!!
+
+ if isinstance(self.pretrained, str) and self.pretrained.strip() != "":
+ load_ckpt(self, self.pretrained)
+ elif self.pretrained is None or self.pretrained.strip() == "":
+ for layer in self.sublayers():
+ if isinstance(layer, nn.Conv2D):
+ #XXX: no bias
+ weight_init_(layer, 'KaimingNormal')
+ elif isinstance(layer, nn.BatchNorm2D):
+ weight_init_(layer, 'Constant', value=1)
+
+ def forward(self, inputs):
+ """Define how the backbone is going to run.
+
+ """
+        # NOTE: (deprecated design) the batch axis 0 and clip axis 1 are already merged
+        # before the feature-extraction phase; see paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27
+ #y = paddle.reshape(
+ # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])
+
+        # NOTE: paddle's to_static conversion needs a "pure" model to trace, i.e. everything
+        # from the dataloader output (images, label) up to the last layer, which is usually an FC layer.
+
+ y = self.conv(inputs)
+ y = self.pool2D_max(y)
+ for block in self.block_list:
+ y = block(y)
+ return y
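For reference, a NumPy sketch (not part of the patch) of the pad-and-slice temporal shift implemented in the NPU branch of BottleneckBlock.forward above; the function name is made up for illustration:

    import numpy as np

    def temporal_shift_np(x, seg_num, shift_ratio):
        n_t, c, h, w = x.shape
        x = x.reshape(-1, seg_num, c, h, w)                        # [N, T, C, H, W]
        pad = np.zeros((x.shape[0], 1, c, h, w), dtype=x.dtype)
        pad_x = np.concatenate([pad, x, pad], axis=1)              # [N, T+2, C, H, W]
        c1 = int(c * shift_ratio)
        c2 = int(c * 2 * shift_ratio)
        slice1 = pad_x[:, :seg_num, :c1]         # fold 1: features from the previous frame
        slice2 = pad_x[:, 2:seg_num + 2, c1:c2]  # fold 2: features from the next frame
        slice3 = pad_x[:, 1:seg_num + 1, c2:]    # remaining channels stay unshifted
        return np.concatenate([slice1, slice2, slice3], axis=2).reshape(n_t, c, h, w)

    x = np.random.rand(2 * 8, 16, 4, 4).astype("float32")          # N=2, T=8
    y = temporal_shift_np(x, seg_num=8, shift_ratio=1.0 / 8)
    assert y.shape == x.shape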
diff --git a/paddlevideo/modeling/backbones/resnet_tsm_MRI.py b/paddlevideo/modeling/backbones/resnet_tsm_MRI.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae4fdc70f99ddeee44adaeb06e8b9dbccff0768e
--- /dev/null
+++ b/paddlevideo/modeling/backbones/resnet_tsm_MRI.py
@@ -0,0 +1,328 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import math
+
+import sys
+import paddle
+import paddle.nn as nn
+from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,
+ AvgPool2D)
+from paddle import ParamAttr
+import paddle.nn.functional as F
+
+from ..registry import BACKBONES
+from ..weight_init import weight_init_
+from ...utils.save_load import load_ckpt
+from paddle.regularizer import L2Decay
+
+
+class ConvBNLayer(nn.Layer):
+ """Conv2D and BatchNorm2D layer.
+
+ Args:
+ in_channels (int): Number of channels for the input.
+ out_channels (int): Number of channels for the output.
+ kernel_size (int): Kernel size.
+ stride (int): Stride in the Conv2D layer. Default: 1.
+ groups (int): Groups in the Conv2D, Default: 1.
+ is_tweaks_mode (bool): switch for tweaks. Default: False.
+        act (str): activation applied after the BatchNorm2D layer.
+        name (str): name of this ConvBNLayer instance.
+
+        Note: weight and bias initialization covers both the initial values and the names used to restore parameters; the value initialization is declared explicitly in the ```init_weights``` method.
+
+ """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ groups=1,
+ is_tweaks_mode=False,
+ act=None,
+ name=None):
+ super(ConvBNLayer, self).__init__()
+ self.is_tweaks_mode = is_tweaks_mode
+        # ResNet-D (part 1/2): add a 2x2 average pooling layer with stride 2 before the
+        # convolution and change the convolution's stride to 1; this works well in practice.
+ self._pool2d_avg = AvgPool2D(kernel_size=2,
+ stride=2,
+ padding=0,
+ ceil_mode=True)
+
+ self._conv = Conv2D(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=(kernel_size - 1) // 2,
+ groups=groups,
+ weight_attr=ParamAttr(name=name + "_weights"),
+ bias_attr=False)
+ if name == "conv1":
+ bn_name = "bn_" + name
+ else:
+ bn_name = "bn" + name[3:]
+
+ self._act = act
+
+ self._batch_norm = BatchNorm2D(
+ out_channels,
+ weight_attr=ParamAttr(name=bn_name + "_scale",
+ regularizer=L2Decay(0.0)),
+ bias_attr=ParamAttr(bn_name + "_offset", regularizer=L2Decay(0.0)))
+
+ def forward(self, inputs):
+ if self.is_tweaks_mode:
+ inputs = self._pool2d_avg(inputs)
+ y = self._conv(inputs)
+ y = self._batch_norm(y)
+ if self._act:
+ y = getattr(paddle.nn.functional, self._act)(y)
+ return y
+
+
+class BottleneckBlock(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride,
+ shortcut=True,
+ if_first=False,
+ num_seg=8,
+ name=None):
+ super(BottleneckBlock, self).__init__()
+ self.conv0 = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ act="leaky_relu",
+ name=name + "_branch2a")
+ self.conv1 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=3,
+ stride=stride,
+ act="leaky_relu",
+ name=name + "_branch2b")
+
+ self.conv2 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels * 4,
+ kernel_size=1,
+ act=None,
+ name=name + "_branch2c")
+
+ if not shortcut:
+ self.short = ConvBNLayer(
+ in_channels=in_channels,
+ out_channels=out_channels * 4,
+ kernel_size=1,
+                stride=1,  # ResNet-D (part 2/2): the 1x1 shortcut conv keeps stride 1; the
+                # preceding 2x2 average pooling (is_tweaks_mode) performs the downsampling.
+ is_tweaks_mode=False if if_first else True,
+ name=name + "_branch1")
+
+ self.shortcut = shortcut
+ self.num_seg = num_seg
+
+ def forward(self, inputs):
+ shifts = paddle.fluid.layers.temporal_shift(inputs, self.num_seg,
+ 1.0 / self.num_seg)
+ y = self.conv0(shifts)
+ conv1 = self.conv1(y)
+ conv2 = self.conv2(conv1)
+ if self.shortcut:
+ short = inputs
+ else:
+ short = self.short(inputs)
+ y = paddle.add(x=short, y=conv2)
+ return F.leaky_relu(y)
+
+
+class BasicBlock(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride,
+ shortcut=True,
+ name=None):
+ super(BasicBlock, self).__init__()
+ self.stride = stride
+ self.conv0 = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels,
+                                 kernel_size=3,
+ stride=stride,
+ act="leaky_relu",
+ name=name + "_branch2a")
+ self.conv1 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels,
+                                 kernel_size=3,
+ act=None,
+ name=name + "_branch2b")
+
+ if not shortcut:
+ self.short = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels,
+                                     kernel_size=1,
+ stride=stride,
+ name=name + "_branch1")
+
+ self.shortcut = shortcut
+
+ def forward(self, inputs):
+ y = self.conv0(inputs)
+ conv1 = self.conv1(y)
+
+ if self.shortcut:
+ short = inputs
+ else:
+ short = self.short(inputs)
+ y = paddle.add(short, conv1)
+ y = F.leaky_relu(y)
+ return y
+
+
+@BACKBONES.register()
+class ResNetTSM_MRI(nn.Layer):
+ """ResNet TSM backbone.
+
+ Args:
+ depth (int): Depth of resnet model.
+ pretrained (str): pretrained model. Default: None.
+ """
+ def __init__(self, depth, num_seg=8, pretrained=None, in_channels=1):
+ super(ResNetTSM_MRI, self).__init__()
+ self.pretrained = pretrained
+ self.layers = depth
+ self.num_seg = num_seg
+ self.in_channels = in_channels
+
+ supported_layers = [18, 34, 50, 101, 152]
+ assert self.layers in supported_layers, \
+ "supported layers are {} but input layer is {}".format(
+ supported_layers, self.layers)
+
+ if self.layers == 18:
+ depth = [2, 2, 2, 2]
+ elif self.layers == 34 or self.layers == 50:
+ depth = [3, 4, 6, 3]
+ elif self.layers == 101:
+ depth = [3, 4, 23, 3]
+ elif self.layers == 152:
+ depth = [3, 8, 36, 3]
+
+ in_channels = 64
+ out_channels = [64, 128, 256, 512]
+
+        # ResNet-C: replace the single 7x7 conv with three 3x3 convs
+ self.conv1_1 = ConvBNLayer(in_channels=self.in_channels,
+ out_channels=32,
+ kernel_size=3,
+ stride=2,
+ act='leaky_relu',
+ name="conv1_1")
+ self.conv1_2 = ConvBNLayer(in_channels=32,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ act='leaky_relu',
+ name="conv1_2")
+ self.conv1_3 = ConvBNLayer(in_channels=32,
+ out_channels=64,
+ kernel_size=3,
+ stride=1,
+ act='leaky_relu',
+ name="conv1_3")
+ self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)
+
+ self.block_list = []
+ if self.layers >= 50:
+ for block in range(len(depth)):
+ shortcut = False
+ for i in range(depth[block]):
+ if self.layers in [101, 152] and block == 2:
+ if i == 0:
+ conv_name = "res" + str(block + 2) + "a"
+ else:
+ conv_name = "res" + str(block + 2) + "b" + str(i)
+ else:
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ bottleneck_block = self.add_sublayer(
+ 'bb_%d_%d' %
+ (block, i), #same with PaddleClas, for loading pretrain
+ BottleneckBlock(
+ in_channels=in_channels
+ if i == 0 else out_channels[block] * 4,
+ out_channels=out_channels[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ num_seg=self.num_seg,
+ shortcut=shortcut,
+ if_first=block == i == 0,
+ name=conv_name))
+ in_channels = out_channels[block] * 4
+ self.block_list.append(bottleneck_block)
+ shortcut = True
+ else:
+ for block in range(len(depth)):
+ shortcut = False
+ for i in range(depth[block]):
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ basic_block = self.add_sublayer(
+ conv_name,
+                        BasicBlock(in_channels=in_channels
+                                   if i == 0 else out_channels[block],
+ out_channels=out_channels[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ shortcut=shortcut,
+                                   name=conv_name))
+                    # BasicBlock has no channel expansion
+                    in_channels = out_channels[block]
+                    self.block_list.append(basic_block)
+ shortcut = True
+
+ def init_weights(self):
+ """Initiate the parameters.
+ Note:
+ 1. when indicate pretrained loading path, will load it to initiate backbone.
+ 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.
+ Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html
+ """
+ #XXX: check bias!!! check pretrained!!!
+
+ if isinstance(self.pretrained, str) and self.pretrained.strip() != "":
+ load_ckpt(self, self.pretrained)
+ elif self.pretrained is None or self.pretrained.strip() == "":
+ for layer in self.sublayers():
+ if isinstance(layer, nn.Conv2D):
+ #XXX: no bias
+ weight_init_(layer, 'KaimingNormal')
+ elif isinstance(layer, nn.BatchNorm2D):
+ weight_init_(layer, 'Constant', value=1)
+
+ def forward(self, inputs):
+ """Define how the backbone is going to run.
+
+ """
+        # NOTE: the batch axis 0 and clip axis 1 are already merged before the
+        # feature-extraction phase; see paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27
+ #y = paddle.reshape(
+ # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])
+
+        # ResNet-C: replace the single 7x7 conv with three 3x3 convs
+ y = self.conv1_1(inputs)
+ y = self.conv1_2(y)
+ y = self.conv1_3(y)
+
+ y = self.pool2D_max(y)
+ for block in self.block_list:
+ y = block(y)
+ return y
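A minimal usage sketch (not part of the patch; the import path and the quoted output shape are assumptions, and the forward pass relies on a Paddle build that still exposes paddle.fluid.layers.temporal_shift):

    import paddle
    from paddlevideo.modeling.backbones.resnet_tsm_MRI import ResNetTSM_MRI  # assumed path

    backbone = ResNetTSM_MRI(depth=50, num_seg=8, in_channels=1)
    backbone.init_weights()
    # N clips x num_seg single-channel frames are flattened into the batch axis
    # before the backbone, as in recognizer2d.
    frames = paddle.randn([2 * 8, 1, 224, 224])
    feat = backbone(frames)   # expected shape for depth=50: [16, 2048, 7, 7]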
diff --git a/paddlevideo/modeling/backbones/resnet_tsn_MRI.py b/paddlevideo/modeling/backbones/resnet_tsn_MRI.py
new file mode 100644
index 0000000000000000000000000000000000000000..439a0eff84a36dafb46f68fd529b183e7b7760be
--- /dev/null
+++ b/paddlevideo/modeling/backbones/resnet_tsn_MRI.py
@@ -0,0 +1,331 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddle import ParamAttr
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.regularizer import L2Decay
+from paddle.nn import Conv2D, BatchNorm
+from paddle.nn import MaxPool2D, AvgPool2D
+
+from ..registry import BACKBONES
+from ..weight_init import weight_init_
+from ...utils import load_ckpt
+
+__all__ = ["ResNetTSN_MRI"]
+
+
+class ConvBNLayer(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ groups=1,
+ is_tweaks_mode=False,
+ act=None,
+ lr_mult=1.0,
+ name=None):
+ super(ConvBNLayer, self).__init__()
+ self.is_tweaks_mode = is_tweaks_mode
+ self._pool2d_avg = AvgPool2D(kernel_size=2,
+ stride=2,
+ padding=0,
+ ceil_mode=True)
+ self._conv = Conv2D(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=(kernel_size - 1) // 2,
+ groups=groups,
+ weight_attr=ParamAttr(name=name + "_weights",
+ learning_rate=lr_mult),
+ bias_attr=False)
+ if name == "conv1":
+ bn_name = "bn_" + name
+ else:
+ bn_name = "bn" + name[3:]
+ self._batch_norm = BatchNorm(
+ out_channels,
+ act=act,
+ param_attr=ParamAttr(name=bn_name + '_scale',
+ learning_rate=lr_mult,
+ regularizer=L2Decay(0.0)),
+ bias_attr=ParamAttr(bn_name + '_offset',
+ learning_rate=lr_mult,
+ regularizer=L2Decay(0.0)),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance')
+
+ def forward(self, inputs):
+ if self.is_tweaks_mode:
+ inputs = self._pool2d_avg(inputs)
+ y = self._conv(inputs)
+ y = self._batch_norm(y)
+ return y
+
+
+class BottleneckBlock(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride,
+ shortcut=True,
+ if_first=False,
+ lr_mult=1.0,
+ name=None):
+ super(BottleneckBlock, self).__init__()
+
+ self.conv0 = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ act='relu',
+ lr_mult=lr_mult,
+ name=name + "_branch2a")
+ self.conv1 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=3,
+ stride=stride,
+ act='relu',
+ lr_mult=lr_mult,
+ name=name + "_branch2b")
+ self.conv2 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels * 4,
+ kernel_size=1,
+ act=None,
+ lr_mult=lr_mult,
+ name=name + "_branch2c")
+
+ if not shortcut:
+ self.short = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels * 4,
+ kernel_size=1,
+ stride=1,
+ is_tweaks_mode=False if if_first else True,
+ lr_mult=lr_mult,
+ name=name + "_branch1")
+
+ self.shortcut = shortcut
+
+ def forward(self, inputs):
+ y = self.conv0(inputs)
+ conv1 = self.conv1(y)
+ conv2 = self.conv2(conv1)
+
+ if self.shortcut:
+ short = inputs
+ else:
+ short = self.short(inputs)
+ y = paddle.add(x=short, y=conv2)
+ y = F.relu(y)
+ return y
+
+
+class BasicBlock(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride,
+ shortcut=True,
+ if_first=False,
+ lr_mult=1.0,
+ name=None):
+ super(BasicBlock, self).__init__()
+ self.stride = stride
+ self.conv0 = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=3,
+ stride=stride,
+ act='relu',
+ lr_mult=lr_mult,
+ name=name + "_branch2a")
+ self.conv1 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=3,
+ act=None,
+ lr_mult=lr_mult,
+ name=name + "_branch2b")
+
+ if not shortcut:
+ self.short = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ stride=1,
+ is_tweaks_mode=False if if_first else True,
+ lr_mult=lr_mult,
+ name=name + "_branch1")
+
+ self.shortcut = shortcut
+
+ def forward(self, inputs):
+ y = self.conv0(inputs)
+ conv1 = self.conv1(y)
+
+ if self.shortcut:
+ short = inputs
+ else:
+ short = self.short(inputs)
+ y = paddle.add(x=short, y=conv1)
+ y = F.relu(y)
+ return y
+
+
+@BACKBONES.register()
+class ResNetTSN_MRI(nn.Layer):
+ """ResNetTweaksTSN backbone.
+
+ Args:
+ depth (int): Depth of resnet model.
+ pretrained (str): pretrained model. Default: None.
+ """
+ def __init__(self,
+ layers=50,
+ pretrained=None,
+ lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
+ in_channels=1):
+ super(ResNetTSN_MRI, self).__init__()
+
+ self.pretrained = pretrained
+ self.layers = layers
+ supported_layers = [18, 34, 50, 101, 152, 200]
+ assert layers in supported_layers, \
+ "supported layers are {} but input layer is {}".format(
+ supported_layers, layers)
+
+ self.lr_mult_list = lr_mult_list
+ self.in_channels = in_channels
+ assert isinstance(
+ self.lr_mult_list,
+ (list, tuple
+ )), "lr_mult_list should be in (list, tuple) but got {}".format(
+ type(self.lr_mult_list))
+ assert len(
+ self.lr_mult_list
+ ) == 5, "lr_mult_list length should should be 5 but got {}".format(
+ len(self.lr_mult_list))
+
+ if layers == 18:
+ depth = [2, 2, 2, 2]
+ elif layers == 34 or layers == 50:
+ depth = [3, 4, 6, 3]
+ elif layers == 101:
+ depth = [3, 4, 23, 3]
+ elif layers == 152:
+ depth = [3, 8, 36, 3]
+ elif layers == 200:
+ depth = [3, 12, 48, 3]
+ num_channels = [64, 256, 512, 1024
+ ] if layers >= 50 else [64, 64, 128, 256]
+ num_filters = [64, 128, 256, 512]
+
+ self.conv1_1 = ConvBNLayer(in_channels=self.in_channels,
+ out_channels=32,
+ kernel_size=3,
+ stride=2,
+ act='relu',
+ lr_mult=self.lr_mult_list[0],
+ name="conv1_1")
+ self.conv1_2 = ConvBNLayer(in_channels=32,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ act='relu',
+ lr_mult=self.lr_mult_list[0],
+ name="conv1_2")
+ self.conv1_3 = ConvBNLayer(in_channels=32,
+ out_channels=64,
+ kernel_size=3,
+ stride=1,
+ act='relu',
+ lr_mult=self.lr_mult_list[0],
+ name="conv1_3")
+ self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1)
+
+ self.block_list = []
+ if layers >= 50:
+ for block in range(len(depth)):
+ shortcut = False
+ for i in range(depth[block]):
+ if layers in [101, 152, 200] and block == 2:
+ if i == 0:
+ conv_name = "res" + str(block + 2) + "a"
+ else:
+ conv_name = "res" + str(block + 2) + "b" + str(i)
+ else:
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ bottleneck_block = self.add_sublayer(
+ 'bb_%d_%d' % (block, i),
+ BottleneckBlock(
+ in_channels=num_channels[block]
+ if i == 0 else num_filters[block] * 4,
+ out_channels=num_filters[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ shortcut=shortcut,
+ if_first=block == i == 0,
+ lr_mult=self.lr_mult_list[block + 1],
+ name=conv_name))
+ self.block_list.append(bottleneck_block)
+ shortcut = True
+ else:
+ for block in range(len(depth)):
+ shortcut = False
+ for i in range(depth[block]):
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ basic_block = self.add_sublayer(
+ 'bb_%d_%d' % (block, i),
+ BasicBlock(in_channels=num_channels[block]
+ if i == 0 else num_filters[block],
+ out_channels=num_filters[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ shortcut=shortcut,
+ if_first=block == i == 0,
+ name=conv_name,
+ lr_mult=self.lr_mult_list[block + 1]))
+ self.block_list.append(basic_block)
+ shortcut = True
+
+ def init_weights(self):
+ """Initiate the parameters.
+ Note:
+ 1. when indicate pretrained loading path, will load it to initiate backbone.
+ 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be
+ initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.
+ Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html
+ """
+ # XXX: check bias!!! check pretrained!!!
+
+ if isinstance(self.pretrained, str) and self.pretrained.strip() != "":
+ load_ckpt(self, self.pretrained)
+ elif self.pretrained is None or self.pretrained.strip() == "":
+ for layer in self.sublayers():
+ if isinstance(layer, nn.Conv2D):
+ # XXX: no bias
+ weight_init_(layer, 'KaimingNormal')
+ elif isinstance(layer, nn.BatchNorm2D):
+ weight_init_(layer, 'Constant', value=1)
+
+ def forward(self, inputs):
+
+ y = self.conv1_1(inputs)
+ y = self.conv1_2(y)
+ y = self.conv1_3(y)
+ y = self.pool2d_max(y)
+ for block in self.block_list:
+ y = block(y)
+ return y
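A minimal usage sketch (not part of the patch; the import path and the quoted output shape are assumptions) showing how lr_mult_list maps onto the network: index 0 scales the three stem convs and index k+1 scales res-stage k, so small values slow down or freeze the early layers:

    import paddle
    from paddlevideo.modeling.backbones.resnet_tsn_MRI import ResNetTSN_MRI  # assumed path

    backbone = ResNetTSN_MRI(layers=50,
                             in_channels=1,
                             lr_mult_list=[0.0, 0.1, 0.1, 1.0, 1.0])
    backbone.init_weights()
    x = paddle.randn([2 * 8, 1, 224, 224])   # N * num_seg single-channel frames
    y = backbone(x)                           # expected shape: [16, 2048, 7, 7]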
diff --git a/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py b/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2ed947c57d6f2cb7ec616ccc31731e2ffba9c55
--- /dev/null
+++ b/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py
@@ -0,0 +1,344 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import math
+
+import sys
+import paddle
+import paddle.nn as nn
+from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,
+ AvgPool2D)
+from paddle import ParamAttr
+import paddle.nn.functional as F
+
+from ..registry import BACKBONES
+from ..weight_init import weight_init_
+from ...utils.save_load import load_ckpt
+from paddle.regularizer import L2Decay
+
+
+class ConvBNLayer(nn.Layer):
+ """Conv2D and BatchNorm2D layer.
+
+ Args:
+ in_channels (int): Number of channels for the input.
+ out_channels (int): Number of channels for the output.
+ kernel_size (int): Kernel size.
+ stride (int): Stride in the Conv2D layer. Default: 1.
+ groups (int): Groups in the Conv2D, Default: 1.
+ is_tweaks_mode (bool): switch for tweaks. Default: False.
+        act (str): activation applied after the BatchNorm2D layer.
+        name (str): name of this ConvBNLayer instance.
+
+        Note: weight and bias initialization covers both the initial values and the names used to restore parameters; the value initialization is declared explicitly in the ```init_weights``` method.
+
+ """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ groups=1,
+ is_tweaks_mode=False,
+ act=None,
+ name=None):
+ super(ConvBNLayer, self).__init__()
+ self.is_tweaks_mode = is_tweaks_mode
+ #ResNet-D 1/2:add a 2×2 average pooling layer with a stride of 2 before the convolution,
+ # whose stride is changed to 1, works well in practice.
+ self._pool2d_avg = AvgPool2D(kernel_size=2,
+ stride=2,
+ padding=0,
+ ceil_mode=True)
+
+ self._conv = Conv2D(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=(kernel_size - 1) // 2,
+ groups=groups,
+ weight_attr=ParamAttr(name=name + "_weights"),
+ bias_attr=False)
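+        # The BN parameters follow the PaddleClas naming convention so that pretrained
+        # 2D weights can be restored, e.g. "conv1" -> "bn_conv1", "res2a_branch2a" -> "bn2a_branch2a".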
+ if name == "conv1":
+ bn_name = "bn_" + name
+ else:
+ bn_name = "bn" + name[3:]
+
+ self._act = act
+
+ self._batch_norm = BatchNorm2D(
+ out_channels,
+ weight_attr=ParamAttr(name=bn_name + "_scale",
+ regularizer=L2Decay(0.0)),
+ bias_attr=ParamAttr(bn_name + "_offset", regularizer=L2Decay(0.0)))
+
+ def forward(self, inputs):
+ if self.is_tweaks_mode:
+ inputs = self._pool2d_avg(inputs)
+ y = self._conv(inputs)
+ y = self._batch_norm(y)
+ if self._act:
+ y = getattr(paddle.nn.functional, self._act)(y)
+ return y
+
+
+class BottleneckBlock(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride,
+ shortcut=True,
+ if_first=False,
+ num_seg=8,
+ name=None):
+ super(BottleneckBlock, self).__init__()
+ self.conv0 = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ act="leaky_relu",
+ name=name + "_branch2a")
+ self.conv1 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=3,
+ stride=stride,
+ act="leaky_relu",
+ name=name + "_branch2b")
+
+ self.conv2 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels * 4,
+ kernel_size=1,
+ act=None,
+ name=name + "_branch2c")
+
+ if not shortcut:
+ self.short = ConvBNLayer(
+ in_channels=in_channels,
+ out_channels=out_channels * 4,
+ kernel_size=1,
+                stride=1,  # ResNet-D 2/2: add a 2×2 average pooling layer with a stride of 2
+                           # before the convolution, whose stride is changed to 1; works well in practice.
+ is_tweaks_mode=False if if_first else True,
+ name=name + "_branch1")
+
+ self.shortcut = shortcut
+ self.num_seg = num_seg
+
+ def forward(self, inputs):
+ if paddle.fluid.core.is_compiled_with_npu():
+ x = inputs
+ seg_num = self.num_seg
+ shift_ratio = 1.0 / self.num_seg
+
+ shape = x.shape #[N*T, C, H, W]
+ reshape_x = x.reshape((-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W]
+ pad_x = paddle.fluid.layers.pad(reshape_x, [0,0,1,1,0,0,0,0,0,0,]) #[N, T+2, C, H, W]
+ c1 = int(shape[1] * shift_ratio)
+ c2 = int(shape[1] * 2 * shift_ratio)
+ slice1 = pad_x[:, :seg_num, :c1, :, :]
+ slice2 = pad_x[:, 2:seg_num+2, c1:c2, :, :]
+ slice3 = pad_x[:, 1:seg_num+1, c2:, :, :]
+ concat_x = paddle.concat([slice1, slice2, slice3], axis=2) #[N, T, C, H, W]
+ shifts = concat_x.reshape(shape)
+ else:
+ shifts = paddle.fluid.layers.temporal_shift(inputs, self.num_seg,
+ 1.0 / self.num_seg)
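+        # Temporal shift module (TSM): within each clip of num_seg frames, one
+        # 1/num_seg slice of the channels is shifted one frame backward in time,
+        # another 1/num_seg slice is shifted one frame forward, and the remaining
+        # channels stay in place, enabling temporal modeling at no extra FLOPs.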
+
+ y = self.conv0(shifts)
+ conv1 = self.conv1(y)
+ conv2 = self.conv2(conv1)
+ if self.shortcut:
+ short = inputs
+ else:
+ short = self.short(inputs)
+ y = paddle.add(x=short, y=conv2)
+ return F.leaky_relu(y)
+
+
+class BasicBlock(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride,
+ shortcut=True,
+ name=None):
+ super(BasicBlock, self).__init__()
+ self.stride = stride
+        self.conv0 = ConvBNLayer(in_channels=in_channels,
+                                 out_channels=out_channels,
+                                 kernel_size=3,
+                                 stride=stride,
+                                 act="leaky_relu",
+                                 name=name + "_branch2a")
+        self.conv1 = ConvBNLayer(in_channels=out_channels,
+                                 out_channels=out_channels,
+                                 kernel_size=3,
+                                 act=None,
+                                 name=name + "_branch2b")
+
+        if not shortcut:
+            self.short = ConvBNLayer(in_channels=in_channels,
+                                     out_channels=out_channels,
+                                     kernel_size=1,
+                                     stride=stride,
+                                     name=name + "_branch1")
+
+ self.shortcut = shortcut
+
+ def forward(self, inputs):
+ y = self.conv0(inputs)
+ conv1 = self.conv1(y)
+
+ if self.shortcut:
+ short = inputs
+ else:
+ short = self.short(inputs)
+ y = paddle.add(short, conv1)
+ y = F.leaky_relu(y)
+ return y
+
+
+@BACKBONES.register()
+class ResNetTweaksTSM(nn.Layer):
+ """ResNet TSM backbone.
+
+ Args:
+ depth (int): Depth of resnet model.
+ pretrained (str): pretrained model. Default: None.
+ """
+ def __init__(self, depth, num_seg=8, pretrained=None):
+ super(ResNetTweaksTSM, self).__init__()
+ self.pretrained = pretrained
+ self.layers = depth
+ self.num_seg = num_seg
+
+ supported_layers = [18, 34, 50, 101, 152]
+ assert self.layers in supported_layers, \
+ "supported layers are {} but input layer is {}".format(
+ supported_layers, self.layers)
+
+ if self.layers == 18:
+ depth = [2, 2, 2, 2]
+ elif self.layers == 34 or self.layers == 50:
+ depth = [3, 4, 6, 3]
+ elif self.layers == 101:
+ depth = [3, 4, 23, 3]
+ elif self.layers == 152:
+ depth = [3, 8, 36, 3]
+
+ in_channels = 64
+ out_channels = [64, 128, 256, 512]
+
+        # ResNet-C: use three 3x3 convs to replace one 7x7 conv
+ self.conv1_1 = ConvBNLayer(in_channels=3,
+ out_channels=32,
+ kernel_size=3,
+ stride=2,
+ act='leaky_relu',
+ name="conv1_1")
+ self.conv1_2 = ConvBNLayer(in_channels=32,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ act='leaky_relu',
+ name="conv1_2")
+ self.conv1_3 = ConvBNLayer(in_channels=32,
+ out_channels=64,
+ kernel_size=3,
+ stride=1,
+ act='leaky_relu',
+ name="conv1_3")
+ self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)
+
+ self.block_list = []
+ if self.layers >= 50:
+ for block in range(len(depth)):
+ shortcut = False
+ for i in range(depth[block]):
+ if self.layers in [101, 152] and block == 2:
+ if i == 0:
+ conv_name = "res" + str(block + 2) + "a"
+ else:
+ conv_name = "res" + str(block + 2) + "b" + str(i)
+ else:
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ bottleneck_block = self.add_sublayer(
+ 'bb_%d_%d' %
+ (block, i), #same with PaddleClas, for loading pretrain
+ BottleneckBlock(
+ in_channels=in_channels
+ if i == 0 else out_channels[block] * 4,
+ out_channels=out_channels[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ num_seg=self.num_seg,
+ shortcut=shortcut,
+ if_first=block == i == 0,
+ name=conv_name))
+ in_channels = out_channels[block] * 4
+ self.block_list.append(bottleneck_block)
+ shortcut = True
+        else:
+            # input channels of the first BasicBlock in each stage (ResNet-18/34)
+            num_channels = [64, 64, 128, 256]
+            for block in range(len(depth)):
+                shortcut = False
+                for i in range(depth[block]):
+                    conv_name = "res" + str(block + 2) + chr(97 + i)
+                    basic_block = self.add_sublayer(
+                        conv_name,
+                        BasicBlock(in_channels=num_channels[block]
+                                   if i == 0 else out_channels[block],
+                                   out_channels=out_channels[block],
+                                   stride=2 if i == 0 and block != 0 else 1,
+                                   shortcut=shortcut,
+                                   name=conv_name))
+                    self.block_list.append(basic_block)
+                    shortcut = True
+
+ def init_weights(self):
+        """Initialize the parameters.
+        Note:
+            1. If a pretrained loading path is given, the backbone is initialized from it.
+            2. Otherwise, Conv2D layers are initialized with KaimingNormal and
+               BatchNorm2D layers with Constant.
+            Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html
+        """
+ #XXX: check bias!!! check pretrained!!!
+
+ if isinstance(self.pretrained, str) and self.pretrained.strip() != "":
+ load_ckpt(self, self.pretrained)
+ elif self.pretrained is None or self.pretrained.strip() == "":
+ for layer in self.sublayers():
+ if isinstance(layer, nn.Conv2D):
+ #XXX: no bias
+ weight_init_(layer, 'KaimingNormal')
+ elif isinstance(layer, nn.BatchNorm2D):
+ weight_init_(layer, 'Constant', value=1)
+
+ def forward(self, inputs):
+ """Define how the backbone is going to run.
+
+ """
+ #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase,
+ # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27
+ #y = paddle.reshape(
+ # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])
+
+        # ResNet-C: use three 3x3 convs to replace one 7x7 conv
+ y = self.conv1_1(inputs)
+ y = self.conv1_2(y)
+ y = self.conv1_3(y)
+
+ y = self.pool2D_max(y)
+ for block in self.block_list:
+ y = block(y)
+ return y
diff --git a/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py b/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py
new file mode 100644
index 0000000000000000000000000000000000000000..36b33073f76506a310a250b80cfb49df07b4e613
--- /dev/null
+++ b/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py
@@ -0,0 +1,328 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddle import ParamAttr
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.regularizer import L2Decay
+from paddle.nn import Conv2D, BatchNorm
+from paddle.nn import MaxPool2D, AvgPool2D
+
+from ..registry import BACKBONES
+from ..weight_init import weight_init_
+from ...utils import load_ckpt
+
+__all__ = ["ResNetTweaksTSN"]
+
+
+class ConvBNLayer(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ groups=1,
+ is_tweaks_mode=False,
+ act=None,
+ lr_mult=1.0,
+ name=None):
+ super(ConvBNLayer, self).__init__()
+ self.is_tweaks_mode = is_tweaks_mode
+ self._pool2d_avg = AvgPool2D(kernel_size=2,
+ stride=2,
+ padding=0,
+ ceil_mode=True)
+ self._conv = Conv2D(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=(kernel_size - 1) // 2,
+ groups=groups,
+ weight_attr=ParamAttr(name=name + "_weights",
+ learning_rate=lr_mult),
+ bias_attr=False)
+ if name == "conv1":
+ bn_name = "bn_" + name
+ else:
+ bn_name = "bn" + name[3:]
+ self._batch_norm = BatchNorm(
+ out_channels,
+ act=act,
+ param_attr=ParamAttr(name=bn_name + '_scale',
+ learning_rate=lr_mult,
+ regularizer=L2Decay(0.0)),
+ bias_attr=ParamAttr(bn_name + '_offset',
+ learning_rate=lr_mult,
+ regularizer=L2Decay(0.0)),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance')
+
+ def forward(self, inputs):
+ if self.is_tweaks_mode:
+ inputs = self._pool2d_avg(inputs)
+ y = self._conv(inputs)
+ y = self._batch_norm(y)
+ return y
+
+
+class BottleneckBlock(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride,
+ shortcut=True,
+ if_first=False,
+ lr_mult=1.0,
+ name=None):
+ super(BottleneckBlock, self).__init__()
+
+ self.conv0 = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ act='relu',
+ lr_mult=lr_mult,
+ name=name + "_branch2a")
+ self.conv1 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=3,
+ stride=stride,
+ act='relu',
+ lr_mult=lr_mult,
+ name=name + "_branch2b")
+ self.conv2 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels * 4,
+ kernel_size=1,
+ act=None,
+ lr_mult=lr_mult,
+ name=name + "_branch2c")
+
+ if not shortcut:
+ self.short = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels * 4,
+ kernel_size=1,
+ stride=1,
+ is_tweaks_mode=False if if_first else True,
+ lr_mult=lr_mult,
+ name=name + "_branch1")
+
+ self.shortcut = shortcut
+
+ def forward(self, inputs):
+ y = self.conv0(inputs)
+ conv1 = self.conv1(y)
+ conv2 = self.conv2(conv1)
+
+ if self.shortcut:
+ short = inputs
+ else:
+ short = self.short(inputs)
+ y = paddle.add(x=short, y=conv2)
+ y = F.relu(y)
+ return y
+
+
+class BasicBlock(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride,
+ shortcut=True,
+ if_first=False,
+ lr_mult=1.0,
+ name=None):
+ super(BasicBlock, self).__init__()
+ self.stride = stride
+ self.conv0 = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=3,
+ stride=stride,
+ act='relu',
+ lr_mult=lr_mult,
+ name=name + "_branch2a")
+ self.conv1 = ConvBNLayer(in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=3,
+ act=None,
+ lr_mult=lr_mult,
+ name=name + "_branch2b")
+
+ if not shortcut:
+ self.short = ConvBNLayer(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ stride=1,
+ is_tweaks_mode=False if if_first else True,
+ lr_mult=lr_mult,
+ name=name + "_branch1")
+
+ self.shortcut = shortcut
+
+ def forward(self, inputs):
+ y = self.conv0(inputs)
+ conv1 = self.conv1(y)
+
+ if self.shortcut:
+ short = inputs
+ else:
+ short = self.short(inputs)
+ y = paddle.add(x=short, y=conv1)
+ y = F.relu(y)
+ return y
+
+
+@BACKBONES.register()
+class ResNetTweaksTSN(nn.Layer):
+ """ResNetTweaksTSN backbone.
+
+ Args:
+ depth (int): Depth of resnet model.
+ pretrained (str): pretrained model. Default: None.
+ """
+ def __init__(self,
+ layers=50,
+ pretrained=None,
+ lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0]):
+ super(ResNetTweaksTSN, self).__init__()
+
+ self.pretrained = pretrained
+ self.layers = layers
+ supported_layers = [18, 34, 50, 101, 152, 200]
+ assert layers in supported_layers, \
+ "supported layers are {} but input layer is {}".format(
+ supported_layers, layers)
+
+ self.lr_mult_list = lr_mult_list
+ assert isinstance(
+ self.lr_mult_list,
+ (list, tuple
+ )), "lr_mult_list should be in (list, tuple) but got {}".format(
+ type(self.lr_mult_list))
+ assert len(
+ self.lr_mult_list
+ ) == 5, "lr_mult_list length should should be 5 but got {}".format(
+ len(self.lr_mult_list))
+
+ if layers == 18:
+ depth = [2, 2, 2, 2]
+ elif layers == 34 or layers == 50:
+ depth = [3, 4, 6, 3]
+ elif layers == 101:
+ depth = [3, 4, 23, 3]
+ elif layers == 152:
+ depth = [3, 8, 36, 3]
+ elif layers == 200:
+ depth = [3, 12, 48, 3]
+ num_channels = [64, 256, 512, 1024
+ ] if layers >= 50 else [64, 64, 128, 256]
+ num_filters = [64, 128, 256, 512]
+
+ self.conv1_1 = ConvBNLayer(in_channels=3,
+ out_channels=32,
+ kernel_size=3,
+ stride=2,
+ act='relu',
+ lr_mult=self.lr_mult_list[0],
+ name="conv1_1")
+ self.conv1_2 = ConvBNLayer(in_channels=32,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ act='relu',
+ lr_mult=self.lr_mult_list[0],
+ name="conv1_2")
+ self.conv1_3 = ConvBNLayer(in_channels=32,
+ out_channels=64,
+ kernel_size=3,
+ stride=1,
+ act='relu',
+ lr_mult=self.lr_mult_list[0],
+ name="conv1_3")
+ self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1)
+
+ self.block_list = []
+ if layers >= 50:
+ for block in range(len(depth)):
+ shortcut = False
+ for i in range(depth[block]):
+ if layers in [101, 152, 200] and block == 2:
+ if i == 0:
+ conv_name = "res" + str(block + 2) + "a"
+ else:
+ conv_name = "res" + str(block + 2) + "b" + str(i)
+ else:
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ bottleneck_block = self.add_sublayer(
+ 'bb_%d_%d' % (block, i),
+ BottleneckBlock(
+ in_channels=num_channels[block]
+ if i == 0 else num_filters[block] * 4,
+ out_channels=num_filters[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ shortcut=shortcut,
+ if_first=block == i == 0,
+ lr_mult=self.lr_mult_list[block + 1],
+ name=conv_name))
+ self.block_list.append(bottleneck_block)
+ shortcut = True
+ else:
+ for block in range(len(depth)):
+ shortcut = False
+ for i in range(depth[block]):
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ basic_block = self.add_sublayer(
+ 'bb_%d_%d' % (block, i),
+ BasicBlock(in_channels=num_channels[block]
+ if i == 0 else num_filters[block],
+ out_channels=num_filters[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ shortcut=shortcut,
+ if_first=block == i == 0,
+ name=conv_name,
+ lr_mult=self.lr_mult_list[block + 1]))
+ self.block_list.append(basic_block)
+ shortcut = True
+
+ def init_weights(self):
+        """Initialize the parameters.
+        Note:
+            1. If a pretrained loading path is given, the backbone is initialized from it.
+            2. Otherwise, Conv2D layers are initialized with KaimingNormal and
+               BatchNorm2D layers with Constant.
+            Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html
+        """
+ # XXX: check bias!!! check pretrained!!!
+
+ if isinstance(self.pretrained, str) and self.pretrained.strip() != "":
+ load_ckpt(self, self.pretrained)
+ elif self.pretrained is None or self.pretrained.strip() == "":
+ for layer in self.sublayers():
+ if isinstance(layer, nn.Conv2D):
+ # XXX: no bias
+ weight_init_(layer, 'KaimingNormal')
+ elif isinstance(layer, nn.BatchNorm2D):
+ weight_init_(layer, 'Constant', value=1)
+
+ def forward(self, inputs):
+ y = self.conv1_1(inputs)
+ y = self.conv1_2(y)
+ y = self.conv1_3(y)
+ y = self.pool2d_max(y)
+ for block in self.block_list:
+ y = block(y)
+ return y
diff --git a/paddlevideo/modeling/backbones/stgcn.py b/paddlevideo/modeling/backbones/stgcn.py
new file mode 100644
index 0000000000000000000000000000000000000000..40d9d0ddaca73cbcf618eaef5778e88f257f1227
--- /dev/null
+++ b/paddlevideo/modeling/backbones/stgcn.py
@@ -0,0 +1,343 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+from ..registry import BACKBONES
+from ..weight_init import weight_init_
+
+
+def zero(x):
+ return 0
+
+
+def iden(x):
+ return x
+
+
+def einsum(x, A):
+ """paddle.einsum will be implemented in release/2.2.
+ """
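+    # A minimal sketch of the intended contraction (assuming paddle.einsum is available):
+    #   y = paddle.einsum('nkctv,kvw->nctw', x, A)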
+ x = x.transpose((0, 2, 3, 1, 4))
+ n, c, t, k, v = x.shape
+ k2, v2, w = A.shape
+ assert (k == k2 and v == v2), "Args of einsum not match!"
+ x = x.reshape((n, c, t, k * v))
+ A = A.reshape((k * v, w))
+ y = paddle.matmul(x, A)
+ return y
+
+
+def get_hop_distance(num_node, edge, max_hop=1):
+ A = np.zeros((num_node, num_node))
+ for i, j in edge:
+ A[j, i] = 1
+ A[i, j] = 1
+
+ # compute hop steps
+ hop_dis = np.zeros((num_node, num_node)) + np.inf
+ transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)]
+ arrive_mat = (np.stack(transfer_mat) > 0)
+ for d in range(max_hop, -1, -1):
+ hop_dis[arrive_mat[d]] = d
+ return hop_dis
+
+
+def normalize_digraph(A):
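+    # Normalize the adjacency matrix column-wise: AD = A @ D^-1, where D is the
+    # diagonal degree matrix, so each column of AD sums to 1 (isolated nodes keep
+    # zero columns).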
+ Dl = np.sum(A, 0)
+ num_node = A.shape[0]
+ Dn = np.zeros((num_node, num_node))
+ for i in range(num_node):
+ if Dl[i] > 0:
+ Dn[i, i] = Dl[i]**(-1)
+ AD = np.dot(A, Dn)
+ return AD
+
+
+class Graph():
+
+ def __init__(self,
+ layout='openpose',
+ strategy='uniform',
+ max_hop=1,
+ dilation=1):
+ self.max_hop = max_hop
+ self.dilation = dilation
+
+ self.get_edge(layout)
+ self.hop_dis = get_hop_distance(self.num_node,
+ self.edge,
+ max_hop=max_hop)
+ self.get_adjacency(strategy)
+
+ def __str__(self):
+        return str(self.A)
+
+ def get_edge(self, layout):
+        # edge is a list of [child, parent] pairs
+
+ if layout == 'fsd10':
+ self.num_node = 25
+ self_link = [(i, i) for i in range(self.num_node)]
+ neighbor_link = [(1, 8), (0, 1), (15, 0), (17, 15), (16, 0),
+ (18, 16), (5, 1), (6, 5), (7, 6), (2, 1), (3, 2),
+ (4, 3), (9, 8), (10, 9), (11, 10), (24, 11),
+ (22, 11), (23, 22), (12, 8), (13, 12), (14, 13),
+ (21, 14), (19, 14), (20, 19)]
+ self.edge = self_link + neighbor_link
+ self.center = 8
+ elif layout == 'ntu-rgb+d':
+ self.num_node = 25
+ self_link = [(i, i) for i in range(self.num_node)]
+ neighbor_1base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5),
+ (7, 6), (8, 7), (9, 21), (10, 9), (11, 10),
+ (12, 11), (13, 1), (14, 13), (15, 14), (16, 15),
+ (17, 1), (18, 17), (19, 18), (20, 19), (22, 23),
+ (23, 8), (24, 25), (25, 12)]
+ neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base]
+ self.edge = self_link + neighbor_link
+ self.center = 21 - 1
+ elif layout == 'coco_keypoint':
+ self.num_node = 17
+ self_link = [(i, i) for i in range(self.num_node)]
+ neighbor_1base = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6),
+ (5, 7), (6, 8), (7, 9), (8, 10), (5, 11), (6, 12),
+ (11, 13), (12, 14), (13, 15), (14, 16), (11, 12)]
+ neighbor_link = [(i, j) for (i, j) in neighbor_1base]
+ self.edge = self_link + neighbor_link
+ self.center = 11
+ else:
+            raise ValueError("Layout '{}' is not supported.".format(layout))
+
+ def get_adjacency(self, strategy):
+ valid_hop = range(0, self.max_hop + 1, self.dilation)
+ adjacency = np.zeros((self.num_node, self.num_node))
+ for hop in valid_hop:
+ adjacency[self.hop_dis == hop] = 1
+ normalize_adjacency = normalize_digraph(adjacency)
+
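+        # ST-GCN 'spatial' partitioning: for each hop distance, the neighbors of a
+        # joint are split into the root joint itself, centripetal nodes (closer to
+        # the skeleton center) and centrifugal nodes (further away), stacked as
+        # separate slices of A, so A has shape (K, num_node, num_node).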
+ if strategy == 'spatial':
+ A = []
+ for hop in valid_hop:
+ a_root = np.zeros((self.num_node, self.num_node))
+ a_close = np.zeros((self.num_node, self.num_node))
+ a_further = np.zeros((self.num_node, self.num_node))
+ for i in range(self.num_node):
+ for j in range(self.num_node):
+ if self.hop_dis[j, i] == hop:
+ if self.hop_dis[j, self.center] == self.hop_dis[
+ i, self.center]:
+ a_root[j, i] = normalize_adjacency[j, i]
+ elif self.hop_dis[j, self.center] > self.hop_dis[
+ i, self.center]:
+ a_close[j, i] = normalize_adjacency[j, i]
+ else:
+ a_further[j, i] = normalize_adjacency[j, i]
+ if hop == 0:
+ A.append(a_root)
+ else:
+ A.append(a_root + a_close)
+ A.append(a_further)
+ A = np.stack(A)
+ self.A = A
+ else:
+            raise ValueError("Strategy '{}' is not supported.".format(strategy))
+
+
+class ConvTemporalGraphical(nn.Layer):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ t_kernel_size=1,
+ t_stride=1,
+ t_padding=0,
+ t_dilation=1):
+ super().__init__()
+
+ self.kernel_size = kernel_size
+ self.conv = nn.Conv2D(in_channels,
+ out_channels * kernel_size,
+ kernel_size=(t_kernel_size, 1),
+ padding=(t_padding, 0),
+ stride=(t_stride, 1),
+ dilation=(t_dilation, 1))
+
+ def forward(self, x, A):
+ assert A.shape[0] == self.kernel_size
+
+ x = self.conv(x)
+ n, kc, t, v = x.shape
+ x = x.reshape((n, self.kernel_size, kc // self.kernel_size, t, v))
+ x = einsum(x, A)
+
+ return x, A
+
+
+class st_gcn_block(nn.Layer):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ dropout=0,
+ residual=True):
+ super(st_gcn_block, self).__init__()
+
+ assert len(kernel_size) == 2
+ assert kernel_size[0] % 2 == 1
+ padding = ((kernel_size[0] - 1) // 2, 0)
+
+ self.gcn = ConvTemporalGraphical(in_channels, out_channels,
+ kernel_size[1])
+
+ self.tcn = nn.Sequential(
+ nn.BatchNorm2D(out_channels),
+ nn.ReLU(),
+ nn.Conv2D(
+ out_channels,
+ out_channels,
+ (kernel_size[0], 1),
+ (stride, 1),
+ padding,
+ ),
+ nn.BatchNorm2D(out_channels),
+ nn.Dropout(dropout),
+ )
+
+ if not residual:
+ self.residual = zero
+
+ elif (in_channels == out_channels) and (stride == 1):
+ self.residual = iden
+
+ else:
+ self.residual = nn.Sequential(
+ nn.Conv2D(in_channels,
+ out_channels,
+ kernel_size=1,
+ stride=(stride, 1)),
+ nn.BatchNorm2D(out_channels),
+ )
+
+ self.relu = nn.ReLU()
+
+ def forward(self, x, A):
+ res = self.residual(x)
+ x, A = self.gcn(x, A)
+ x = self.tcn(x) + res
+ return self.relu(x), A
+
+
+@BACKBONES.register()
+class STGCN(nn.Layer):
+ """
+ ST-GCN model from:
+    `"Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition" <https://arxiv.org/abs/1801.07455>`_
+ Args:
+ in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 2.
+ edge_importance_weighting: bool, whether to use edge attention. Default True.
+ data_bn: bool, whether to use data BatchNorm. Default True.
+ """
+
+ def __init__(self,
+ in_channels=2,
+ edge_importance_weighting=True,
+ data_bn=True,
+ layout='fsd10',
+ strategy='spatial',
+ **kwargs):
+ super(STGCN, self).__init__()
+ self.data_bn = data_bn
+ # load graph
+ self.graph = Graph(
+ layout=layout,
+ strategy=strategy,
+ )
+ A = paddle.to_tensor(self.graph.A, dtype='float32')
+ self.register_buffer('A', A)
+
+ # build networks
+ spatial_kernel_size = A.shape[0]
+ temporal_kernel_size = 9
+ kernel_size = (temporal_kernel_size, spatial_kernel_size)
+ self.data_bn = nn.BatchNorm1D(in_channels *
+ A.shape[1]) if self.data_bn else iden
+ kwargs0 = {k: v for k, v in kwargs.items() if k != 'dropout'}
+ self.st_gcn_networks = nn.LayerList((
+ st_gcn_block(in_channels,
+ 64,
+ kernel_size,
+ 1,
+ residual=False,
+ **kwargs0),
+ st_gcn_block(64, 64, kernel_size, 1, **kwargs),
+ st_gcn_block(64, 64, kernel_size, 1, **kwargs),
+ st_gcn_block(64, 64, kernel_size, 1, **kwargs),
+ st_gcn_block(64, 128, kernel_size, 2, **kwargs),
+ st_gcn_block(128, 128, kernel_size, 1, **kwargs),
+ st_gcn_block(128, 128, kernel_size, 1, **kwargs),
+ st_gcn_block(128, 256, kernel_size, 2, **kwargs),
+ st_gcn_block(256, 256, kernel_size, 1, **kwargs),
+ st_gcn_block(256, 256, kernel_size, 1, **kwargs),
+ ))
+
+ # initialize parameters for edge importance weighting
+ if edge_importance_weighting:
+ self.edge_importance = nn.ParameterList([
+ self.create_parameter(
+ shape=self.A.shape,
+ default_initializer=nn.initializer.Constant(1))
+ for i in self.st_gcn_networks
+ ])
+ else:
+ self.edge_importance = [1] * len(self.st_gcn_networks)
+
+ self.pool = nn.AdaptiveAvgPool2D(output_size=(1, 1))
+
+ def init_weights(self):
+ """Initiate the parameters.
+ """
+ for layer in self.sublayers():
+ if isinstance(layer, nn.Conv2D):
+ weight_init_(layer, 'Normal', mean=0.0, std=0.02)
+ elif isinstance(layer, nn.BatchNorm2D):
+ weight_init_(layer, 'Normal', mean=1.0, std=0.02)
+ elif isinstance(layer, nn.BatchNorm1D):
+ weight_init_(layer, 'Normal', mean=1.0, std=0.02)
+
+ def forward(self, x):
+ # data normalization
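+        # Fold the person dimension M into the batch, apply BatchNorm1D over the
+        # flattened (V*C) joint-channel axis, then reshape to (N*M, C, T, V) so
+        # every skeleton is processed independently by the ST-GCN blocks.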
+ N, C, T, V, M = x.shape
+ x = x.transpose((0, 4, 3, 1, 2)) # N, M, V, C, T
+ x = x.reshape((N * M, V * C, T))
+ if self.data_bn:
+ x.stop_gradient = False
+ x = self.data_bn(x)
+ x = x.reshape((N, M, V, C, T))
+ x = x.transpose((0, 1, 3, 4, 2)) # N, M, C, T, V
+ x = x.reshape((N * M, C, T, V))
+
+ # forward
+ for gcn, importance in zip(self.st_gcn_networks, self.edge_importance):
+ x, _ = gcn(x, paddle.multiply(self.A, importance))
+
+ x = self.pool(x) # NM,C,T,V --> NM,C,1,1
+ C = x.shape[1]
+ x = paddle.reshape(x, (N, M, C, 1, 1)).mean(axis=1) # N,C,1,1
+ return x
diff --git a/paddlevideo/modeling/backbones/swin_transformer.py b/paddlevideo/modeling/backbones/swin_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..aaed21790919bc45a25217c3df377c81cefdb89b
--- /dev/null
+++ b/paddlevideo/modeling/backbones/swin_transformer.py
@@ -0,0 +1,742 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import lru_cache, reduce
+from operator import mul
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.initializer import Constant
+
+from ...utils import load_ckpt
+from ..registry import BACKBONES
+from ..weight_init import trunc_normal_
+
+zeros_ = Constant(value=0.)
+ones_ = Constant(value=1.)
+
+
+def drop_path(x, drop_prob=0., training=False):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    The original name is misleading, as 'Drop Connect' is a different form of dropout in a separate paper.
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
+ """
+ if drop_prob == 0. or not training:
+ return x
+ keep_prob = paddle.to_tensor(1 - drop_prob)
+ shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
+ random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
+ random_tensor = paddle.floor(random_tensor) # binarize
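+    # Dividing by keep_prob rescales the surviving samples so that the expected
+    # value of the output matches the input (inverted-dropout style).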
+ output = x.divide(keep_prob) * random_tensor
+
+ return output
+
+
+class DropPath(nn.Layer):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ """
+ def __init__(self, drop_prob=None):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training)
+
+
+class Mlp(nn.Layer):
+ """ Multilayer perceptron."""
+ def __init__(self,
+ in_features,
+ hidden_features=None,
+ out_features=None,
+ act_layer=nn.GELU,
+ drop=0.):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+def window_partition(x, window_size):
+ """window_partition
+ Args:
+ x (Tensor): x.shape = [B, D, H, W, C]
+ window_size (tuple[int]): window_size
+
+ Returns:
+        Tensor: (B*num_windows, window_size[0]*window_size[1]*window_size[2], C)
+ """
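+    # Illustrative example (hypothetical sizes): x of shape [B, 8, 56, 56, C] with
+    # window_size (2, 7, 7) is partitioned into B*4*8*8 windows of 2*7*7=98 tokens each.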
+ B, D, H, W, C = x.shape
+ x = x.reshape([
+ B, D // window_size[0], window_size[0], H // window_size[1],
+ window_size[1], W // window_size[2], window_size[2], C
+ ])
+ windows = x.transpose([0, 1, 3, 5, 2, 4, 6,
+ 7]).reshape([-1, reduce(mul, window_size), C])
+ return windows
+
+
+class Identity(nn.Layer):
+ def __init__(self):
+ super(Identity, self).__init__()
+
+ def forward(self, input):
+ return input
+
+
+def window_reverse(windows, window_size, B, D, H, W):
+ """
+ Args:
+        windows: (B*num_windows, window_size[0]*window_size[1]*window_size[2], C)
+        window_size (tuple[int]): Window size
+        B (int): Batch size
+        D (int): Temporal length of video
+        H (int): Height of image
+        W (int): Width of image
+
+ Returns:
+ x: (B, D, H, W, C)
+ """
+ x = windows.reshape([
+ B, D // window_size[0], H // window_size[1], W // window_size[2],
+ window_size[0], window_size[1], window_size[2], -1
+ ])
+ x = x.transpose([0, 1, 4, 2, 5, 3, 6, 7]).reshape([B, D, H, W, -1])
+ return x
+
+
+def get_window_size(x_size, window_size, shift_size=None):
+ use_window_size = list(window_size)
+ if shift_size is not None:
+ use_shift_size = list(shift_size)
+ for i in range(len(x_size)):
+ if x_size[i] <= window_size[i]:
+ use_window_size[i] = x_size[i]
+ if shift_size is not None:
+ use_shift_size[i] = 0
+
+ if shift_size is None:
+ return tuple(use_window_size)
+ else:
+ return tuple(use_window_size), tuple(use_shift_size)
+
+
+class WindowAttention3D(nn.Layer):
+ """ Window based multi-head self attention (W-MSA) module with relative position bias.
+ It supports both of shifted and non-shifted window.
+ Args:
+ dim (int): Number of input channels.
+ window_size (tuple[int]): The temporal length, height and width of the window.
+ num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: False
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+ """
+ def __init__(self,
+ dim,
+ window_size,
+ num_heads,
+ qkv_bias=False,
+ qk_scale=None,
+ attn_drop=0.,
+ proj_drop=0.):
+
+ super().__init__()
+ self.dim = dim
+ self.window_size = window_size # Wd, Wh, Ww
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = qk_scale or head_dim**-0.5
+
+ # define a parameter table of relative position bias
+ self.relative_position_bias_table = self.create_parameter(
+ shape=((2 * window_size[0] - 1) * (2 * window_size[1] - 1) *
+ (2 * window_size[2] - 1), num_heads),
+ default_initializer=zeros_,
+ ) # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH
+ self.add_parameter("relative_position_bias_table",
+ self.relative_position_bias_table)
+ # get pair-wise relative position index for each token inside the window
+ coords_d = paddle.arange(self.window_size[0])
+ coords_h = paddle.arange(self.window_size[1])
+ coords_w = paddle.arange(self.window_size[2])
+ coords = paddle.stack(paddle.meshgrid(coords_d, coords_h,
+ coords_w)) # 3, Wd, Wh, Ww
+ coords_flatten = paddle.flatten(coords, 1) # 3, Wd*Wh*Ww
+
+ relative_coords = coords_flatten.unsqueeze(
+ axis=2) - coords_flatten.unsqueeze(axis=1) # 3, Wd*Wh*Ww, Wd*Wh*Ww
+
+ # relative_coords = coords_flatten.unsqueeze(2) - coords_flatten.unsqueeze(1) # 3, Wd*Wh*Ww, Wd*Wh*Ww
+ relative_coords = relative_coords.transpose([1, 2, 0
+ ]) # Wd*Wh*Ww, Wd*Wh*Ww, 3
+ relative_coords[:, :,
+ 0] += self.window_size[0] - 1 # shift to start from 0
+ relative_coords[:, :, 1] += self.window_size[1] - 1
+ relative_coords[:, :, 2] += self.window_size[2] - 1
+
+ relative_coords[:, :, 0] *= (2 * self.window_size[1] -
+ 1) * (2 * self.window_size[2] - 1)
+ relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1)
+ relative_position_index = relative_coords.sum(
+ axis=-1) # Wd*Wh*Ww, Wd*Wh*Ww
+ self.register_buffer("relative_position_index", relative_position_index)
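+        # Each pair of positions inside a window has a 3D relative offset; the
+        # shifts and scalings above map that offset to a unique flat index into the
+        # (2*Wd-1)*(2*Wh-1)*(2*Ww-1) relative position bias table.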
+
+ self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ trunc_normal_(self.relative_position_bias_table, std=0.02)
+ self.softmax = nn.Softmax(axis=-1)
+
+ def forward(self, x, mask=None):
+ """ Forward function.
+ Args:
+ x: input features with shape of (num_windows*B, N, C)
+ mask: (0/-inf) mask with shape of (num_windows, N, N) or None
+ """
+ B_, N, C = x.shape
+ qkv = self.qkv(x).reshape(
+ [B_, N, 3, self.num_heads,
+ C // self.num_heads]).transpose([2, 0, 3, 1, 4])
+ q, k, v = qkv[0], qkv[1], qkv[2] # B_, nH, N, C
+
+ q = q * self.scale
+ attn = q @ k.transpose([0, 1, 3, 2])
+
+ relative_position_bias = self.relative_position_bias_table[
+ self.relative_position_index[:N, :N].reshape([-1])].reshape(
+ [N, N, -1]) # Wd*Wh*Ww,Wd*Wh*Ww,nH
+ relative_position_bias = relative_position_bias.transpose(
+ [2, 0, 1]) # nH, Wd*Wh*Ww, Wd*Wh*Ww
+ attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N
+
+ if mask is not None:
+ nW = mask.shape[0]
+ attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N
+ ]) + mask.unsqueeze(1).unsqueeze(0)
+ attn = attn.reshape([-1, self.num_heads, N, N])
+ attn = self.softmax(attn)
+ else:
+ attn = self.softmax(attn)
+
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B_, N, C])
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+
+class SwinTransformerBlock3D(nn.Layer):
+ """ Swin Transformer Block.
+
+ Args:
+ dim (int): Number of input channels.
+ num_heads (int): Number of attention heads.
+ window_size (tuple[int]): Window size.
+ shift_size (tuple[int]): Shift size for SW-MSA.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+ drop (float, optional): Dropout rate. Default: 0.0
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
+ act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
+ norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
+ """
+ def __init__(self,
+ dim,
+ num_heads,
+ window_size=(2, 7, 7),
+ shift_size=(0, 0, 0),
+ mlp_ratio=4.,
+ qkv_bias=True,
+ qk_scale=None,
+ drop=0.,
+ attn_drop=0.,
+ drop_path=0.,
+ act_layer=nn.GELU,
+ norm_layer=nn.LayerNorm,
+ use_checkpoint=False):
+ super().__init__()
+ self.dim = dim
+ self.num_heads = num_heads
+ self.window_size = window_size
+ self.shift_size = shift_size
+ self.mlp_ratio = mlp_ratio
+ # self.use_checkpoint=use_checkpoint
+
+ assert 0 <= self.shift_size[0] < self.window_size[
+ 0], "shift_size must in 0-window_size"
+ assert 0 <= self.shift_size[1] < self.window_size[
+ 1], "shift_size must in 0-window_size"
+ assert 0 <= self.shift_size[2] < self.window_size[
+ 2], "shift_size must in 0-window_size"
+
+ self.norm1 = norm_layer(dim)
+ self.attn = WindowAttention3D(dim,
+ window_size=self.window_size,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ attn_drop=attn_drop,
+ proj_drop=drop)
+
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim,
+ hidden_features=mlp_hidden_dim,
+ act_layer=act_layer,
+ drop=drop)
+
+ def forward_part1(self, x, mask_matrix):
+ B = paddle.shape(x)[0]
+ _, D, H, W, C = x.shape
+ window_size, shift_size = get_window_size((D, H, W), self.window_size,
+ self.shift_size)
+
+ x = self.norm1(x)
+ # pad feature maps to multiples of window size
+ pad_l = pad_t = pad_d0 = 0
+ pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0]
+ pad_b = (window_size[1] - H % window_size[1]) % window_size[1]
+ pad_r = (window_size[2] - W % window_size[2]) % window_size[2]
+ x = F.pad(x, (pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1),
+ data_format='NDHWC')
+ _, Dp, Hp, Wp, _ = x.shape
+ # cyclic shift
+ if any(i > 0 for i in shift_size):
+ shifted_x = paddle.roll(x,
+ shifts=(-shift_size[0], -shift_size[1],
+ -shift_size[2]),
+ axis=(1, 2, 3))
+ attn_mask = mask_matrix
+ else:
+ shifted_x = x
+ attn_mask = None
+ # partition windows
+ x_windows = window_partition(shifted_x,
+ window_size) # B*nW, Wd*Wh*Ww, C
+ # W-MSA/SW-MSA
+ attn_windows = self.attn(x_windows, mask=attn_mask) # B*nW, Wd*Wh*Ww, C
+ # merge windows
+ attn_windows = attn_windows.reshape([-1, *(window_size + (C, ))])
+ shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp,
+ Wp) # B D' H' W' C
+ # reverse cyclic shift
+ if any(i > 0 for i in shift_size):
+ x = paddle.roll(shifted_x,
+ shifts=(shift_size[0], shift_size[1],
+ shift_size[2]),
+ axis=(1, 2, 3))
+ else:
+ x = shifted_x
+
+ if pad_d1 > 0 or pad_r > 0 or pad_b > 0:
+ x = x[:, :D, :H, :W, :]
+ return x
+
+ def forward_part2(self, x):
+ return self.drop_path(self.mlp(self.norm2(x)))
+
+ def forward(self, x, mask_matrix):
+ """ Forward function.
+
+ Args:
+ x: Input feature, tensor size (B, D, H, W, C).
+ mask_matrix: Attention mask for cyclic shift.
+ """
+
+ shortcut = x
+ x = self.forward_part1(x, mask_matrix)
+ x = shortcut + self.drop_path(x)
+ x = x + self.forward_part2(x)
+
+ return x
+
+
+class PatchMerging(nn.Layer):
+ """ Patch Merging Layer
+
+ Args:
+ dim (int): Number of input channels.
+ norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
+ """
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.dim = dim
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
+ self.norm = norm_layer(4 * dim)
+
+ def forward(self, x):
+ """ Forward function.
+
+ Args:
+ x: Input feature, tensor size (B, D, H, W, C).
+ """
+ B, D, H, W, C = x.shape
+
+ # padding
+ pad_input = (H % 2 == 1) or (W % 2 == 1)
+ if pad_input:
+ x = F.pad(x, (0, W % 2, 0, H % 2, 0, 0), data_format='NDHWC')
+
+ x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C
+ x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C
+ x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C
+ x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C
+ x = paddle.concat([x0, x1, x2, x3], -1) # B D H/2 W/2 4*C
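+        # 2x2 spatially-neighboring patches are stacked channel-wise (4*C) and then
+        # linearly projected back to 2*C, halving H and W while keeping D unchanged.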
+
+ x = self.norm(x)
+ x = self.reduction(x)
+
+ return x
+
+
+# cache each stage results
+@lru_cache()
+def compute_mask(D, H, W, window_size, shift_size):
+ img_mask = paddle.zeros((1, D, H, W, 1)) # 1 Dp Hp Wp 1
+ cnt = 0
+ for d in slice(-window_size[0]), slice(-window_size[0],
+ -shift_size[0]), slice(
+ -shift_size[0], None):
+ for h in slice(-window_size[1]), slice(-window_size[1],
+ -shift_size[1]), slice(
+ -shift_size[1], None):
+ for w in slice(-window_size[2]), slice(-window_size[2],
+ -shift_size[2]), slice(
+ -shift_size[2], None):
+ img_mask[:, d, h, w, :] = cnt
+ cnt += 1
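+    # Each region created by the cyclic shift now carries a distinct label; after
+    # window partitioning, position pairs with different labels get -100 added to
+    # their attention logits so shifted windows do not attend across the wrap-around
+    # boundary.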
+ mask_windows = window_partition(img_mask,
+ window_size) # nW, ws[0]*ws[1]*ws[2], 1
+ mask_windows = mask_windows.squeeze(-1) # nW, ws[0]*ws[1]*ws[2]
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+ # attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+ huns = -100.0 * paddle.ones_like(attn_mask)
+ attn_mask = huns * (attn_mask != 0).astype("float32")
+ return attn_mask
+
+
+class BasicLayer(nn.Layer):
+ """ A basic Swin Transformer layer for one stage.
+
+ Args:
+ dim (int): Number of feature channels
+ depth (int): Depths of this stage.
+ num_heads (int): Number of attention head.
+ window_size (tuple[int]): Local window size. Default: (1,7,7).
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+ drop (float, optional): Dropout rate. Default: 0.0
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
+ norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
+ downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
+ """
+ def __init__(self,
+ dim,
+ depth,
+ num_heads,
+ window_size=(1, 7, 7),
+ mlp_ratio=4.,
+ qkv_bias=False,
+ qk_scale=None,
+ drop=0.,
+ attn_drop=0.,
+ drop_path=0.,
+ norm_layer=nn.LayerNorm,
+ downsample=None,
+ use_checkpoint=False):
+ super().__init__()
+ self.window_size = window_size
+ self.shift_size = tuple(i // 2 for i in window_size)
+ self.depth = depth
+ self.use_checkpoint = use_checkpoint
+
+ # build blocks
+ self.blocks = nn.LayerList([
+ SwinTransformerBlock3D(
+ dim=dim,
+ num_heads=num_heads,
+ window_size=window_size,
+ shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ drop=drop,
+ attn_drop=attn_drop,
+ drop_path=drop_path[i]
+ if isinstance(drop_path, list) else drop_path,
+ norm_layer=norm_layer,
+ use_checkpoint=use_checkpoint,
+ ) for i in range(depth)
+ ])
+
+ self.downsample = downsample
+ if self.downsample is not None:
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+
+ def forward(self, x):
+ """ Forward function.
+
+ Args:
+ x: Input feature, tensor size (B, C, D, H, W).
+ """
+ # calculate attention mask for SW-MSA
+ B = paddle.shape(x)[0]
+ _, C, D, H, W = x.shape
+ window_size, shift_size = get_window_size((D, H, W), self.window_size,
+ self.shift_size)
+ # x = rearrange(x, 'b c d h w -> b d h w c')
+ x = x.transpose([0, 2, 3, 4, 1])
+ Dp = int(np.ceil(D / window_size[0])) * window_size[0]
+ Hp = int(np.ceil(H / window_size[1])) * window_size[1]
+ Wp = int(np.ceil(W / window_size[2])) * window_size[2]
+ attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size)
+ for blk in self.blocks:
+ x = blk(x, attn_mask)
+ x = x.reshape([B, D, H, W, C])
+
+ if self.downsample is not None:
+ x = self.downsample(x)
+ x = x.transpose([0, 4, 1, 2, 3])
+ return x
+
+
+class PatchEmbed3D(nn.Layer):
+ """ Video to Patch Embedding.
+
+ Args:
+ patch_size (int): Patch token size. Default: (2,4,4).
+ in_chans (int): Number of input video channels. Default: 3.
+ embed_dim (int): Number of linear projection output channels. Default: 96.
+ norm_layer (nn.Layer, optional): Normalization layer. Default: None
+ """
+ def __init__(self,
+ patch_size=(2, 4, 4),
+ in_chans=3,
+ embed_dim=96,
+ norm_layer=None):
+ super().__init__()
+ self.patch_size = patch_size
+
+ self.in_chans = in_chans
+ self.embed_dim = embed_dim
+
+ self.proj = nn.Conv3D(in_chans,
+ embed_dim,
+ kernel_size=patch_size,
+ stride=patch_size)
+ if norm_layer is not None:
+ self.norm = norm_layer(embed_dim)
+ else:
+ self.norm = None
+
+ def forward(self, x):
+ _, _, D, H, W = x.shape
+ if W % self.patch_size[2] != 0:
+ x = F.pad(
+ x, (0, self.patch_size[2] - W % self.patch_size[2], 0, 0, 0, 0),
+ data_format='NCDHW')
+ if H % self.patch_size[1] != 0:
+ x = F.pad(
+ x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1], 0, 0),
+ data_format='NCDHW')
+ if D % self.patch_size[0] != 0:
+ x = F.pad(
+ x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]),
+ data_format='NCDHW')
+
+ x = self.proj(x) # B C D Wh Ww
+ if self.norm is not None:
+ D, Wh, Ww = x.shape[2], x.shape[3], x.shape[4]
+ x = x.flatten(2).transpose([0, 2, 1])
+ x = self.norm(x)
+ x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, D, Wh, Ww])
+
+ return x
+
+
+@BACKBONES.register()
+class SwinTransformer3D(nn.Layer):
+ """ Swin Transformer backbone.
+ A Paddle impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
+ https://arxiv.org/pdf/2103.14030
+
+ Args:
+ patch_size (int | tuple(int)): Patch size. Default: (4,4,4).
+ in_chans (int): Number of input image channels. Default: 3.
+ embed_dim (int): Number of linear projection output channels. Default: 96.
+ depths (tuple[int]): Depths of each Swin Transformer stage.
+ num_heads (tuple[int]): Number of attention head of each stage.
+        window_size (tuple[int]): Window size. Default: (2,7,7).
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True.
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+ drop_rate (float): Dropout rate.
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+ norm_layer: Normalization layer. Default: nn.LayerNorm.
+ patch_norm (bool): If True, add normalization after patch embedding. Default: False.
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+ -1 means not freezing any parameters.
+ """
+ def __init__(self,
+ pretrained=None,
+ patch_size=(4, 4, 4),
+ in_chans=3,
+ embed_dim=96,
+ depths=[2, 2, 6, 2],
+ num_heads=[3, 6, 12, 24],
+ window_size=(2, 7, 7),
+ mlp_ratio=4.,
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.2,
+ norm_layer=nn.LayerNorm,
+ patch_norm=False,
+ frozen_stages=-1,
+ use_checkpoint=False):
+ super().__init__()
+
+ self.pretrained = pretrained
+ self.num_layers = len(depths)
+ self.embed_dim = embed_dim
+ self.patch_norm = patch_norm
+ self.frozen_stages = frozen_stages
+ self.window_size = window_size
+ self.patch_size = patch_size
+
+ # split image into non-overlapping patches
+ self.patch_embed = PatchEmbed3D(
+ patch_size=patch_size,
+ in_chans=in_chans,
+ embed_dim=embed_dim,
+ norm_layer=norm_layer if self.patch_norm else None)
+
+ self.pos_drop = nn.Dropout(p=drop_rate)
+
+ # stochastic depth
+ dpr = [
+ x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths))
+ ] # stochastic depth decay rule
+
+ # build layers
+ self.layers = nn.LayerList()
+ for i_layer in range(self.num_layers):
+ layer = BasicLayer(
+ dim=int(embed_dim * 2**i_layer),
+ depth=depths[i_layer],
+ num_heads=num_heads[i_layer],
+ window_size=window_size,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ drop=drop_rate,
+ attn_drop=attn_drop_rate,
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+ norm_layer=norm_layer,
+ downsample=PatchMerging
+ if i_layer < self.num_layers - 1 else None,
+ use_checkpoint=use_checkpoint)
+ self.layers.append(layer)
+
+ self.num_features = int(embed_dim * 2**(self.num_layers - 1))
+
+ # add a norm layer for each output
+ self.norm = norm_layer(self.num_features)
+
+ self._freeze_stages()
+
+ def _freeze_stages(self):
+ if self.frozen_stages >= 0:
+ self.patch_embed.eval()
+ for param in self.patch_embed.parameters():
+ param.stop_gradient = True
+
+ if self.frozen_stages >= 1:
+ self.pos_drop.eval()
+ for i in range(0, self.frozen_stages):
+ m = self.layers[i]
+ m.eval()
+ for param in m.parameters():
+ param.stop_gradient = True
+
+ def _init_fn(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=0.02)
+ if m.bias is not None:
+ zeros_(m.bias)
+ elif isinstance(m, nn.LayerNorm):
+ zeros_(m.bias)
+ ones_(m.weight)
+
+ def init_weights(self):
+ """Initialize the weights in backbone.
+
+ Args:
+ pretrained (str, optional): Path to pre-trained weights.
+ Defaults to None.
+ """
+        # First, initialize the model's weights.
+        self.apply(self._init_fn)
+        # Second, if a pretrained checkpoint is provided, load it.
+ if isinstance(
+ self.pretrained, str
+ ) and self.pretrained.strip() != "": # load pretrained weights
+ load_ckpt(self, self.pretrained)
+ elif self.pretrained is None or self.pretrained.strip() == "":
+ pass
+ else:
+ raise NotImplementedError
+
+ def forward(self, x):
+ """Forward function."""
+ x = self.patch_embed(x)
+ x = self.pos_drop(x)
+
+ for layer in self.layers:
+ x = layer(x)
+
+ x = x.transpose([0, 2, 3, 4, 1])
+ x = self.norm(x)
+ x = x.transpose([0, 4, 1, 2, 3])
+ return x
+
+ def train(self, mode=True):
+ """Convert the model into training mode while keep layers freezed."""
+ super(SwinTransformer3D, self).train(mode)
+ self._freeze_stages()
diff --git a/paddlevideo/modeling/backbones/transnetv2.py b/paddlevideo/modeling/backbones/transnetv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..60603e2c9a544258a93a532b1e46c8bfb3a3b441
--- /dev/null
+++ b/paddlevideo/modeling/backbones/transnetv2.py
@@ -0,0 +1,582 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as functional
+import random
+from paddle import ParamAttr
+
+from ..registry import BACKBONES
+
+
+class OctConv3D(nn.Layer):
+ def __init__(self, in_filters, filters, kernel_size=3, dilation_rate=(1, 1, 1), alpha=0.25,
+ use_bias=True, kernel_initializer=nn.initializer.KaimingNormal()):
+ super(OctConv3D, self).__init__()
+
+ self.low_channels = int(filters * alpha)
+ self.high_channels = filters - self.low_channels
+
+ self.high_to_high = nn.Conv3D(in_filters, self.high_channels, kernel_size=kernel_size,
+ dilation=dilation_rate, padding=(dilation_rate[0], 1, 1),
+ weight_attr=ParamAttr(initializer=kernel_initializer),
+ bias_attr=ParamAttr(
+ initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias)
+ self.high_to_low = nn.Conv3D(self.high_channels, self.low_channels, kernel_size=kernel_size,
+ dilation=dilation_rate, padding=(dilation_rate[0], 1, 1),
+ weight_attr=ParamAttr(initializer=kernel_initializer),
+ bias_attr=False)
+ self.low_to_high = nn.Conv3D(in_filters, self.high_channels, kernel_size=kernel_size,
+ dilation=dilation_rate, padding=(dilation_rate[0], 1, 1),
+ weight_attr=ParamAttr(initializer=kernel_initializer),
+ bias_attr=False)
+ self.low_to_low = nn.Conv3D(self.high_channels, self.low_channels, kernel_size=kernel_size,
+ dilation=dilation_rate, padding=(dilation_rate[0], 1, 1),
+ weight_attr=ParamAttr(initializer=kernel_initializer),
+ bias_attr=ParamAttr(
+ initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias)
+        self.upsampler = nn.Upsample(scale_factor=(1, 2, 2), data_format='NCDHW')  # upsample the low-frequency branch 2x spatially
+ self.downsampler = nn.AvgPool3D(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=(0, 1, 1))
+
+ @staticmethod
+ def pad_to(tensor, target_shape):
+ shape = tensor.shape
+ padding = [[0, tar - curr] for curr, tar in zip(shape, target_shape)]
+ return functional.pad(tensor, padding, "CONSTANT", data_format='NCDHW')
+
+ @staticmethod
+ def crop_to(tensor, target_width, target_height):
+ return tensor[:, :, :target_height, :target_width]
+
+ def forward(self, inputs):
+ low_inputs, high_inputs = inputs
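+        # Octave convolution: the input is split into a high-resolution
+        # (high-frequency) branch and a pooled low-resolution (low-frequency)
+        # branch; each branch is convolved and also exchanged with the other via
+        # down/upsampling before the two outputs are summed per branch.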
+
+ high_to_high = self.high_to_high(high_inputs)
+ high_to_low = self.high_to_low(self.downsampler(high_inputs))
+
+ low_to_high = self.upsampler(self.low_to_high(low_inputs))
+ low_to_low = self.low_to_low(low_inputs)
+
+ high_output = high_to_high[:, :, :, :low_to_high.shape[3], :low_to_high.shape[4]] + low_to_high
+ low_output = low_to_low + high_to_low[:, :, :, :low_to_low.shape[3], :low_to_low.shape[4]]
+
+ return low_output, high_output
+
+
+class Conv3DConfigurable(nn.Layer):
+ def __init__(self,
+ in_filters,
+ filters,
+ dilation_rate,
+ separable=True,
+ octave=False,
+ use_bias=True):
+ super(Conv3DConfigurable, self).__init__()
+ assert not (separable and octave)
+
+ if separable:
+ conv1 = nn.Conv3D(in_filters, 2 * filters, kernel_size=(1, 3, 3),
+ dilation=(1, 1, 1), padding=(0, 1, 1),
+ weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
+ bias_attr=False)
+ conv2 = nn.Conv3D(2 * filters, filters, kernel_size=(3, 1, 1),
+ dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 0, 0),
+ weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
+ bias_attr=ParamAttr(
+ initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias)
+ self.layers = nn.LayerList([conv1, conv2])
+ elif octave:
+ conv = OctConv3D(in_filters, filters, kernel_size=3, dilation_rate=(dilation_rate, 1, 1),
+ use_bias=use_bias,
+ kernel_initializer=nn.initializer.KaimingNormal())
+ self.layers = [conv]
+ else:
+ conv = nn.Conv3D(in_filters, filters, kernel_size=3,
+ dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 1, 1),
+ weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
+ bias_attr=ParamAttr(
+ initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias)
+ self.layers = nn.LayerList([conv])
+
+ def forward(self, inputs):
+ x = inputs
+ for layer in self.layers:
+ x = layer(x)
+ return x
+
+
+class DilatedDCNNV2(nn.Layer):
+ def __init__(self,
+ in_filters,
+ filters,
+ batch_norm=True,
+ activation=None,
+ octave_conv=False):
+ super(DilatedDCNNV2, self).__init__()
+ assert not (octave_conv and batch_norm)
+
+ self.Conv3D_1 = Conv3DConfigurable(in_filters, filters, 1, use_bias=not batch_norm, octave=octave_conv)
+ self.Conv3D_2 = Conv3DConfigurable(in_filters, filters, 2, use_bias=not batch_norm, octave=octave_conv)
+ self.Conv3D_4 = Conv3DConfigurable(in_filters, filters, 4, use_bias=not batch_norm, octave=octave_conv)
+ self.Conv3D_8 = Conv3DConfigurable(in_filters, filters, 8, use_bias=not batch_norm, octave=octave_conv)
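+        # Four parallel 3D convolutions with temporal dilation rates 1, 2, 4 and 8;
+        # their outputs are concatenated along channels (4 * filters), which is why
+        # the BatchNorm below is built with filters * 4 features.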
+ self.octave = octave_conv
+
+ self.bn = nn.BatchNorm3D(filters * 4, momentum=0.99, epsilon=1e-03,
+ weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)),
+ bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))
+ ) if batch_norm else None
+ self.activation = activation
+
+ def forward(self, inputs):
+ conv1 = self.Conv3D_1(inputs)
+ conv2 = self.Conv3D_2(inputs)
+ conv3 = self.Conv3D_4(inputs)
+ conv4 = self.Conv3D_8(inputs)
+
+        # each of the four branches produces `filters` channels (a [low, high] pair in octave mode); concatenate them along the channel dimension
+ if self.octave:
+ x = [paddle.concat([conv1[0], conv2[0], conv3[0], conv4[0]], axis=1),
+ paddle.concat([conv1[1], conv2[1], conv3[1], conv4[1]], axis=1)]
+ else:
+ x = paddle.concat([conv1, conv2, conv3, conv4], axis=1)
+
+ if self.bn is not None:
+ x = self.bn(x)
+
+ if self.activation is not None:
+ if self.octave:
+ x = [self.activation(x[0]), self.activation(x[1])]
+ else:
+ x = self.activation(x)
+ return x
+
+
+class StackedDDCNNV2(nn.Layer):
+ def __init__(self,
+ in_filters,
+ n_blocks,
+ filters,
+ shortcut=True,
+ use_octave_conv=False,
+ pool_type="avg",
+ stochastic_depth_drop_prob=0.0):
+ super(StackedDDCNNV2, self).__init__()
+ assert pool_type == "max" or pool_type == "avg"
+ if use_octave_conv and pool_type == "max":
+ print("WARN: Octave convolution was designed with average pooling, not max pooling.")
+
+ self.shortcut = shortcut
+ self.DDCNN = nn.LayerList([
+ DilatedDCNNV2(in_filters if i == 1 else filters * 4, filters, octave_conv=use_octave_conv,
+ activation=functional.relu if i != n_blocks else None) for i in range(1, n_blocks + 1)
+ ])
+ self.pool = nn.MaxPool3D(kernel_size=(1, 2, 2)) if pool_type == "max" else nn.AvgPool3D(kernel_size=(1, 2, 2))
+ self.octave = use_octave_conv
+ self.stochastic_depth_drop_prob = stochastic_depth_drop_prob
+
+ def forward(self, inputs):
+ x = inputs
+ shortcut = None
+
+ if self.octave:
+ x = [self.pool(x), x]
+ for block in self.DDCNN:
+ x = block(x)
+ if shortcut is None:
+ shortcut = x
+ # x is a [low, high] pair of [B, C, T, H, W] tensors; pool the high-resolution branch and concat along channels
+ if self.octave:
+ x = paddle.concat([x[0], self.pool(x[1])], axis=1)
+
+ x = functional.relu(x)
+
+ if self.shortcut:
+ if self.stochastic_depth_drop_prob != 0.:
+ if self.training:
+ if random.random() < self.stochastic_depth_drop_prob:
+ x = shortcut
+ else:
+ x = x + shortcut
+ else:
+ x = (1 - self.stochastic_depth_drop_prob) * x + shortcut
+ else:
+ x += shortcut
+
+ if not self.octave:
+ x = self.pool(x)
+ return x
+
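+# Residual behaviour of StackedDDCNNV2: the shortcut is the output of the first
+# DDCNN block. With stochastic_depth_drop_prob > 0, training randomly replaces
+# the block output with the shortcut (i.e. drops the residual branch with that
+# probability), while inference scales the branch by (1 - drop_prob) before
+# adding the shortcut, which is the usual stochastic-depth expectation
+# correction.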
+
+class ResNetBlock(nn.Layer):
+ def __init__(self, in_filters, filters, strides=(1, 1)):
+ super(ResNetBlock, self).__init__()
+
+ self.conv1 = nn.Conv2D(in_filters, filters, kernel_size=(3, 3), stride=strides, padding=(1, 1),
+ weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),
+ bias_attr=False)
+ self.bn1 = nn.BatchNorm2D(filters,
+ weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)),
+ bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))
+
+ self.conv2 = nn.Conv2D(filters, filters, kernel_size=(3, 3), padding=(1, 1),
+ weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),
+ bias_attr=False)
+ self.bn2 = nn.BatchNorm2D(filters,
+ weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)),
+ bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))
+
+ def forward(self, inputs):
+ x = self.conv1(inputs)
+ x = self.bn1(x)
+ x = functional.relu(x)
+
+ x = self.conv2(x)
+ x = self.bn2(x)
+
+ shortcut = inputs
+ x += shortcut
+
+ return functional.relu(x)
+
+
+class ResNetFeatures(nn.Layer):
+ def __init__(self, in_filters=3,
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]):
+ super(ResNetFeatures, self).__init__()
+ self.conv1 = nn.Conv2D(in_channels=in_filters, out_channels=64, kernel_size=(7, 7),
+ stride=(2, 2), padding=(3, 3),
+ weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),
+ bias_attr=False)
+ self.bn1 = nn.BatchNorm2D(num_features=64, momentum=0.99, epsilon=1e-03,
+ weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)),
+ bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))
+ )
+ self.max_pool = nn.MaxPool2D(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
+
+ self.layer2a = ResNetBlock(64, 64)
+ self.layer2b = ResNetBlock(64, 64)
+
+ self.mean = paddle.to_tensor(mean)
+ self.std = paddle.to_tensor(std)
+
+ def forward(self, inputs):
+ shape = inputs.shape
+ x = paddle.reshape(inputs, [shape[0] * shape[2], shape[1], shape[3], shape[4]])
+ x = (x - self.mean) / self.std
+
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = functional.relu(x)
+ x = self.max_pool(x)
+ x = self.layer2a(x)
+ x = self.layer2b(x)
+
+ new_shape = x.shape
+ x = paddle.reshape(x, [shape[0], new_shape[1], shape[2], new_shape[2], new_shape[3]])
+ return x
+
+
+class FrameSimilarity(nn.Layer):
+ def __init__(self,
+ in_filters,
+ similarity_dim=128,
+ lookup_window=101,
+ output_dim=128,
+ stop_gradient=False,
+ use_bias=False):
+ super(FrameSimilarity, self).__init__()
+ self.projection = nn.Linear(in_filters, similarity_dim,
+ weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),
+ bias_attr=use_bias)
+ self.fc = nn.Linear(lookup_window, output_dim,
+ weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),
+ bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))
+
+ self.lookup_window = lookup_window
+ self.stop_gradient = stop_gradient
+ assert lookup_window % 2 == 1, "`lookup_window` must be an odd integer"
+
+ def forward(self, inputs):
+ x = paddle.concat([paddle.mean(x, axis=[3, 4]) for x in inputs], axis=1)
+ x = paddle.transpose(x, (0, 2, 1))
+
+ if self.stop_gradient:
+ x.stop_gradient = True
+
+ x = self.projection(x)
+ x = functional.normalize(x, p=2, axis=2)
+ batch_size = paddle.slice(x.shape, starts=[0], ends=[1], axes=[0]) if x.shape[0] == -1 else x.shape[0]
+ time_window = x.shape[1]
+ similarities = paddle.bmm(x, x.transpose([0, 2, 1])) # [batch_size, time_window, time_window]
+
+ similarities_padded = functional.pad(similarities,
+ [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2],
+ data_format='NCL')
+
+ batch_indices = paddle.arange(0, batch_size).reshape([batch_size, 1, 1])
+ batch_indices = paddle.tile(batch_indices, [1, time_window, self.lookup_window])
+ time_indices = paddle.arange(0, time_window).reshape([1, time_window, 1])
+ time_indices = paddle.tile(time_indices, [batch_size, 1, self.lookup_window])
+ lookup_indices = paddle.arange(0, self.lookup_window).reshape([1, 1, self.lookup_window])
+ lookup_indices = paddle.tile(lookup_indices, [batch_size, time_window, 1]) + time_indices
+ indices = paddle.stack([batch_indices, time_indices, lookup_indices], -1)
+ similarities = paddle.gather_nd(similarities_padded, indices)
+ return functional.relu(self.fc(similarities))
+
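+# FrameSimilarity computes, for every frame, its cosine similarity to the frames
+# inside a +/-(lookup_window - 1) / 2 neighbourhood: pooled block features are
+# projected and L2-normalised, the full [B, T, T] similarity matrix is padded
+# along time, and gather_nd with the constructed (batch, time, offset) indices
+# extracts a banded [B, T, lookup_window] slice that feeds the final FC layer.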
+
+class ConvexCombinationRegularization(nn.Layer):
+ def __init__(self, in_filters, filters=32, delta_scale=10., loss_weight=0.01):
+ super(ConvexCombinationRegularization, self).__init__()
+
+ self.projection = nn.Conv3D(in_filters, filters, kernel_size=1, dilation=1, padding=(0, 0, 0),
+ weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),
+ bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))
+ self.features = nn.Conv3D((filters * 3), filters * 2,
+ kernel_size=(3, 3, 3), dilation=1, padding=(1, 1, 1),
+ weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),
+ bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))
+ self.dense = nn.Linear(64, 1, weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), bias_attr=True)
+ self.loss = nn.SmoothL1Loss(reduction='none')
+ self.delta_scale = delta_scale
+ self.loss_weight = loss_weight
+
+ def forward(self, image_inputs, feature_inputs):
+ x = feature_inputs
+ x = self.projection(x)
+ x = functional.relu(x)
+ batch_size = x.shape[0]
+ window_size = x.shape[2]
+ first_frame = paddle.tile(x[:, :, :1], [1, 1, window_size, 1, 1])
+ last_frame = paddle.tile(x[:, :, -1:], [1, 1, window_size, 1, 1])
+ x = paddle.concat([x, first_frame, last_frame], 1)
+ x = self.features(x)
+ x = functional.relu(x)
+ x = paddle.mean(x, axis=[3, 4])
+ x = paddle.transpose(x, (0, 2, 1))
+ alpha = self.dense(x)
+ alpha = paddle.transpose(alpha, (0, 2, 1))
+
+ first_img = paddle.tile(image_inputs[:, :, :1], [1, 1, window_size, 1, 1])
+ last_img = paddle.tile(image_inputs[:, :, -1:], [1, 1, window_size, 1, 1])
+
+ alpha_ = functional.sigmoid(alpha)
+ alpha_ = paddle.reshape(alpha_, [batch_size, 1, window_size, 1, 1])
+ predictions_ = (alpha_ * first_img + (1 - alpha_) * last_img)
+ loss_ = self.loss(label=image_inputs / self.delta_scale, input=predictions_ / self.delta_scale)
+ loss_ = self.loss_weight * paddle.mean(loss_)
+ return alpha, loss_
+
+
+class ColorHistograms(nn.Layer):
+ def __init__(self,
+ lookup_window=101,
+ output_dim=None):
+ super(ColorHistograms, self).__init__()
+
+ self.fc = nn.Linear(lookup_window, output_dim,
+ weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),
+ bias_attr=ParamAttr(
+ initializer=nn.initializer.Constant(value=0.))) if output_dim is not None else None
+ self.lookup_window = lookup_window
+ assert lookup_window % 2 == 1, "`lookup_window` must be an odd integer"
+
+ def compute_color_histograms(self, frames):
+ frames = frames.astype('int32')
+
+ def get_bin(frames):
+ # returns 0 .. 511
+ R, G, B = frames[:, :, 0], frames[:, :, 1], frames[:, :, 2]
+ R, G, B = R // 32, G // 32, B // 32
+ return (R * 64) + (G * 8) + B
+
+ batch_size = paddle.slice(frames.shape, starts=[0], ends=[1], axes=[0]) if frames.shape[0] == -1 else frames.shape[0]
+ time_window, height, width, no_channels = frames.shape[1:]
+
+ assert no_channels == 3 or no_channels == 6
+ if no_channels == 3:
+ frames_flatten = frames.reshape([-1, height * width, 3])
+ else:
+ frames_flatten = frames.reshape([-1, height * width * 2, 3])
+
+ binned_values = get_bin(frames_flatten)
+
+ frame_bin_prefix = (paddle.arange(0, batch_size * time_window) * 512).reshape([-1, 1])
+ binned_values = (binned_values + frame_bin_prefix).reshape([-1, 1])
+ histograms = paddle.zeros_like(frame_bin_prefix, dtype='int32').tile([512]).reshape([-1])
+ histograms = histograms.scatter_nd_add(binned_values, paddle.ones_like(binned_values, dtype='int32').reshape([-1]))
+ histograms = histograms.reshape([batch_size, time_window, 512]).astype('float32')
+ histograms_normalized = functional.normalize(histograms, p=2, axis=2)
+ return histograms_normalized
+
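+ # compute_color_histograms above quantises each RGB channel to 8 levels
+ # (value // 32) and fuses them into a 512-bin index (R * 64 + G * 8 + B);
+ # per-frame counts are accumulated with scatter_nd_add and the resulting
+ # [B, T, 512] histograms are L2-normalised before the banded similarity
+ # lookup in forward(), which mirrors the FrameSimilarity gather above.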
+ def forward(self, inputs):
+ x = self.compute_color_histograms(inputs)
+ batch_size = paddle.slice(x.shape, starts=[0], ends=[1], axes=[0]) if x.shape[0] == -1 else x.shape[0]
+ time_window = x.shape[1]
+ similarities = paddle.bmm(x, x.transpose([0, 2, 1])) # [batch_size, time_window, time_window]
+ similarities_padded = functional.pad(similarities,
+ [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2],
+ data_format='NCL')
+
+ batch_indices = paddle.arange(0, batch_size).reshape([batch_size, 1, 1])
+ batch_indices = paddle.tile(batch_indices, [1, time_window, self.lookup_window])
+ time_indices = paddle.arange(0, time_window).reshape([1, time_window, 1])
+ time_indices = paddle.tile(time_indices, [batch_size, 1, self.lookup_window])
+ lookup_indices = paddle.arange(0, self.lookup_window).reshape([1, 1, self.lookup_window])
+ lookup_indices = paddle.tile(lookup_indices, [batch_size, time_window, 1]) + time_indices
+
+ indices = paddle.stack([batch_indices, time_indices, lookup_indices], -1)
+ similarities = paddle.gather_nd(similarities_padded, indices)
+
+ if self.fc is not None:
+ return functional.relu(self.fc(similarities))
+ return similarities
+
+
+@BACKBONES.register()
+class TransNetV2(nn.Layer):
+ """TransNetV2 model from
+ "TransNet V2: An effective deep network architecture for fast shot transition detection".
+ """
+ def __init__(self,
+ F=16, L=3, S=2, D=1024,
+ use_many_hot_targets=True,
+ use_frame_similarity=True,
+ use_color_histograms=True,
+ use_mean_pooling=False,
+ dropout_rate=0.5,
+ use_convex_comb_reg=False,
+ use_resnet_features=False,
+ use_resnet_like_top=False,
+ frame_similarity_on_last_layer=False,
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]):
+ super(TransNetV2, self).__init__()
+
+ self.mean = np.array(mean, np.float32).reshape([1, 3, 1, 1]) * 255
+ self.std = np.array(std, np.float32).reshape([1, 3, 1, 1]) * 255
+
+ self.use_resnet_features = use_resnet_features
+ self.resnet_layers = ResNetFeatures(in_filters=3, mean=self.mean, std=self.std) if self.use_resnet_features else None
+ self.resnet_like_top = use_resnet_like_top
+ if self.resnet_like_top:
+ self.resnet_like_top_conv = nn.Conv3D(64 if self.use_resnet_features else 3, 32, kernel_size=(3, 7, 7),
+ stride=(1, 2, 2),
+ padding=(1, 3, 3),
+ weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),
+ bias_attr=False)
+ self.resnet_like_top_bn = nn.BatchNorm3D(32, momentum=0.99, epsilon=1e-03,
+ weight_attr=ParamAttr(
+ initializer=nn.initializer.Constant(value=1.)),
+ bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))
+ self.resnet_like_top_max_pool = nn.MaxPool3D(kernel_size=(1, 3, 3), stride=(1, 2, 2),
+ padding=(0, 1, 1))
+
+ if self.resnet_like_top:
+ in_filters = 32
+ elif self.use_resnet_features:
+ in_filters = 64
+ else:
+ in_filters = 3
+ self.SDDCNN = nn.LayerList(
+ [StackedDDCNNV2(in_filters=in_filters, n_blocks=S, filters=F,
+ stochastic_depth_drop_prob=0.)] +
+ [StackedDDCNNV2(in_filters=(F * 2 ** (i - 1)) * 4, n_blocks=S, filters=F * 2 ** i) for i in range(1, L)]
+ )
+
+ self.frame_sim_layer = FrameSimilarity(
+ sum([(F * 2 ** i) * 4 for i in range(L)]), lookup_window=101, output_dim=128, similarity_dim=128,
+ use_bias=True
+ ) if use_frame_similarity else None
+ self.color_hist_layer = ColorHistograms(
+ lookup_window=101, output_dim=128
+ ) if use_color_histograms else None
+
+ self.dropout = nn.Dropout(dropout_rate) if dropout_rate is not None else None
+
+ output_dim = ((F * 2 ** (L - 1)) * 4) * 3 * 6 # 3x6 for spatial dimensions
+ if use_frame_similarity: output_dim += 128
+ if use_color_histograms: output_dim += 128
+
+ self.use_mean_pooling = use_mean_pooling
+
+ self.has_downsample = False
+ if self.use_resnet_features or self.resnet_like_top or self.use_mean_pooling:
+ self.has_downsample = True
+ self.fc1 = nn.Linear(512 if self.has_downsample else output_dim, D,
+ weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),
+ bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))
+ )
+ self.frame_similarity_on_last_layer = frame_similarity_on_last_layer
+ self.cls_layer1 = nn.Linear(1152 if self.frame_similarity_on_last_layer else D, 1,
+ weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),
+ bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))
+ )
+ self.cls_layer2 = nn.Linear(1152 if self.frame_similarity_on_last_layer else D, 1,
+ weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),
+ bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))
+ ) if use_many_hot_targets else None
+
+ self.convex_comb_reg = ConvexCombinationRegularization(
+ in_filters=(F * 2 ** (L - 1) * 4)) if use_convex_comb_reg else None
+
+ def forward(self, inputs):
+ assert list(inputs.shape[2:]) == [27, 48, 3] and inputs.dtype == paddle.float32, \
+ "incorrect input type and/or shape"
+ out_dict = {}
+
+ # shape [B, T, H, W, 3] to shape [B, 3, T, H, W]
+ x = inputs.transpose([0, 4, 1, 2, 3])
+ if self.use_resnet_features:
+ x = self.resnet_layers(x)
+ else:
+ x = x / 255.
+ inputs = inputs.clip(min=0).astype('uint8')
+ if self.resnet_like_top:
+ x = self.resnet_like_top_conv(x)
+ x = self.resnet_like_top_bn(x)
+ x = self.resnet_like_top_max_pool(x)
+ block_features = []
+ for block in self.SDDCNN:
+ x = block(x)
+ block_features.append(x)
+ if self.convex_comb_reg is not None:
+ out_dict["alphas"], out_dict["comb_reg_loss"] = self.convex_comb_reg(inputs.transpose([0, 4, 1, 2, 3]), x)
+ if self.use_mean_pooling:
+ x = paddle.mean(x, axis=[3, 4])
+ x = x.transpose([0, 2, 1])
+ else:
+ x = x.transpose([0, 2, 3, 4, 1])
+ x = x.reshape([x.shape[0], x.shape[1], x.shape[2]*x.shape[3]*x.shape[4]])
+ if self.frame_sim_layer is not None:
+ x = paddle.concat([self.frame_sim_layer(block_features), x], 2)
+ if self.color_hist_layer is not None:
+ x = paddle.concat([self.color_hist_layer(inputs), x], 2)
+ x = self.fc1(x)
+ x = functional.relu(x)
+ if self.dropout is not None:
+ x = self.dropout(x)
+ if self.frame_sim_layer is not None and self.frame_similarity_on_last_layer:
+ x = paddle.concat([self.frame_sim_layer(block_features), x], 2)
+ one_hot = self.cls_layer1(x)
+ if self.cls_layer2 is not None:
+ out_dict["many_hot"] = self.cls_layer2(x)
+
+ if len(out_dict) > 0:
+ return one_hot, out_dict
+
+ return one_hot
+
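+# Rough data flow of TransNetV2.forward: frames of shape [B, T, 27, 48, 3] are
+# rearranged to [B, 3, T, H, W], optionally passed through the ResNet stem,
+# processed by the stacked DDCNN blocks (each followed by 2x2 spatial pooling),
+# flattened per frame and concatenated with the optional frame-similarity and
+# colour-histogram features, then projected by fc1. cls_layer1 produces the
+# per-frame single-transition logit, cls_layer2 (when enabled) the many-hot
+# logit, and auxiliary outputs such as the convex-combination regularisation
+# loss are returned in out_dict.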
diff --git a/paddlevideo/modeling/backbones/vit.py b/paddlevideo/modeling/backbones/vit.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4ecebd9191a4c2ace3f04c2b28e88a82c8ee96c
--- /dev/null
+++ b/paddlevideo/modeling/backbones/vit.py
@@ -0,0 +1,465 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections.abc import Callable
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.initializer import Constant
+
+from ...utils import load_ckpt
+from ..registry import BACKBONES
+from ..weight_init import trunc_normal_
+
+__all__ = ['VisionTransformer']
+
+zeros_ = Constant(value=0.)
+ones_ = Constant(value=1.)
+
+
+def to_2tuple(x):
+ return tuple([x] * 2)
+
+
+def drop_path(x, drop_prob=0., training=False):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
+ """
+ if drop_prob == 0. or not training:
+ return x
+ keep_prob = paddle.to_tensor(1 - drop_prob)
+ shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
+ random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype)
+ random_tensor = paddle.floor(random_tensor) # binarize
+ output = x.divide(keep_prob) * random_tensor
+
+ return output
+
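+# drop_path implements per-sample stochastic depth: a keep mask of shape
+# [N, 1, ..., 1] is drawn with probability keep_prob = 1 - drop_prob, and the
+# surviving samples are divided by keep_prob, so the expected value of the
+# output equals the input and no extra rescaling is needed at inference time.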
+
+class DropPath(nn.Layer):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ """
+ def __init__(self, drop_prob=None):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training)
+
+
+class Identity(nn.Layer):
+ def __init__(self):
+ super(Identity, self).__init__()
+
+ def forward(self, input):
+ return input
+
+
+class Mlp(nn.Layer):
+ def __init__(self,
+ in_features,
+ hidden_features=None,
+ out_features=None,
+ act_layer=nn.GELU,
+ drop=0.0):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class Attention(nn.Layer):
+ def __init__(self,
+ dim,
+ num_heads=8,
+ qkv_bias=False,
+ qk_scale=None,
+ attn_drop=0.0,
+ proj_drop=0.0):
+ super().__init__()
+
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = qk_scale or head_dim**-0.5
+
+ self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+ self.attn_drop = nn.Dropout(attn_drop)
+
+ def forward(self, x):
+ N, C = x.shape[1:]
+ qkv = self.qkv(x).reshape(
+ (-1, N, 3, self.num_heads, C // self.num_heads)).transpose(
+ (2, 0, 3, 1, 4))
+ q, k, v = qkv[0], qkv[1], qkv[2]
+
+ attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale
+ attn = nn.functional.softmax(attn, axis=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+
+class Block(nn.Layer):
+ def __init__(self,
+ dim,
+ num_heads,
+ mlp_ratio=4.0,
+ qkv_bias=False,
+ qk_scale=None,
+ drop=0.0,
+ attn_drop=0.0,
+ drop_path=0.1,
+ act_layer=nn.GELU,
+ norm_layer='nn.LayerNorm',
+ epsilon=1e-5,
+ attention_type='divided_space_time'):
+
+ super().__init__()
+ self.attention_type = attention_type
+ if isinstance(norm_layer, str):
+ self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
+ elif isinstance(norm_layer, Callable):
+ self.norm1 = norm_layer(dim, epsilon=epsilon)
+ else:
+ raise TypeError(
+ "The norm_layer must be str or paddle.nn.layer.Layer class")
+
+ self.attn = Attention(dim,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ attn_drop=attn_drop,
+ proj_drop=drop)
+
+ # Temporal Attention Parameters
+ if self.attention_type == 'divided_space_time':
+ if isinstance(norm_layer, str):
+ self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon)
+ elif isinstance(norm_layer, Callable):
+ self.temporal_norm1 = norm_layer(dim, epsilon=epsilon)
+ else:
+ raise TypeError(
+ "The norm_layer must be str or paddle.nn.layer.Layer class")
+ self.temporal_attn = Attention(dim,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ attn_drop=attn_drop,
+ proj_drop=drop)
+ self.temporal_fc = nn.Linear(dim, dim)
+
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+ if isinstance(norm_layer, str):
+ self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
+ elif isinstance(norm_layer, Callable):
+ self.norm2 = norm_layer(dim, epsilon=epsilon)
+ else:
+ raise TypeError(
+ "The norm_layer must be str or paddle.nn.layer.Layer class")
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim,
+ hidden_features=mlp_hidden_dim,
+ act_layer=act_layer,
+ drop=drop)
+
+ def forward(self, x, B, T, W):
+ num_spatial_tokens = (x.shape[1] - 1) // T
+ H = num_spatial_tokens // W
+ if self.attention_type in ['space_only', 'joint_space_time']:
+ x = x + self.drop_path(self.attn(self.norm1(x)))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+ elif self.attention_type == 'divided_space_time':
+ ########## Temporal ##########
+ xt = x[:, 1:, :]
+ _, _, _, _t, _m = B, H, W, T, xt.shape[-1]
+ xt = xt.reshape([-1, _t, _m])
+
+ res_temporal = self.drop_path(
+ self.temporal_attn(self.temporal_norm1(xt)))
+
+ _, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1]
+ res_temporal = res_temporal.reshape([-1, _h * _w * _t, _m])
+
+ res_temporal = self.temporal_fc(res_temporal)
+ xt = x[:, 1:, :] + res_temporal
+
+ ########## Spatial ##########
+ init_cls_token = x[:, 0, :].unsqueeze(1)
+ cls_token = init_cls_token.tile((1, T, 1))
+ _b, _t, _m = cls_token.shape
+ cls_token = cls_token.reshape([-1, _m]).unsqueeze(1)
+
+ xs = xt
+ _, _h, _w, _t, _m = B, H, W, T, xs.shape[-1]
+ xs = xs.reshape([-1, _h, _w, _t, _m]).transpose(
+ (0, 3, 1, 2, 4)).reshape([-1, _h * _w, _m])
+ xs = paddle.concat((cls_token, xs), axis=1)
+ res_spatial = self.drop_path(self.attn(self.norm1(xs)))
+
+ # Taking care of CLS token
+ cls_token = res_spatial[:, 0, :]
+ _, _t, _m = B, T, cls_token.shape[-1]
+ cls_token = cls_token.reshape([-1, _t, _m])
+ # averaging for every frame
+ cls_token = paddle.mean(cls_token, axis=1, keepdim=True)
+
+ res_spatial = res_spatial[:, 1:, :]
+ _, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1]
+ res_spatial = res_spatial.reshape([-1, _t, _h, _w, _m]).transpose(
+ (0, 2, 3, 1, 4)).reshape([-1, _h * _w * _t, _m])
+
+ res = res_spatial
+ x = xt
+ x = paddle.concat((init_cls_token, x), axis=1) + paddle.concat(
+ (cls_token, res), axis=1)
+
+ # Mlp
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+ else:
+ raise NotImplementedError
+
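+# In the 'divided_space_time' branch above (TimeSformer-style attention) the
+# patch tokens are first reshaped to [B * H * W, T, M] so that temporal
+# attention runs independently at every spatial location, then to
+# [B * T, H * W, M] (with the class token replicated once per frame) so that
+# spatial attention runs within each frame; the per-frame class tokens are
+# averaged back into a single token before the residual sum and the MLP.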
+
+class PatchEmbed(nn.Layer):
+ """ Image to Patch Embedding
+ """
+ def __init__(self,
+ img_size=224,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768):
+ super().__init__()
+ img_size = to_2tuple(img_size)
+ patch_size = to_2tuple(patch_size)
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] //
+ patch_size[0])
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.num_patches = num_patches
+
+ self.proj = nn.Conv2D(in_channels,
+ embed_dim,
+ kernel_size=patch_size,
+ stride=patch_size)
+
+ def forward(self, x):
+ B, C, T, H, W = x.shape
+
+ assert H == self.img_size[0] and W == self.img_size[1], \
+ f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+ x = x.transpose((0, 2, 1, 3, 4))
+ x = x.reshape([-1, C, H, W])
+ x = self.proj(x)
+ W = x.shape[-1]
+ x = x.flatten(2).transpose((0, 2, 1))
+ return x, T, W
+
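+# Shape walk-through for PatchEmbed with the default img_size=224 and
+# patch_size=16: each frame is split into 14 x 14 = 196 patches, so an input of
+# shape [B, C, T, 224, 224] is flattened to [B * T, C, 224, 224], projected to
+# [B * T, embed_dim, 14, 14] and returned as ([B * T, 196, embed_dim], T, 14).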
+
+@BACKBONES.register()
+class VisionTransformer(nn.Layer):
+ """ Vision Transformer with support for patch input
+ """
+ def __init__(self,
+ pretrained=None,
+ img_size=224,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4,
+ qkv_bias=False,
+ qk_scale=None,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.1,
+ norm_layer='nn.LayerNorm',
+ epsilon=1e-5,
+ num_seg=8,
+ attention_type='divided_space_time',
+ **args):
+ super().__init__()
+ self.pretrained = pretrained
+ self.num_seg = num_seg
+ self.attention_type = attention_type
+ self.num_features = self.embed_dim = embed_dim
+
+ self.patch_embed = PatchEmbed(img_size=img_size,
+ patch_size=patch_size,
+ in_channels=in_channels,
+ embed_dim=embed_dim)
+ num_patches = self.patch_embed.num_patches
+
+ # Positional Embeddings
+ self.cls_token = self.create_parameter(shape=(1, 1, embed_dim),
+ default_initializer=zeros_)
+ self.pos_embed = self.create_parameter(shape=(1, num_patches + 1,
+ embed_dim),
+ default_initializer=zeros_)
+ self.pos_drop = nn.Dropout(p=drop_rate)
+
+ if self.attention_type != 'space_only':
+ self.time_embed = self.create_parameter(shape=(1, num_seg,
+ embed_dim),
+ default_initializer=zeros_)
+ self.time_drop = nn.Dropout(p=drop_rate)
+
+ self.add_parameter("pos_embed", self.pos_embed)
+ self.add_parameter("cls_token", self.cls_token)
+
+ dpr = np.linspace(0, drop_path_rate, depth)
+
+ self.blocks = nn.LayerList([
+ Block(dim=embed_dim,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ drop=drop_rate,
+ attn_drop=attn_drop_rate,
+ drop_path=dpr[i],
+ norm_layer=norm_layer,
+ epsilon=epsilon,
+ attention_type=self.attention_type) for i in range(depth)
+ ])
+
+ self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)
+
+ def init_weights(self):
+ """First init model's weight"""
+ trunc_normal_(self.pos_embed, std=0.02)
+ trunc_normal_(self.cls_token, std=0.02)
+ self.apply(self._init_fn)
+
+ if self.attention_type == 'divided_space_time':
+ i = 0
+ for m in self.blocks.sublayers(include_self=True):
+ m_str = str(m)
+ if 'Block' in m_str:
+ if i > 0:
+ zeros_(m.temporal_fc.weight)
+ zeros_(m.temporal_fc.bias)
+ i += 1
+ """Second, if provide pretrained ckpt, load it"""
+ if isinstance(
+ self.pretrained, str
+ ) and self.pretrained.strip() != "": # load pretrained weights
+ load_ckpt(self,
+ self.pretrained,
+ num_patches=self.patch_embed.num_patches,
+ num_seg=self.num_seg,
+ attention_type=self.attention_type)
+
+ def _init_fn(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight)
+ if m.bias is not None:
+ zeros_(m.bias)
+ elif isinstance(m, nn.LayerNorm):
+ ones_(m.weight)
+ zeros_(m.bias)
+
+ def forward_features(self, x):
+ # B = x.shape[0]
+ B = paddle.shape(x)[0]
+ x, T, W = self.patch_embed(x) # [BT,nH*nW,F]
+ cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F]
+ x = paddle.concat((cls_tokens, x), axis=1)
+ pos_interp = (x.shape[1] != self.pos_embed.shape[1])
+ if pos_interp:
+ pos_embed = self.pos_embed
+ cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)
+ other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose(
+ (0, 2, 1))
+ P = int(other_pos_embed.shape[2]**0.5)
+ H = x.shape[1] // W
+ other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P])
+ new_pos_embed = F.interpolate(other_pos_embed,
+ size=(H, W),
+ mode='nearest')
+ new_pos_embed = new_pos_embed.flatten(2)
+ new_pos_embed = new_pos_embed.transpose((0, 2, 1))
+ new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed),
+ axis=1)
+ x = x + new_pos_embed
+ else:
+ x = x + self.pos_embed
+
+ x = self.pos_drop(x)
+
+ # Time Embeddings
+ if self.attention_type != 'space_only':
+ cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split(
+ T)[0].index_select(paddle.to_tensor([0]), axis=1)
+ x = x[:, 1:]
+ _, _n, _m = x.shape
+ _t = T
+ x = x.reshape([-1, _t, _n, _m]).transpose(
+ (0, 2, 1, 3)).reshape([-1, _t, _m])
+ # Resizing time embeddings in case they don't match
+ time_interp = (T != self.time_embed.shape[1])
+ if time_interp: # T' != T
+ time_embed = self.time_embed.transpose((0, 2, 1)).unsqueeze(0)
+ new_time_embed = F.interpolate(time_embed,
+ size=(T, x.shape[-1]),
+ mode='nearest').squeeze(0)
+ new_time_embed = new_time_embed.transpose((0, 2, 1))
+ x = x + new_time_embed
+ else:
+ x = x + self.time_embed
+
+ x = self.time_drop(x)
+ _, _t, _m = x.shape
+ x = x.reshape([-1, W * W * T, _m])
+ x = paddle.concat((cls_tokens, x), axis=1)
+
+ # Attention blocks
+ for blk in self.blocks:
+ x = blk(x, B, T, W)
+
+ # Predictions for space-only baseline
+ if self.attention_type == 'space_only':
+ _, _n, _m = x.shape
+ _t = T
+ x = x.reshape([-1, _t, _n, _m])
+ x = paddle.mean(x, 1) # averaging predictions for every frame
+
+ x = self.norm(x)
+ return x[:, 0] # [B, embed_dim]
+
+ def forward(self, x):
+ x = self.forward_features(x)
+ return x
diff --git a/paddlevideo/modeling/backbones/vit_tweaks.py b/paddlevideo/modeling/backbones/vit_tweaks.py
new file mode 100644
index 0000000000000000000000000000000000000000..a20af30f17283546f02c4e084730080f0c2f8c18
--- /dev/null
+++ b/paddlevideo/modeling/backbones/vit_tweaks.py
@@ -0,0 +1,515 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections.abc import Callable
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.nn.initializer import Constant
+from paddle.regularizer import L2Decay
+
+from ...utils import load_ckpt
+from ..registry import BACKBONES
+from ..weight_init import trunc_normal_
+
+__all__ = ['VisionTransformer_tweaks']
+
+zeros_ = Constant(value=0.)
+ones_ = Constant(value=1.)
+
+
+def to_2tuple(x):
+ return tuple([x] * 2)
+
+
+def rand_bbox(size, lam):
+ """ rand_bbox """
+ w = size[2]
+ h = size[3]
+ cut_rat = np.sqrt(1. - lam)
+ cut_w = int(w * cut_rat)
+ cut_h = int(h * cut_rat)
+
+ # uniform
+ cx = np.random.randint(w)
+ cy = np.random.randint(h)
+
+ bbx1 = np.clip(cx - cut_w // 2, 0, w)
+ bby1 = np.clip(cy - cut_h // 2, 0, h)
+ bbx2 = np.clip(cx + cut_w // 2, 0, w)
+ bby2 = np.clip(cy + cut_h // 2, 0, h)
+
+ return bbx1, bby1, bbx2, bby2
+
+
+def drop_path(x, drop_prob=0., training=False):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
+ """
+ if drop_prob == 0. or not training:
+ return x
+ keep_prob = paddle.to_tensor(1 - drop_prob)
+ shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
+ random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
+ random_tensor = paddle.floor(random_tensor) # binarize
+ output = x.divide(keep_prob) * random_tensor
+
+ return output
+
+
+class DropPath(nn.Layer):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ """
+ def __init__(self, drop_prob=None):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training)
+
+
+class Identity(nn.Layer):
+ def __init__(self):
+ super(Identity, self).__init__()
+
+ def forward(self, input):
+ return input
+
+
+class Mlp(nn.Layer):
+ def __init__(self,
+ in_features,
+ hidden_features=None,
+ out_features=None,
+ act_layer=nn.GELU,
+ drop=0.,
+ wd_bias=True,
+ lr_mult=1.0):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class Attention(nn.Layer):
+ def __init__(self,
+ dim,
+ num_heads=8,
+ qkv_bias=False,
+ qk_scale=None,
+ attn_drop=0.,
+ proj_drop=0.,
+ wd_bias=True,
+ lr_mult=1.0):
+ super().__init__()
+
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = qk_scale or head_dim**-0.5
+ self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+ self.attn_drop = nn.Dropout(attn_drop)
+
+ def forward(self, x):
+ N, C = x.shape[1:]
+ qkv = self.qkv(x).reshape(
+ (-1, N, 3, self.num_heads, C // self.num_heads)).transpose(
+ (2, 0, 3, 1, 4))
+ q, k, v = qkv[0], qkv[1], qkv[2]
+
+ attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale
+ attn = nn.functional.softmax(attn, axis=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+
+class Block(nn.Layer):
+ def __init__(self,
+ dim,
+ num_heads,
+ mlp_ratio=4.0,
+ qkv_bias=False,
+ qk_scale=None,
+ drop=0.0,
+ attn_drop=0.0,
+ drop_path=0.1,
+ act_layer=nn.GELU,
+ norm_layer='nn.LayerNorm',
+ epsilon=1e-5,
+ attention_type='divided_space_time',
+ wd_bias=True,
+ lr_mult=1.0):
+
+ super().__init__()
+ self.attention_type = attention_type
+ if isinstance(norm_layer, str):
+ self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
+ elif isinstance(norm_layer, Callable):
+ self.norm1 = norm_layer(dim, epsilon=epsilon)
+ else:
+ raise TypeError(
+ "The norm_layer must be str or paddle.nn.layer.Layer class")
+
+ self.attn = Attention(dim,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ attn_drop=attn_drop,
+ proj_drop=drop,
+ wd_bias=wd_bias,
+ lr_mult=lr_mult)
+
+ # Temporal Attention Parameters
+ if self.attention_type == 'divided_space_time':
+ if isinstance(norm_layer, str):
+ self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon)
+ elif isinstance(norm_layer, Callable):
+ self.temporal_norm1 = norm_layer(dim, epsilon=epsilon)
+ else:
+ raise TypeError(
+ "The norm_layer must be str or paddle.nn.layer.Layer class")
+ self.temporal_attn = Attention(dim,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ attn_drop=attn_drop,
+ proj_drop=drop,
+ wd_bias=wd_bias,
+ lr_mult=lr_mult)
+ self.temporal_fc = nn.Linear(dim, dim)
+
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+ if isinstance(norm_layer, str):
+ self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
+ elif isinstance(norm_layer, Callable):
+ self.norm2 = norm_layer(dim, epsilon=epsilon)
+ else:
+ raise TypeError(
+ "The norm_layer must be str or paddle.nn.layer.Layer class")
+
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim,
+ hidden_features=mlp_hidden_dim,
+ act_layer=act_layer,
+ drop=drop,
+ wd_bias=wd_bias,
+ lr_mult=lr_mult)
+
+ def forward(self, x, B, T, W):
+ num_spatial_tokens = (x.shape[1] - 1) // T
+ H = num_spatial_tokens // W
+ if self.attention_type in ['space_only', 'joint_space_time']:
+ x = paddle.add(x, self.drop_path(self.attn(self.norm1(x))))
+ x = paddle.add(x, self.drop_path(self.mlp(self.norm2(x))))
+ return x
+ elif self.attention_type == 'divided_space_time':
+ ########## Temporal ##########
+ xt = x[:, 1:, :]
+ _, _, _, _t, _m = B, H, W, T, xt.shape[-1]
+ xt = xt.reshape([-1, _t, _m])
+
+ res_temporal = self.drop_path(
+ self.temporal_attn(self.temporal_norm1(xt)))
+
+ _, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1]
+ res_temporal = res_temporal.reshape([-1, _h * _w * _t, _m])
+
+ res_temporal = self.temporal_fc(res_temporal)
+ xt = paddle.add(x[:, 1:, :], res_temporal)
+
+ ########## Spatial ##########
+ init_cls_token = x[:, 0, :].unsqueeze(1)
+ cls_token = init_cls_token.tile((1, T, 1))
+ _b, _t, _m = cls_token.shape
+ cls_token = cls_token.reshape([-1, _m]).unsqueeze(1)
+
+ xs = xt
+ _, _h, _w, _t, _m = B, H, W, T, xs.shape[-1]
+ xs = xs.reshape([-1, _h, _w, _t, _m]).transpose(
+ (0, 3, 1, 2, 4)).reshape([-1, _h * _w, _m])
+ xs = paddle.concat((cls_token, xs), axis=1)
+ res_spatial = self.drop_path(self.attn(self.norm1(xs)))
+
+ # Taking care of CLS token
+ cls_token = res_spatial[:, 0, :]
+ _, _t, _m = B, T, cls_token.shape[-1]
+ cls_token = cls_token.reshape([-1, _t, _m])
+ # averaging for every frame
+ cls_token = paddle.mean(cls_token, axis=1, keepdim=True)
+
+ res_spatial = res_spatial[:, 1:, :]
+ _, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1]
+ res_spatial = res_spatial.reshape([-1, _t, _h, _w, _m]).transpose(
+ (0, 2, 3, 1, 4)).reshape([-1, _h * _w * _t, _m])
+
+ res = res_spatial
+ x = xt
+ x = paddle.add(paddle.concat((init_cls_token, x), axis=1),
+ paddle.concat((cls_token, res), axis=1))
+ # Mlp
+ x = paddle.add(x, self.drop_path(self.mlp(self.norm2(x))))
+ return x
+ else:
+ raise NotImplementedError
+
+
+class PatchEmbed(nn.Layer):
+ """ Image to Patch Embedding
+ """
+ def __init__(self,
+ img_size=224,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ wd_bias=True,
+ lr_mult=1.0):
+ super().__init__()
+ img_size = to_2tuple(img_size)
+ patch_size = to_2tuple(patch_size)
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] //
+ patch_size[0])
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.num_patches = num_patches
+
+ self.proj = nn.Conv2D(in_channels,
+ embed_dim,
+ kernel_size=patch_size,
+ stride=patch_size)
+
+ def forward(self, x):
+ B, C, T, H, W = x.shape
+
+ assert H == self.img_size[0] and W == self.img_size[1], \
+ f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+ x = x.transpose((0, 2, 1, 3, 4)) # [B,T,C,H,W]
+ x = x.reshape([-1, C, H, W]) # [BT,C,H,W]
+ x = self.proj(x) # [BT,F,nH,nW]
+ W = x.shape[-1]
+ x = x.flatten(2).transpose((0, 2, 1)) # [BT,F,nHnW]
+ return x, T, W
+
+
+@BACKBONES.register()
+class VisionTransformer_tweaks(nn.Layer):
+ """ Vision Transformer with support for patch input
+ """
+ def __init__(self,
+ pretrained=None,
+ img_size=224,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4,
+ qkv_bias=False,
+ qk_scale=None,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.1,
+ norm_layer='nn.LayerNorm',
+ epsilon=1e-5,
+ num_seg=8,
+ attention_type='divided_space_time',
+ wd_bias=True,
+ lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
+ **args):
+ super().__init__()
+ self.pretrained = pretrained
+ self.num_seg = num_seg
+ self.attention_type = attention_type
+ self.lr_mult_list = lr_mult_list
+ self.num_features = self.embed_dim = embed_dim
+
+ self.patch_embed = PatchEmbed(img_size=img_size,
+ patch_size=patch_size,
+ in_channels=in_channels,
+ embed_dim=embed_dim,
+ wd_bias=wd_bias,
+ lr_mult=self.lr_mult_list[0])
+ num_patches = self.patch_embed.num_patches
+
+ # Positional Embeddings
+ self.cls_token = self.create_parameter(
+ shape=(1, 1, embed_dim),
+ default_initializer=zeros_,
+ attr=ParamAttr(regularizer=L2Decay(0.0)))
+ self.pos_embed = self.create_parameter(
+ shape=(1, num_patches + 1, embed_dim),
+ default_initializer=zeros_,
+ attr=ParamAttr(regularizer=L2Decay(0.0)))
+ self.pos_drop = nn.Dropout(p=drop_rate)
+
+ if self.attention_type != 'space_only':
+ self.time_embed = self.create_parameter(
+ shape=(1, num_seg, embed_dim),
+ default_initializer=zeros_,
+ attr=ParamAttr(regularizer=L2Decay(0.0)))
+ self.time_drop = nn.Dropout(p=drop_rate)
+
+ self.add_parameter("pos_embed", self.pos_embed)
+ self.add_parameter("cls_token", self.cls_token)
+
+ dpr = np.linspace(0, drop_path_rate, depth)
+
+ self.blocks = nn.LayerList([
+ Block(dim=embed_dim,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ drop=drop_rate,
+ attn_drop=attn_drop_rate,
+ drop_path=dpr[i],
+ norm_layer=norm_layer,
+ epsilon=epsilon,
+ attention_type=self.attention_type,
+ wd_bias=wd_bias,
+ lr_mult=self.lr_mult_list[(i // 4) + 1]) for i in range(depth)
+ ])
+
+ self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)
+
+ def init_weights(self):
+ """First init model's weight"""
+ trunc_normal_(self.pos_embed, std=0.02)
+ trunc_normal_(self.cls_token, std=0.02)
+ self.apply(self._init_fn)
+
+ if self.attention_type == 'divided_space_time':
+ i = 0
+ for m in self.blocks.sublayers(include_self=True):
+ m_str = str(m)
+ if 'Block' in m_str:
+ if i > 0:
+ zeros_(m.temporal_fc.weight)
+ zeros_(m.temporal_fc.bias)
+ i += 1
+ """Second, if provide pretrained ckpt, load it"""
+ if isinstance(
+ self.pretrained, str
+ ) and self.pretrained.strip() != "": # load pretrained weights
+ load_ckpt(self,
+ self.pretrained,
+ num_patches=self.patch_embed.num_patches,
+ num_seg=self.num_seg,
+ attention_type=self.attention_type)
+ elif self.pretrained is None or self.pretrained.strip() == "":
+ pass
+ else:
+ raise NotImplementedError
+
+ def _init_fn(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight)
+ if m.bias is not None:
+ zeros_(m.bias)
+ elif isinstance(m, nn.LayerNorm):
+ ones_(m.weight)
+ zeros_(m.bias)
+
+ def forward_features(self, x):
+ # B = x.shape[0]
+ B = paddle.shape(x)[0]
+ x, T, W = self.patch_embed(x) # [BT,nH*nW,F]
+ cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F]
+ x = paddle.concat((cls_tokens, x), axis=1)
+ pos_interp = (x.shape[1] != self.pos_embed.shape[1])
+ if pos_interp:
+ pos_embed = self.pos_embed
+ cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)
+ other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose(
+ (0, 2, 1))
+ P = int(other_pos_embed.shape[2]**0.5)
+ H = x.shape[1] // W
+ other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P])
+ new_pos_embed = F.interpolate(other_pos_embed,
+ size=(H, W),
+ mode='nearest')
+ new_pos_embed = new_pos_embed.flatten(2)
+ new_pos_embed = new_pos_embed.transpose((0, 2, 1))
+ new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed),
+ axis=1)
+ x = paddle.add(x, new_pos_embed)
+ else:
+ x = paddle.add(x, self.pos_embed)
+
+ x = self.pos_drop(x)
+
+ # Time Embeddings
+ if self.attention_type != 'space_only':
+ cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split(
+ T)[0].index_select(paddle.to_tensor([0]), axis=1)
+ x = x[:, 1:]
+ _, _n, _m = x.shape
+ _t = T
+ x = x.reshape([-1, _t, _n, _m]).transpose(
+ (0, 2, 1, 3)).reshape([-1, _t, _m])
+ # Resizing time embeddings in case they don't match
+ time_interp = (T != self.time_embed.shape[1])
+ if time_interp: # T' != T
+ time_embed = self.time_embed.transpose((0, 2, 1)).unsqueeze(0)
+ new_time_embed = F.interpolate(time_embed,
+ size=(T, x.shape[-1]),
+ mode='nearest').squeeze(0)
+ new_time_embed = new_time_embed.transpose((0, 2, 1))
+ x = paddle.add(x, new_time_embed)
+ else:
+ x = paddle.add(x, self.time_embed)
+
+ x = self.time_drop(x)
+ _, _t, _m = x.shape
+ x = x.reshape([-1, W * W * T, _m])
+ x = paddle.concat((cls_tokens, x), axis=1)
+
+ # Attention blocks
+ for blk in self.blocks:
+ x = blk(x, B, T, W)
+
+ # Predictions for space-only baseline
+ if self.attention_type == 'space_only':
+ _, _n, _m = x.shape
+ _t = T
+ x = x.reshape([-1, _t, _n, _m])
+ x = paddle.mean(x, 1) # averaging predictions for every frame
+
+ x = self.norm(x)
+ return x[:, 0] # [B, embed_dim]
+
+ def forward(self, x):
+ x = self.forward_features(x)
+ return x
diff --git a/paddlevideo/modeling/bbox_utils.py b/paddlevideo/modeling/bbox_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..23b4555b4f84c2cf0d4b7089a5e953acc4270c23
--- /dev/null
+++ b/paddlevideo/modeling/bbox_utils.py
@@ -0,0 +1,528 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import paddle
+import paddle.nn.functional as F
+import numpy as np
+
+
+def bbox2delta(src_boxes, tgt_boxes, weights):
+ src_w = src_boxes[:, 2] - src_boxes[:, 0]
+ src_h = src_boxes[:, 3] - src_boxes[:, 1]
+ src_ctr_x = src_boxes[:, 0] + 0.5 * src_w
+ src_ctr_y = src_boxes[:, 1] + 0.5 * src_h
+
+ tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0]
+ tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]
+ tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w
+ tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h
+
+ wx, wy, ww, wh = weights
+ dx = wx * (tgt_ctr_x - src_ctr_x) / src_w
+ dy = wy * (tgt_ctr_y - src_ctr_y) / src_h
+ dw = ww * paddle.log(tgt_w / src_w)
+ dh = wh * paddle.log(tgt_h / src_h)
+
+ deltas = paddle.stack((dx, dy, dw, dh), axis=1)
+ return deltas
+
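+# bbox2delta uses the standard R-CNN box encoding, with (x, y, w, h) denoting
+# box centres and sizes:
+#   dx = wx * (x_t - x_s) / w_s,  dy = wy * (y_t - y_s) / h_s,
+#   dw = ww * log(w_t / w_s),     dh = wh * log(h_t / h_s).
+# delta2bbox below inverts this mapping, clipping dw and dh at log(1000 / 16)
+# so that paddle.exp() cannot overflow.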
+
+def delta2bbox(deltas, boxes, weights):
+ clip_scale = math.log(1000.0 / 16)
+
+ widths = boxes[:, 2] - boxes[:, 0]
+ heights = boxes[:, 3] - boxes[:, 1]
+ ctr_x = boxes[:, 0] + 0.5 * widths
+ ctr_y = boxes[:, 1] + 0.5 * heights
+
+ wx, wy, ww, wh = weights
+ dx = deltas[:, 0::4] / wx
+ dy = deltas[:, 1::4] / wy
+ dw = deltas[:, 2::4] / ww
+ dh = deltas[:, 3::4] / wh
+ # Prevent sending too large values into paddle.exp()
+ dw = paddle.clip(dw, max=clip_scale)
+ dh = paddle.clip(dh, max=clip_scale)
+
+ pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1)
+ pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1)
+ pred_w = paddle.exp(dw) * widths.unsqueeze(1)
+ pred_h = paddle.exp(dh) * heights.unsqueeze(1)
+
+ pred_boxes = []
+ pred_boxes.append(pred_ctr_x - 0.5 * pred_w)
+ pred_boxes.append(pred_ctr_y - 0.5 * pred_h)
+ pred_boxes.append(pred_ctr_x + 0.5 * pred_w)
+ pred_boxes.append(pred_ctr_y + 0.5 * pred_h)
+ pred_boxes = paddle.stack(pred_boxes, axis=-1)
+
+ return pred_boxes
+
+
+def expand_bbox(bboxes, scale):
+ w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5
+ h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5
+ x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5
+ y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5
+
+ w_half *= scale
+ h_half *= scale
+
+ bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32)
+ bboxes_exp[:, 0] = x_c - w_half
+ bboxes_exp[:, 2] = x_c + w_half
+ bboxes_exp[:, 1] = y_c - h_half
+ bboxes_exp[:, 3] = y_c + h_half
+
+ return bboxes_exp
+
+
+def clip_bbox(boxes, im_shape):
+ h, w = im_shape[0], im_shape[1]
+ x1 = boxes[:, 0].clip(0, w)
+ y1 = boxes[:, 1].clip(0, h)
+ x2 = boxes[:, 2].clip(0, w)
+ y2 = boxes[:, 3].clip(0, h)
+ return paddle.stack([x1, y1, x2, y2], axis=1)
+
+
+def nonempty_bbox(boxes, min_size=0, return_mask=False):
+ w = boxes[:, 2] - boxes[:, 0]
+ h = boxes[:, 3] - boxes[:, 1]
+ mask = paddle.logical_and(w > min_size, h > min_size)
+ if return_mask:
+ return mask
+ keep = paddle.nonzero(mask).flatten()
+ return keep
+
+
+def bbox_area(boxes):
+ return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+
+def bbox_overlaps(boxes1, boxes2):
+ """
+ Calculate overlaps between boxes1 and boxes2
+
+ Args:
+ boxes1 (Tensor): boxes with shape [M, 4]
+ boxes2 (Tensor): boxes with shape [N, 4]
+
+ Return:
+ overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N]
+ """
+ area1 = bbox_area(boxes1)
+ area2 = bbox_area(boxes2)
+
+ xy_max = paddle.minimum(
+ paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:])
+ xy_min = paddle.maximum(
+ paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2])
+ width_height = xy_max - xy_min
+ width_height = width_height.clip(min=0)
+ inter = width_height.prod(axis=2)
+
+ overlaps = paddle.where(inter > 0, inter /
+ (paddle.unsqueeze(area1, 1) + area2 - inter),
+ paddle.zeros_like(inter))
+ return overlaps
+
+
+def xywh2xyxy(box):
+ x, y, w, h = box
+ x1 = x - w * 0.5
+ y1 = y - h * 0.5
+ x2 = x + w * 0.5
+ y2 = y + h * 0.5
+ return [x1, y1, x2, y2]
+
+
+def make_grid(h, w, dtype):
+ yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)])
+ return paddle.stack((xv, yv), 2).cast(dtype=dtype)
+
+
+def decode_yolo(box, anchor, downsample_ratio):
+ """decode yolo box
+
+ Args:
+ box (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
+ anchor (list): anchor with the shape [na, 2]
+ downsample_ratio (int): downsample ratio of the feature map
+
+ Return:
+ box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1]
+ """
+ x, y, w, h = box
+ na, grid_h, grid_w = x.shape[1:4]
+ grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2))
+ x1 = (x + grid[:, :, :, :, 0:1]) / grid_w
+ y1 = (y + grid[:, :, :, :, 1:2]) / grid_h
+
+ anchor = paddle.to_tensor(anchor)
+ anchor = paddle.cast(anchor, x.dtype)
+ anchor = anchor.reshape((1, na, 1, 1, 2))
+ w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w)
+ h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h)
+
+ return [x1, y1, w1, h1]
+
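+# decode_yolo applies the usual YOLO decoding: the predicted x/y offsets are
+# added to their grid-cell coordinates and divided by the grid size, while w/h
+# are exp(pred) scaled by the matching anchor and divided by the network input
+# size (grid size * downsample_ratio), so the decoded box is expressed relative
+# to the input image.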
+
+def iou_similarity(box1, box2, eps=1e-9):
+ """Calculate iou of box1 and box2
+
+ Args:
+ box1 (Tensor): box with the shape [N, M1, 4]
+ box2 (Tensor): box with the shape [N, M2, 4]
+
+ Return:
+ iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2]
+ """
+ box1 = box1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4]
+ box2 = box2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4]
+ px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4]
+ gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4]
+ x1y1 = paddle.maximum(px1y1, gx1y1)
+ x2y2 = paddle.minimum(px2y2, gx2y2)
+ overlap = (x2y2 - x1y1).clip(0).prod(-1)
+ area1 = (px2y2 - px1y1).clip(0).prod(-1)
+ area2 = (gx2y2 - gx1y1).clip(0).prod(-1)
+ union = area1 + area2 - overlap + eps
+ return overlap / union
+
+
+def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9):
+ """calculate the iou of box1 and box2
+
+ Args:
+ box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
+ box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
+ giou (bool): whether use giou or not, default False
+ diou (bool): whether use diou or not, default False
+ ciou (bool): whether use ciou or not, default False
+ eps (float): epsilon to avoid divide by zero
+
+ Return:
+ iou (Tensor): iou of box1 and box2, with the shape [b, na, h, w, 1]
+ """
+ px1, py1, px2, py2 = box1
+ gx1, gy1, gx2, gy2 = box2
+ x1 = paddle.maximum(px1, gx1)
+ y1 = paddle.maximum(py1, gy1)
+ x2 = paddle.minimum(px2, gx2)
+ y2 = paddle.minimum(py2, gy2)
+
+ overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0))
+
+ area1 = (px2 - px1) * (py2 - py1)
+ area1 = area1.clip(0)
+
+ area2 = (gx2 - gx1) * (gy2 - gy1)
+ area2 = area2.clip(0)
+
+ union = area1 + area2 - overlap + eps
+ iou = overlap / union
+
+ if giou or ciou or diou:
+ # convex w, h
+ cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1)
+ ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1)
+ if giou:
+ c_area = cw * ch + eps
+ return iou - (c_area - union) / c_area
+ else:
+ # convex diagonal squared
+ c2 = cw**2 + ch**2 + eps
+ # center distance
+ rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4
+ if diou:
+ return iou - rho2 / c2
+ else:
+ w1, h1 = px2 - px1, py2 - py1 + eps
+ w2, h2 = gx2 - gx1, gy2 - gy1 + eps
+ delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2)
+ v = (4 / math.pi**2) * paddle.pow(delta, 2)
+ alpha = v / (1 + eps - iou + v)
+ alpha.stop_gradient = True
+ return iou - (rho2 / c2 + v * alpha)
+ else:
+ return iou
+
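+# IoU variants computed above, with U the union area, C the area of the
+# smallest enclosing box, c^2 its squared diagonal and rho^2 the squared
+# distance between box centres:
+#   GIoU = IoU - (C - U) / C
+#   DIoU = IoU - rho^2 / c^2
+#   CIoU = IoU - (rho^2 / c^2 + alpha * v),
+# where v = (4 / pi^2) * (atan(w1 / h1) - atan(w2 / h2))^2 and
+# alpha = v / (1 - IoU + v) weights the aspect-ratio term.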
+
+def rect2rbox(bboxes):
+ """
+ :param bboxes: shape (n, 4) (xmin, ymin, xmax, ymax)
+ :return: dbboxes: shape (n, 5) (x_ctr, y_ctr, w, h, angle)
+ """
+ bboxes = bboxes.reshape(-1, 4)
+ num_boxes = bboxes.shape[0]
+
+ x_ctr = (bboxes[:, 2] + bboxes[:, 0]) / 2.0
+ y_ctr = (bboxes[:, 3] + bboxes[:, 1]) / 2.0
+ edges1 = np.abs(bboxes[:, 2] - bboxes[:, 0])
+ edges2 = np.abs(bboxes[:, 3] - bboxes[:, 1])
+ angles = np.zeros([num_boxes], dtype=bboxes.dtype)
+
+ inds = edges1 < edges2
+
+ rboxes = np.stack((x_ctr, y_ctr, edges1, edges2, angles), axis=1)
+ rboxes[inds, 2] = edges2[inds]
+ rboxes[inds, 3] = edges1[inds]
+ rboxes[inds, 4] = np.pi / 2.0
+ return rboxes
+
+
+def delta2rbox(Rrois,
+ deltas,
+ means=[0, 0, 0, 0, 0],
+ stds=[1, 1, 1, 1, 1],
+ wh_ratio_clip=1e-6):
+ """
+ :param Rrois: (cx, cy, w, h, theta)
+ :param deltas: (dx, dy, dw, dh, dtheta)
+ :param means:
+ :param stds:
+ :param wh_ratio_clip:
+ :return:
+ """
+ means = paddle.to_tensor(means)
+ stds = paddle.to_tensor(stds)
+ deltas = paddle.reshape(deltas, [-1, deltas.shape[-1]])
+ denorm_deltas = deltas * stds + means
+
+ dx = denorm_deltas[:, 0]
+ dy = denorm_deltas[:, 1]
+ dw = denorm_deltas[:, 2]
+ dh = denorm_deltas[:, 3]
+ dangle = denorm_deltas[:, 4]
+
+ max_ratio = np.abs(np.log(wh_ratio_clip))
+ dw = paddle.clip(dw, min=-max_ratio, max=max_ratio)
+ dh = paddle.clip(dh, min=-max_ratio, max=max_ratio)
+
+ Rroi_x = Rrois[:, 0]
+ Rroi_y = Rrois[:, 1]
+ Rroi_w = Rrois[:, 2]
+ Rroi_h = Rrois[:, 3]
+ Rroi_angle = Rrois[:, 4]
+
+ gx = dx * Rroi_w * paddle.cos(Rroi_angle) - dy * Rroi_h * paddle.sin(
+ Rroi_angle) + Rroi_x
+ gy = dx * Rroi_w * paddle.sin(Rroi_angle) + dy * Rroi_h * paddle.cos(
+ Rroi_angle) + Rroi_y
+ gw = Rroi_w * dw.exp()
+ gh = Rroi_h * dh.exp()
+ ga = np.pi * dangle + Rroi_angle
+ ga = (ga + np.pi / 4) % np.pi - np.pi / 4
+ ga = paddle.to_tensor(ga)
+
+ gw = paddle.to_tensor(gw, dtype='float32')
+ gh = paddle.to_tensor(gh, dtype='float32')
+ bboxes = paddle.stack([gx, gy, gw, gh, ga], axis=-1)
+ return bboxes
+
+
+def rbox2delta(proposals, gt, means=[0, 0, 0, 0, 0], stds=[1, 1, 1, 1, 1]):
+ """
+
+ Args:
+ proposals:
+ gt:
+ means: 1x5
+ stds: 1x5
+
+ Returns:
+
+ """
+ proposals = proposals.astype(np.float64)
+
+ PI = np.pi
+
+ gt_widths = gt[..., 2]
+ gt_heights = gt[..., 3]
+ gt_angle = gt[..., 4]
+
+ proposals_widths = proposals[..., 2]
+ proposals_heights = proposals[..., 3]
+ proposals_angle = proposals[..., 4]
+
+ coord = gt[..., 0:2] - proposals[..., 0:2]
+ dx = (np.cos(proposals[..., 4]) * coord[..., 0] + np.sin(proposals[..., 4])
+ * coord[..., 1]) / proposals_widths
+ dy = (-np.sin(proposals[..., 4]) * coord[..., 0] + np.cos(proposals[..., 4])
+ * coord[..., 1]) / proposals_heights
+ dw = np.log(gt_widths / proposals_widths)
+ dh = np.log(gt_heights / proposals_heights)
+ da = (gt_angle - proposals_angle)
+
+ da = (da + PI / 4) % PI - PI / 4
+ da /= PI
+
+ deltas = np.stack([dx, dy, dw, dh, da], axis=-1)
+ means = np.array(means, dtype=deltas.dtype)
+ stds = np.array(stds, dtype=deltas.dtype)
+ deltas = (deltas - means) / stds
+ deltas = deltas.astype(np.float32)
+ return deltas
+
+
+def bbox_decode(bbox_preds,
+ anchors,
+ means=[0, 0, 0, 0, 0],
+ stds=[1, 1, 1, 1, 1]):
+ """decode bbox from deltas
+ Args:
+ bbox_preds: [N,H,W,5]
+ anchors: [H*W,5]
+ return:
+ bboxes: [N,H,W,5]
+ """
+ means = paddle.to_tensor(means)
+ stds = paddle.to_tensor(stds)
+ num_imgs, H, W, _ = bbox_preds.shape
+ bboxes_list = []
+ for img_id in range(num_imgs):
+ bbox_pred = bbox_preds[img_id]
+ # bbox_pred.shape = [H, W, 5]
+ bbox_delta = bbox_pred
+ anchors = paddle.to_tensor(anchors)
+ bboxes = delta2rbox(
+ anchors, bbox_delta, means, stds, wh_ratio_clip=1e-6)
+ bboxes = paddle.reshape(bboxes, [H, W, 5])
+ bboxes_list.append(bboxes)
+ return paddle.stack(bboxes_list, axis=0)
+
+
+def poly_to_rbox(polys):
+ """
+ poly:[x0,y0,x1,y1,x2,y2,x3,y3]
+ to
+ rotated_boxes:[x_ctr,y_ctr,w,h,angle]
+ """
+ rotated_boxes = []
+ for poly in polys:
+ poly = np.array(poly[:8], dtype=np.float32)
+
+ pt1 = (poly[0], poly[1])
+ pt2 = (poly[2], poly[3])
+ pt3 = (poly[4], poly[5])
+ pt4 = (poly[6], poly[7])
+
+ edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[
+ 1]) * (pt1[1] - pt2[1]))
+ edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[
+ 1]) * (pt2[1] - pt3[1]))
+
+ width = max(edge1, edge2)
+ height = min(edge1, edge2)
+
+ rbox_angle = 0
+ if edge1 > edge2:
+ rbox_angle = np.arctan2(
+ float(pt2[1] - pt1[1]), float(pt2[0] - pt1[0]))
+ elif edge2 >= edge1:
+ rbox_angle = np.arctan2(
+ float(pt4[1] - pt1[1]), float(pt4[0] - pt1[0]))
+
+ def norm_angle(angle, range=[-np.pi / 4, np.pi]):
+ return (angle - range[0]) % range[1] + range[0]
+
+ rbox_angle = norm_angle(rbox_angle)
+
+ x_ctr = np.float(pt1[0] + pt3[0]) / 2
+ y_ctr = np.float(pt1[1] + pt3[1]) / 2
+ rotated_box = np.array([x_ctr, y_ctr, width, height, rbox_angle])
+ rotated_boxes.append(rotated_box)
+ ret_rotated_boxes = np.array(rotated_boxes)
+ assert ret_rotated_boxes.shape[1] == 5
+ return ret_rotated_boxes
+
+
+def cal_line_length(point1, point2):
+ import math
+ return math.sqrt(
+ math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2))
+
+
+def get_best_begin_point_single(coordinate):
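+ """Choose, among the 4 cyclic orderings of the corners, the one closest to
+ the axis-aligned order (top-left, top-right, bottom-right, bottom-left)."""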
+ x1, y1, x2, y2, x3, y3, x4, y4 = coordinate
+ xmin = min(x1, x2, x3, x4)
+ ymin = min(y1, y2, y3, y4)
+ xmax = max(x1, x2, x3, x4)
+ ymax = max(y1, y2, y3, y4)
+ combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]],
+ [[x4, y4], [x1, y1], [x2, y2], [x3, y3]],
+ [[x3, y3], [x4, y4], [x1, y1], [x2, y2]],
+ [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]]
+ dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]]
+ force = 100000000.0
+ force_flag = 0
+ for i in range(4):
+ temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \
+ + cal_line_length(combinate[i][1], dst_coordinate[1]) \
+ + cal_line_length(combinate[i][2], dst_coordinate[2]) \
+ + cal_line_length(combinate[i][3], dst_coordinate[3])
+ if temp_force < force:
+ force = temp_force
+ force_flag = i
+ return np.array(combinate[force_flag]).reshape(8)
+
+
+def rbox2poly_single(rrect):
+ """
+ rrect:[x_ctr,y_ctr,w,h,angle]
+ to
+ poly:[x0,y0,x1,y1,x2,y2,x3,y3]
+ """
+ x_ctr, y_ctr, width, height, angle = rrect[:5]
+ tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2
+ # rect 2x4
+ rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])
+ R = np.array([[np.cos(angle), -np.sin(angle)],
+ [np.sin(angle), np.cos(angle)]])
+ # poly
+ poly = R.dot(rect)
+ x0, x1, x2, x3 = poly[0, :4] + x_ctr
+ y0, y1, y2, y3 = poly[1, :4] + y_ctr
+ poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32)
+ poly = get_best_begin_point_single(poly)
+ return poly
+
+
+def rbox2poly(rrects):
+ """
+ rrect:[x_ctr,y_ctr,w,h,angle]
+ to
+ poly:[x0,y0,x1,y1,x2,y2,x3,y3]
+ """
+ polys = []
+ for rrect in rrects:
+ x_ctr, y_ctr, width, height, angle = rrect[:5]
+ tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2
+ rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])
+ R = np.array([[np.cos(angle), -np.sin(angle)],
+ [np.sin(angle), np.cos(angle)]])
+ poly = R.dot(rect)
+ x0, x1, x2, x3 = poly[0, :4] + x_ctr
+ y0, y1, y2, y3 = poly[1, :4] + y_ctr
+ poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32)
+ poly = get_best_begin_point_single(poly)
+ polys.append(poly)
+ polys = np.array(polys)
+ return polys
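+
+
+# Hedged round-trip sketch (illustrative values):
+#   quads = [[0., 0., 20., 0., 20., 10., 0., 10.]]   # one axis-aligned quad
+#   rboxes = poly_to_rbox(quads)                     # -> [[10., 5., 20., 10., 0.]]
+#   quads_back = rbox2poly(rboxes)                   # recovers the corner order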
diff --git a/paddlevideo/modeling/builder.py b/paddlevideo/modeling/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..71503eb4d52541a89e840d67a104476a59abfeff
--- /dev/null
+++ b/paddlevideo/modeling/builder.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..utils import build
+from .registry import (BACKBONES, BBOX_ASSIGNERS, BBOX_CODERS, BBOX_SAMPLERS,
+ DETECTORS, ESTIMATORS, HEADS, LOCALIZERS, LOSSES,
+ MULTIMODAL, PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS,
+ SEGMENT, SEGMENTERS)
+
+
+def build_backbone(cfg):
+ """Build backbone."""
+ return build(cfg, BACKBONES)
+
+
+def build_roi_extractor(cfg):
+ """Build roi extractor."""
+ return build(cfg, ROI_EXTRACTORS)
+
+
+def build_assigner(cfg, **default_args):
+ """Builder of box assigner."""
+ return build(cfg, BBOX_ASSIGNERS)
+
+
+def build_sampler(cfg, **default_args):
+ """Builder of box sampler."""
+ return build(cfg, BBOX_SAMPLERS)
+
+
+def build_head(cfg):
+ """Build head."""
+ return build(cfg, HEADS)
+
+
+def build_loss(cfg):
+ """Build loss."""
+ return build(cfg, LOSSES)
+
+
+def build_recognizer(cfg):
+ """Build recognizer."""
+ return build(cfg, RECOGNIZERS, key='framework')
+
+
+def build_segmenter(cfg):
+ """Build segmenter."""
+ return build(cfg, SEGMENTERS, key='framework')
+
+
+def build_localizer(cfg):
+ """Build localizer."""
+ return build(cfg, LOCALIZERS, key='framework')
+
+
+def build_detector(cfg, train_cfg=None, test_cfg=None):
+ """Build detector."""
+ return build(cfg, DETECTORS, key='framework')
+
+
+def build_partitioner(cfg):
+ """Build partitioner."""
+ return build(cfg, PARTITIONERS, key='framework')
+
+
+def build_estimator(cfg):
+ """Build estimator."""
+ return build(cfg, ESTIMATORS, key='framework')
+
+
+def build_multimodal(cfg):
+ """Build multimodal."""
+ return build(cfg, MULTIMODAL, key='framework')
+
+
+def build_segment(cfg):
+ """Build segment."""
+ return build(cfg, SEGMENT, key='framework')
+
+
+def build_model(cfg):
+ cfg_copy = cfg.copy()
+ framework_type = cfg_copy.get('framework')
+ if framework_type in RECOGNIZERS:
+ return build_recognizer(cfg)
+ elif framework_type in LOCALIZERS:
+ return build_localizer(cfg)
+ elif framework_type in PARTITIONERS:
+ return build_partitioner(cfg)
+ elif framework_type in DETECTORS:
+ return build_detector(cfg)
+ elif framework_type in ESTIMATORS:
+ return build_estimator(cfg)
+ elif framework_type in MULTIMODAL:
+ return build_multimodal(cfg)
+ elif framework_type in SEGMENTERS:
+ return build_segmenter(cfg)
+ elif framework_type in SEGMENT:
+ return build_segment(cfg)
+ else:
+ raise NotImplementedError
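+
+
+# Hedged usage sketch for build_model; the registered names below are
+# illustrative and depend on what this repository actually registers:
+#   cfg = {'framework': 'Recognizer2D',
+#          'backbone': {'name': 'ResNet', 'depth': 50},
+#          'head': {'name': 'TSNHead', 'num_classes': 400}}
+#   model = build_model(cfg)  # dispatches through RECOGNIZERS via build_recognizer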
diff --git a/paddlevideo/modeling/framework/__init__.py b/paddlevideo/modeling/framework/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d68fe09ac47d3160db6238279352bd1e4ed10dc4
--- /dev/null
+++ b/paddlevideo/modeling/framework/__init__.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .estimators import BaseEstimator, DepthEstimator
+from .localizers import BaseLocalizer, BMNLocalizer
+from .partitioners import BasePartitioner, TransNetV2Partitioner
+from .recognizers import BaseRecognizer, Recognizer2D
+from .multimodal import ActBert, BaseMultimodal
+from .segment import BaseSegment, CFBI
+from .segmenters import MSTCN
+
+__all__ = [
+ 'BaseRecognizer', 'Recognizer2D', 'BaseLocalizer', 'BMNLocalizer',
+ 'BasePartitioner', 'TransNetV2Partitioner', 'BaseEstimator',
+ 'DepthEstimator', 'BaseMultimodal', 'ActBert', 'BaseSegment', 'CFBI',
+ 'MSTCN'
+]
diff --git a/paddlevideo/modeling/framework/detectors/__init__.py b/paddlevideo/modeling/framework/detectors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..74dcac0a36e852ef331c733e543342c9e22b752d
--- /dev/null
+++ b/paddlevideo/modeling/framework/detectors/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from .base import BaseDetector
+from .fast_rcnn import FastRCNN
+from .two_stage import TwoStageDetector
+
+__all__ = ['BaseDetector', 'TwoStageDetector', 'FastRCNN']
diff --git a/paddlevideo/modeling/framework/detectors/base.py b/paddlevideo/modeling/framework/detectors/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d5ccb8feca0d8729e125aa6c8b13d076c742fb8
--- /dev/null
+++ b/paddlevideo/modeling/framework/detectors/base.py
@@ -0,0 +1,51 @@
+from abc import abstractmethod
+from ... import builder
+import paddle.nn as nn
+from ...registry import DETECTORS
+
+@DETECTORS.register()
+class BaseDetector(nn.Layer):
+ """Base class for detectors. """
+ def __init__(self, backbone=None, head=None):
+
+ super().__init__()
+
+ def init_weights(self):
+ """Initialize the model network weights. """
+ self.backbone.init_weights()
+ self.head.init_weights()
+
+ def extract_feature(self, imgs, iter_num):
+ """Extract features through a backbone. """
+ feature = self.backbone(imgs)
+ return feature
+
+ def forward(self, data_batch, mode='infer'):
+ if mode == 'train':
+ return self.train_step(data_batch)
+ elif mode == 'valid':
+ return self.val_step(data_batch)
+ elif mode == 'test':
+ return self.test_step(data_batch)
+ elif mode == 'infer':
+ return self.infer_step(data_batch)
+ else:
+ raise NotImplementedError
+
+ @abstractmethod
+ def train_step(self, data_batch, **kwargs):
+ """Training step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def val_step(self, data_batch, **kwargs):
+ """Validating step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def test_step(self, data_batch, **kwargs):
+ """Test step.
+ """
+ raise NotImplementedError
diff --git a/paddlevideo/modeling/framework/detectors/fast_rcnn.py b/paddlevideo/modeling/framework/detectors/fast_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8f912dbea0f3a1f5f1a4d1157f1d3ae01793afb
--- /dev/null
+++ b/paddlevideo/modeling/framework/detectors/fast_rcnn.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .two_stage import TwoStageDetector
+from ...registry import DETECTORS
+
+@DETECTORS.register()
+class FastRCNN(TwoStageDetector):
+
+ def __init__(self,
+ backbone,
+ head=None,
+ train_cfg=None,
+ test_cfg=None,
+ neck=None,
+ pretrained=None):
+ super(FastRCNN, self).__init__(
+ backbone=backbone,
+ neck=neck,
+ roi_head=head,
+ train_cfg=train_cfg,
+ test_cfg=test_cfg,
+ pretrained=pretrained)
diff --git a/paddlevideo/modeling/framework/detectors/two_stage.py b/paddlevideo/modeling/framework/detectors/two_stage.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9deb1d0fdd70131c17d05cda147ca6b7bdcc15a
--- /dev/null
+++ b/paddlevideo/modeling/framework/detectors/two_stage.py
@@ -0,0 +1,186 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+from ... import builder
+import paddle.distributed as dist
+from ...registry import DETECTORS
+from .base import BaseDetector
+
+
+@DETECTORS.register()
+class TwoStageDetector(BaseDetector):
+ """Base class for two-stage detectors. """
+
+ def __init__(self,
+ backbone,
+ neck=None,
+ rpn_head=None,
+ roi_head=None,
+ train_cfg=None,
+ test_cfg=None,
+ pretrained=None):
+ super(TwoStageDetector, self).__init__()
+ self.backbone = builder.build_backbone(backbone)
+
+ if neck is not None:
+ self.neck = neck # useless
+
+ if rpn_head is not None:
+ rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
+ rpn_head_ = rpn_head.copy()
+ rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn)
+ self.rpn_head = builder.build_head(rpn_head_)
+
+ if roi_head is not None:
+ self.roi_head = builder.build_head(roi_head)
+
+ self.train_cfg = train_cfg
+ self.test_cfg = test_cfg
+
+ if pretrained is not None:
+ self.init_weights(pretrained=pretrained)
+
+ @property
+ def with_rpn(self):
+ """whether the detector has RPN"""
+ return hasattr(self, 'rpn_head') and self.rpn_head is not None
+
+ @property
+ def with_roi_head(self):
+ """whether the detector has a RoI head"""
+ return hasattr(self, 'roi_head') and self.roi_head is not None
+
+ def init_weights(self, pretrained=None):
+ """Initialize the weights in detector. """
+ super(TwoStageDetector, self).init_weights(pretrained)
+ self.backbone.init_weights(pretrained=pretrained)
+ if self.with_rpn:
+ self.rpn_head.init_weights()
+ if self.with_roi_head:
+ self.roi_head.init_weights(pretrained)
+
+ def extract_feat(self, img):
+ """Directly extract features from the backbone."""
+ x = self.backbone(img)
+ return x
+
+ def train_step(self, data, **kwargs):
+ img_slow = data[0]
+ img_fast = data[1]
+ proposals, gt_bboxes, gt_labels, scores, entity_ids = self.get_unpad_datas(
+ data)
+ img_shape = data[7]
+ img_idx = data[8]
+ img_metas = scores, entity_ids
+ x = self.extract_feat(img=[img_slow, img_fast])
+ roi_losses = self.roi_head.train_step(x, img_metas, proposals,
+ gt_bboxes, gt_labels, **kwargs)
+ losses = dict()
+ losses.update(roi_losses)
+
+ return losses
+
+ def val_step(self, data, rescale=False):
+ img_slow = data[0]
+ img_fast = data[1]
+ proposals, gt_bboxes, gt_labels, scores, entity_ids = self.get_unpad_datas(
+ data)
+ img_shape = data[7]
+ img_metas = scores, entity_ids
+ x = self.extract_feat(img=[img_slow, img_fast])
+
+ return self.roi_head.simple_test(x,
+ proposals[0],
+ img_shape,
+ rescale=rescale)
+
+ def test_step(self, data, rescale=False):
+ return self.val_step(data, rescale)
+
+ def infer_step(self, data, rescale=False):
+ ''' model inference'''
+
+ img_slow = data[0]
+ img_fast = data[1]
+ proposals = data[2]
+ img_shape = data[3]
+
+ # using slowfast model to extract spatio-temporal features
+ x = self.extract_feat(img=[img_slow, img_fast])
+
+ ret = self.roi_head.simple_test(x,
+ proposals[0],
+ img_shape,
+ rescale=rescale)
+ return ret
+
+ def get_unpad_datas(self, data):
+ ''' Strip the padding added in the dataset and recover the original per-sample tensors. '''
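+ # Assumed batch layout (inferred from the indices used below):
+ # data[2..6] -> padded proposals, gt_bboxes, gt_labels, scores, entity_ids
+ # data[9..13] -> the matching valid lengths per sample, used to slice away
+ # the padding with paddle.index_select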
+ pad_proposals = data[2]
+ pad_gt_bboxes = data[3]
+ pad_gt_labels = data[4]
+ pad_scores, pad_entity_ids = data[5], data[6]
+ len_proposals = data[9]
+ len_gt_bboxes = data[10]
+ len_gt_labels = data[11]
+ len_scores = data[12]
+ len_entity_ids = data[13]
+ N = pad_proposals.shape[0]
+ proposals = []
+ gt_bboxes = []
+ gt_labels = []
+ scores = []
+ entity_ids = []
+ for bi in range(N):
+ pad_proposal = pad_proposals[bi]
+ len_proposal = len_proposals[bi]
+ index_proposal = paddle.arange(len_proposal)
+ proposal = paddle.index_select(x=pad_proposal,
+ index=index_proposal,
+ axis=0)
+ proposals.append(proposal)
+
+ pad_gt_bbox = pad_gt_bboxes[bi]
+ len_gt_bbox = len_gt_bboxes[bi]
+ index_gt_bbox = paddle.arange(len_gt_bbox)
+ gt_bbox = paddle.index_select(x=pad_gt_bbox,
+ index=index_gt_bbox,
+ axis=0)
+ gt_bboxes.append(gt_bbox)
+
+ pad_gt_label = pad_gt_labels[bi]
+ len_gt_label = len_gt_labels[bi]
+ index_gt_label = paddle.arange(len_gt_label)
+ gt_label = paddle.index_select(x=pad_gt_label,
+ index=index_gt_label,
+ axis=0)
+ gt_labels.append(gt_label)
+
+ pad_score = pad_scores[bi]
+ len_score = len_scores[bi]
+ index_score = paddle.arange(len_score)
+ score = paddle.index_select(x=pad_score, index=index_score, axis=0)
+ scores.append(score)
+
+ pad_entity_id = pad_entity_ids[bi]
+ len_entity_id = len_entity_ids[bi]
+ index_entity_id = paddle.arange(len_entity_id)
+ entity_id = paddle.index_select(x=pad_entity_id,
+ index=index_entity_id,
+ axis=0)
+ entity_ids.append(entity_id)
+
+ return proposals, gt_bboxes, gt_labels, scores, entity_ids
diff --git a/paddlevideo/modeling/framework/estimators/__init__.py b/paddlevideo/modeling/framework/estimators/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2bda935c2d8aff40f26e34e1f128ee6dafa7be1
--- /dev/null
+++ b/paddlevideo/modeling/framework/estimators/__init__.py
@@ -0,0 +1,4 @@
+from .base import BaseEstimator
+from .depth_estimator import DepthEstimator
+
+__all__ = ['DepthEstimator', 'BaseEstimator']
diff --git a/paddlevideo/modeling/framework/estimators/base.py b/paddlevideo/modeling/framework/estimators/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdddd674fbabf9bba136e6421770e385dfccb656
--- /dev/null
+++ b/paddlevideo/modeling/framework/estimators/base.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from abc import abstractmethod
+
+import paddle
+import paddle.nn as nn
+from paddlevideo.modeling.registry import ESTIMATORS
+from paddlevideo.utils import get_logger
+
+from ... import builder
+
+logger = get_logger("paddlevideo")
+
+
+@ESTIMATORS.register()
+class BaseEstimator(nn.Layer):
+ """BaseEstimator
+
+ """
+ def __init__(self, backbone=None, head=None):
+ super().__init__()
+ if backbone is not None:
+ self.backbone = builder.build_backbone(backbone)
+ if hasattr(self.backbone, 'init_weights'):
+ self.backbone.init_weights()
+ else:
+ self.backbone = None
+
+ if head is not None:
+ self.head_name = head.name
+ self.head = builder.build_head(head)
+ if hasattr(self.head, 'init_weights'):
+ self.head.init_weights()
+ else:
+ self.head = None
+
+ def forward(self, data_batch, mode='infer'):
+ """
+ 1. Define how the model is going to run, from input to output.
+ 2. Console of train, valid, test or infer step
+ """
+ if mode == 'train':
+ return self.train_step(data_batch)
+ elif mode == 'valid':
+ return self.val_step(data_batch)
+ elif mode == 'test':
+ return self.test_step(data_batch)
+ elif mode == 'infer':
+ return self.infer_step(data_batch)
+ else:
+ raise NotImplementedError
+
+ @abstractmethod
+ def train_step(self, data_batch):
+ """Define how the model is going to train, from input to output.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def val_step(self, data_batch):
+ """Define how the model is going to valid, from input to output."""
+ raise NotImplementedError
+
+ @abstractmethod
+ def test_step(self, data_batch):
+ """Define how the model is going to test, from input to output."""
+ raise NotImplementedError
+
+ @abstractmethod
+ def infer_step(self, data_batch):
+ """Define how the model is going to infer, from input to output."""
+ raise NotImplementedError
diff --git a/paddlevideo/modeling/framework/estimators/depth_estimator.py b/paddlevideo/modeling/framework/estimators/depth_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..13ee877756b02e16a9dda231d66dcf0b1af532e7
--- /dev/null
+++ b/paddlevideo/modeling/framework/estimators/depth_estimator.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import paddle
+from paddlevideo.modeling.framework.estimators.base import BaseEstimator
+from paddlevideo.modeling.registry import ESTIMATORS
+from paddlevideo.utils import get_logger
+
+from ... import builder
+
+logger = get_logger("paddlevideo")
+
+
+@ESTIMATORS.register()
+class DepthEstimator(BaseEstimator):
+ """DepthEstimator
+ """
+ def forward_net(self, inputs, day_or_night='day_and_night'):
+ if self.backbone is not None:
+ outputs = self.backbone(inputs, day_or_night)
+ else:
+ outputs = inputs
+ return outputs
+
+ def train_step(self, data_batch):
+ """Define how the model is going to train, from input to output.
+ """
+ inputs, _ = data_batch
+ outputs = self.forward_net(inputs, day_or_night='day_and_night')
+ loss_metrics = self.head.loss(inputs, outputs)
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ inputs, day_or_night = data_batch
+ outputs = self.forward_net(inputs, day_or_night=day_or_night)
+ loss_metrics = self.head.loss(inputs, outputs)
+ return loss_metrics
+
+ def test_step(self, data_batch):
+ """Define how the model is going to test, from input to output."""
+ inputs, day_or_night = data_batch
+ outputs = self.forward_net(inputs, day_or_night=day_or_night)
+ loss_metrics = self.head.loss(inputs, outputs)
+ return loss_metrics
+
+ def infer_step(self, data_batch):
+ """Define how the model is going to infer, from input to output."""
+ inputs = data_batch[0]
+ outputs = self.forward_net(inputs, day_or_night='day')
+ return outputs
diff --git a/paddlevideo/modeling/framework/localizers/__init__.py b/paddlevideo/modeling/framework/localizers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..52405444deabe42792d30d2a888b38005ee8f410
--- /dev/null
+++ b/paddlevideo/modeling/framework/localizers/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import BaseLocalizer
+from .bmn_localizer import BMNLocalizer
+
+__all__ = ['BaseLocalizer', 'BMNLocalizer']
diff --git a/paddlevideo/modeling/framework/localizers/base.py b/paddlevideo/modeling/framework/localizers/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfd2869f6da32ba35224fe36b347f072523d2587
--- /dev/null
+++ b/paddlevideo/modeling/framework/localizers/base.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import abstractmethod
+import paddle.nn as nn
+from ... import builder
+
+
+class BaseLocalizer(nn.Layer):
+ """Base class for Localization.
+ All localizer should subclass it.
+ All subclass should overwrite:
+ - Methods:``train_step``, define your train step.
+ - Methods:``valid_step``, define your valid step, always the same as train_step.
+ - Methods:``test_step``, define your test step.
+ """
+ def __init__(self, backbone, loss):
+ super().__init__()
+ self.backbone = builder.build_backbone(backbone)
+ self.loss = builder.build_loss(loss)
+ self.init_weights()
+
+ def init_weights(self):
+ """Initialize the model network weights. """
+ if hasattr(self.backbone, 'init_weights'):
+ self.backbone.init_weights()
+ else:
+ pass
+
+ def forward(self, data_batch, mode='infer'):
+ """
+ 1. Define how the model is going to run, from input to output.
+ 2. Console of train, valid, test or infer step
+ 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py
+ """
+ if mode == 'train':
+ return self.train_step(data_batch)
+ elif mode == 'valid':
+ return self.val_step(data_batch)
+ elif mode == 'test':
+ return self.test_step(data_batch)
+ elif mode == 'infer':
+ return self.infer_step(data_batch)
+ else:
+ raise NotImplementedError
+
+ @abstractmethod
+ def train_step(self, data_batch, **kwargs):
+ """Training step. input_data_batch -> loss_metric
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def val_step(self, data_batch, **kwargs):
+ """Validating setp. input_data_batch -> loss_metric
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def test_step(self, data_batch, **kwargs):
+ """Tets setp. to get acc in test data. input_data_batch -> output
+ """
+ raise NotImplementedError
diff --git a/paddlevideo/modeling/framework/localizers/bmn_localizer.py b/paddlevideo/modeling/framework/localizers/bmn_localizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5afbd3a0c1b635a299d3276cce59882aa2b0bf54
--- /dev/null
+++ b/paddlevideo/modeling/framework/localizers/bmn_localizer.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from ...registry import LOCALIZERS
+from .base import BaseLocalizer
+
+import paddle
+
+
+@LOCALIZERS.register()
+class BMNLocalizer(BaseLocalizer):
+ """BMN Localization framework
+ """
+ def forward_net(self, imgs):
+ """Call backbone forward.
+ """
+ preds = self.backbone(imgs)
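+ # preds is the (bm_confidence_map, start_prob, end_prob) triple produced
+ # by the BMN backbone, unpacked as pred_bm/pred_start/pred_end below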
+ return preds
+
+ def train_step(self, data_batch):
+ """Training step.
+ """
+ x_data = data_batch[0]
+ gt_iou_map = data_batch[1]
+ gt_start = data_batch[2]
+ gt_end = data_batch[3]
+ gt_iou_map.stop_gradient = True
+ gt_start.stop_gradient = True
+ gt_end.stop_gradient = True
+
+ # call Model forward
+ pred_bm, pred_start, pred_end = self.forward_net(x_data)
+ # call Loss forward
+ loss = self.loss(pred_bm, pred_start, pred_end, gt_iou_map, gt_start,
+ gt_end)
+ avg_loss = paddle.mean(loss)
+ loss_metrics = dict()
+ loss_metrics['loss'] = avg_loss
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ """Validating setp.
+ """
+ return self.train_step(data_batch)
+
+ def test_step(self, data_batch):
+ """Test step.
+ """
+ x_data = data_batch[0]
+ pred_bm, pred_start, pred_end = self.forward_net(x_data)
+ return pred_bm, pred_start, pred_end
+
+ def infer_step(self, data_batch):
+ """Infer step
+ """
+ x_data = data_batch[0]
+
+ # call Model forward
+ pred_bm, pred_start, pred_end = self.forward_net(x_data)
+ return pred_bm, pred_start, pred_end
diff --git a/paddlevideo/modeling/framework/multimodal/__init__.py b/paddlevideo/modeling/framework/multimodal/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1efec3d776857c8a09493ac1a2ee6e151e71434
--- /dev/null
+++ b/paddlevideo/modeling/framework/multimodal/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from .base import BaseMultimodal
+from .actbert import ActBert
+
+__all__ = ['BaseMultimodal', 'ActBert']
diff --git a/paddlevideo/modeling/framework/multimodal/actbert.py b/paddlevideo/modeling/framework/multimodal/actbert.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f2c074ff1b27b58ec70be3b3de1b532078792da
--- /dev/null
+++ b/paddlevideo/modeling/framework/multimodal/actbert.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from ...registry import MULTIMODAL
+from .base import BaseMultimodal
+import paddle
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@MULTIMODAL.register()
+class ActBert(BaseMultimodal):
+ """ActBert model framework."""
+ def forward_net(self, text_ids, action_feat, image_feat, image_loc,
+ token_type_ids, text_mask, image_mask, action_mask):
+ pred = self.backbone(text_ids, action_feat, image_feat, image_loc,
+ token_type_ids, text_mask, image_mask, action_mask)
+ return pred
+
+ def train_step(self, data_batch):
+ """For ActBert Dataset. Define how the model is going to train, from input to output.
+ """
+ text_ids, action_feat, image_feat, image_loc, \
+ token_type_ids, text_mask, image_mask, action_mask, \
+ text_labels, action_label, next_sentence_label, image_label, image_target = data_batch
+ loss_metrics = dict()
+ pred = self.backbone(text_ids, action_feat, image_feat, image_loc,
+ token_type_ids, text_mask, image_mask, action_mask)
+ prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = pred
+ total_loss = self.loss(prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score, \
+ text_labels, image_label, image_target, action_label, next_sentence_label)
+ loss_metrics['loss'] = paddle.mean(total_loss)
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ """For ActBert Dataset. Define how the model is going to val, from input to output.
+ """
+ return self.train_step(data_batch)
+
+ def test_step(self, data_batch):
+ """For MSR-VTT Dataset. Define how the model is going to test, from input to output."""
+ text_ids, action_feat, image_feat, image_loc, \
+ token_type_ids, text_mask, image_mask, action_mask = data_batch[:-1]
+ action_feat = action_feat.squeeze(0)
+ image_feat = image_feat.squeeze(0)
+ image_loc = image_loc.squeeze(0)
+ image_mask = image_mask.squeeze(0)
+ action_mask = action_mask.squeeze(0)
+ prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = self.forward_net(text_ids, \
+ action_feat, image_feat, image_loc, token_type_ids, text_mask, image_mask, action_mask)
+ return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score
+
+ def infer_step(self, data_batch):
+ pass
diff --git a/paddlevideo/modeling/framework/multimodal/base.py b/paddlevideo/modeling/framework/multimodal/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc57f9765bf247272c15c932eb319efff6c73566
--- /dev/null
+++ b/paddlevideo/modeling/framework/multimodal/base.py
@@ -0,0 +1,81 @@
+from abc import abstractmethod
+from ... import builder
+import paddle.nn as nn
+
+
+class BaseMultimodal(nn.Layer):
+ """Base class for Multimodal.
+
+ All Multimodal model should subclass it.
+ All subclass should overwrite:
+
+ - Methods:``train_step``, supporting to forward when training.
+ - Methods:``valid_step``, supporting to forward when validating.
+ - Methods:``test_step``, supporting to forward when testing.
+
+ Args:
+ backbone (dict): Backbone modules to extract feature.
+ head (dict): Head to process feature.
+ loss(dict): Loss function.
+
+ """
+ def __init__(self, backbone=None, head=None, loss=None):
+ super().__init__()
+ if backbone is not None:
+ self.backbone = builder.build_backbone(backbone)
+ if hasattr(self.backbone, 'init_weights'):
+ self.backbone.init_weights()
+ else:
+ self.backbone = None
+ if head is not None:
+ self.head_name = head.name
+ self.head = builder.build_head(head)
+ if hasattr(self.head, 'init_weights'):
+ self.head.init_weights()
+ else:
+ self.head = None
+ if loss is not None:
+ self.loss = builder.build_loss(loss)
+ else:
+ self.loss = None
+
+ def forward(self, data_batch, mode='infer'):
+ """
+ 1. Define how the model is going to run, from input to output.
+ 2. Console of train, valid, test or infer step
+ 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py
+ """
+ if mode == 'train':
+ return self.train_step(data_batch)
+ elif mode == 'valid':
+ return self.val_step(data_batch)
+ elif mode == 'test':
+ return self.test_step(data_batch)
+ elif mode == 'infer':
+ return self.infer_step(data_batch)
+ else:
+ raise NotImplementedError
+
+ @abstractmethod
+ def train_step(self, data_batch, **kwargs):
+ """Training step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def val_step(self, data_batch, **kwargs):
+ """Validating step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def test_step(self, data_batch, **kwargs):
+ """Test step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def infer_step(self, data_batch, **kwargs):
+ """Infer step.
+ """
+ raise NotImplementedError
diff --git a/paddlevideo/modeling/framework/partitioners/__init__.py b/paddlevideo/modeling/framework/partitioners/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c6de50a38db12ccb166644ee82778a2a9dbebf4
--- /dev/null
+++ b/paddlevideo/modeling/framework/partitioners/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import BasePartitioner
+from .transnetv2_partitioner import TransNetV2Partitioner
+
+__all__ = ['BasePartitioner', 'TransNetV2Partitioner']
diff --git a/paddlevideo/modeling/framework/partitioners/base.py b/paddlevideo/modeling/framework/partitioners/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7c925975a9343a69d061d30cd4e7730f68db3f2
--- /dev/null
+++ b/paddlevideo/modeling/framework/partitioners/base.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import abstractmethod
+import paddle.nn as nn
+from ... import builder
+
+
+class BasePartitioner(nn.Layer):
+ """Base class for Partition.
+ All partitioner should subclass it.
+ All subclass should overwrite:
+ - Methods:``train_step``, define your train step.
+ - Methods:``valid_step``, define your valid step, always the same as train_step.
+ - Methods:``test_step``, define your test step.
+ """
+ def __init__(self, backbone=None, head=None):
+ super().__init__()
+ if backbone is not None:
+ self.backbone = builder.build_backbone(backbone)
+ if hasattr(self.backbone, 'init_weights'):
+ self.backbone.init_weights()
+ else:
+ self.backbone = None
+ if head is not None:
+ self.head_name = head.name
+ self.head = builder.build_head(head)
+ if hasattr(self.head, 'init_weights'):
+ self.head.init_weights()
+ else:
+ self.head = None
+
+ def init_weights(self):
+ """Initialize the model network weights. """
+ if hasattr(self.backbone, 'init_weights'):
+ self.backbone.init_weights()
+ else:
+ pass
+
+ def forward(self, data_batch, mode='infer'):
+ """
+ 1. Define how the model is going to run, from input to output.
+ 2. Console of train, valid, test or infer step
+ 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py
+ """
+ if mode == 'train':
+ return self.train_step(data_batch)
+ elif mode == 'valid':
+ return self.val_step(data_batch)
+ elif mode == 'test':
+ return self.test_step(data_batch)
+ elif mode == 'infer':
+ return self.infer_step(data_batch)
+ else:
+ raise NotImplementedError
+
+ @abstractmethod
+ def train_step(self, data_batch, **kwargs):
+ """Training step. input_data_batch -> loss_metric
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def val_step(self, data_batch, **kwargs):
+ """Validating setp. input_data_batch -> loss_metric
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def test_step(self, data_batch, **kwargs):
+ """Tets setp. to get acc in test data. input_data_batch -> output
+ """
+ raise NotImplementedError
diff --git a/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py b/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3295068cdb39657bcd9c1b05817417b2061f69a
--- /dev/null
+++ b/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from ...registry import PARTITIONERS
+from .base import BasePartitioner
+
+import paddle
+
+
+@PARTITIONERS.register()
+class TransNetV2Partitioner(BasePartitioner):
+ """TransNetV2 Partitioner framework
+ """
+ def forward_net(self, imgs):
+ one_hot_pred = self.backbone(imgs)
+ return one_hot_pred
+
+ def train_step(self, data_batch):
+ """Define how the model is going to train, from input to output.
+ """
+ frame_sequence = data_batch[0]
+ one_hot_gt, many_hot_gt = data_batch[1:]
+ one_hot_pred = self.forward_net(frame_sequence)
+ dict_ = {}
+ if isinstance(one_hot_pred, tuple):
+ one_hot_pred, dict_ = one_hot_pred
+ many_hot_pred = dict_.get("many_hot", None)
+ comb_reg_loss = dict_.get("comb_reg_loss", None)
+ loss_metrics = self.head.loss(one_hot_pred, one_hot_gt,
+ many_hot_pred, many_hot_gt,
+ reg_losses={"comb_reg": comb_reg_loss})
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ frame_sequence = data_batch[0]
+ one_hot_gt, many_hot_gt = data_batch[1:]
+ one_hot_pred = self.forward_net(frame_sequence)
+ dict_ = {}
+ if isinstance(one_hot_pred, tuple):
+ one_hot_pred, dict_ = one_hot_pred
+ many_hot_pred = dict_.get("many_hot", None)
+ comb_reg_loss = dict_.get("comb_reg_loss", None)
+ loss_metrics = self.head.loss(one_hot_pred, one_hot_gt,
+ many_hot_pred, many_hot_gt,
+ reg_losses={"comb_reg": comb_reg_loss})
+ return loss_metrics
+
+ def test_step(self, data_batch):
+ """Define how the model is going to test, from input to output."""
+ # NOTE: (shipping) during testing the network does not call head.loss;
+ # test-time processing is handled in /paddlevideo/metrics
+ frame_sequence = data_batch[0]
+ one_hot_pred = self.forward_net(frame_sequence)
+ return one_hot_pred
+
+ def infer_step(self, data_batch):
+ """Define how the model is going to test, from input to output."""
+ frame_sequence = data_batch[0]
+ one_hot_pred = self.forward_net(frame_sequence)
+ return one_hot_pred
diff --git a/paddlevideo/modeling/framework/recognizers/__init__.py b/paddlevideo/modeling/framework/recognizers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e9ee1d656ff5d980c54ee101f286a71402ba3e1
--- /dev/null
+++ b/paddlevideo/modeling/framework/recognizers/__init__.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from .base import BaseRecognizer
+from .recognizer1d import Recognizer1D
+from .recognizer2d import Recognizer2D
+from .recognizer3d import Recognizer3D
+from .recognizer_transformer import RecognizerTransformer
+from .recognizer_gcn import RecognizerGCN
+from .recognizerMRI import RecognizerMRI
+from .recognizer3dMRI import Recognizer3DMRI
+from .recognizer_transformer_MRI import RecognizerTransformer_MRI
+from .recognizer_movinet_frame import MoViNetRecognizerFrame
+
+__all__ = [
+ 'BaseRecognizer', 'Recognizer1D', 'Recognizer2D', 'Recognizer3D',
+ 'RecognizerTransformer', 'RecognizerGCN', 'RecognizerMRI',
+ 'Recognizer3DMRI', 'RecognizerTransformer_MRI', 'MoViNetRecognizerFrame'
+]
diff --git a/paddlevideo/modeling/framework/recognizers/base.py b/paddlevideo/modeling/framework/recognizers/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf31caf04cb82472c579780d9575f2edf4c53e0e
--- /dev/null
+++ b/paddlevideo/modeling/framework/recognizers/base.py
@@ -0,0 +1,81 @@
+from abc import abstractmethod
+from ... import builder
+import paddle.nn as nn
+
+
+class BaseRecognizer(nn.Layer):
+ """Base class for recognizers.
+
+ All recognizers should subclass it.
+ All subclass should overwrite:
+
+ - Methods:``train_step``, supporting to forward when training.
+ - Methods:``valid_step``, supporting to forward when validating.
+ - Methods:``test_step``, supporting to forward when testing.
+
+ Args:
+ backbone (dict): Backbone modules to extract feature.
+ head (dict): Classification head to process feature.
+
+ """
+ def __init__(self, backbone=None, head=None, runtime_cfg=None):
+
+ super().__init__()
+ if backbone is not None:
+ self.backbone = builder.build_backbone(backbone)
+ if hasattr(self.backbone, 'init_weights'):
+ self.backbone.init_weights()
+ else:
+ self.backbone = None
+ if head is not None:
+ self.head_name = head.name
+ self.head = builder.build_head(head)
+ if hasattr(self.head, 'init_weights'):
+ self.head.init_weights()
+ else:
+ self.head = None
+
+ # Settings when the model is running,
+ # such as 'avg_type'
+ self.runtime_cfg = runtime_cfg
+
+ def forward(self, data_batch, mode='infer'):
+ """
+ 1. Define how the model is going to run, from input to output.
+ 2. Console of train, valid, test or infer step
+ 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py
+ """
+ if mode == 'train':
+ return self.train_step(data_batch)
+ elif mode == 'valid':
+ return self.val_step(data_batch)
+ elif mode == 'test':
+ return self.test_step(data_batch)
+ elif mode == 'infer':
+ return self.infer_step(data_batch)
+ else:
+ raise NotImplementedError
+
+ @abstractmethod
+ def train_step(self, data_batch, **kwargs):
+ """Training step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def val_step(self, data_batch, **kwargs):
+ """Validating step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def test_step(self, data_batch, **kwargs):
+ """Test step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def infer_step(self, data_batch, **kwargs):
+ """Infer step.
+ """
+ raise NotImplementedError
diff --git a/paddlevideo/modeling/framework/recognizers/recognizer1d.py b/paddlevideo/modeling/framework/recognizers/recognizer1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..3927b181efc71620860d56c6155778bb793673e8
--- /dev/null
+++ b/paddlevideo/modeling/framework/recognizers/recognizer1d.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from ...registry import RECOGNIZERS
+from .base import BaseRecognizer
+
+
+@RECOGNIZERS.register()
+class Recognizer1D(BaseRecognizer):
+ """1D recognizer model framework."""
+ def forward_net(self, imgs):
+ """Define how the model is going to train, from input to output.
+ """
+ lstm_logit, lstm_output = self.head(imgs)
+ return lstm_logit, lstm_output
+
+ def train_step(self, data_batch):
+ """Training step.
+ """
+ rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask, labels = data_batch
+ imgs = [(rgb_data, rgb_len, rgb_mask),
+ (audio_data, audio_len, audio_mask)]
+
+ # call forward
+ lstm_logit, lstm_output = self.forward_net(imgs)
+ loss = self.head.loss(lstm_logit, labels)
+ hit_at_one, perr, gap = self.head.metric(lstm_output, labels)
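+ # Hit@1, PERR and GAP are the usual YouTube-8M style multi-label metrics;
+ # the head is assumed to compute them from the LSTM output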
+ loss_metrics = dict()
+ loss_metrics['loss'] = loss
+ loss_metrics['hit_at_one'] = hit_at_one
+ loss_metrics['perr'] = perr
+ loss_metrics['gap'] = gap
+
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ """Validating setp.
+ """
+ return self.train_step(data_batch)
+
+ def test_step(self, data_batch):
+ """Testing setp.
+ """
+ return self.train_step(data_batch)
+
+ def infer_step(self, data_batch):
+ """Infering setp.
+ """
+ rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask = data_batch
+ imgs = [(rgb_data, rgb_len, rgb_mask),
+ (audio_data, audio_len, audio_mask)]
+ # call forward
+ lstm_logit, _ = self.forward_net(imgs)
+ return lstm_logit
diff --git a/paddlevideo/modeling/framework/recognizers/recognizer2d.py b/paddlevideo/modeling/framework/recognizers/recognizer2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8aa6619f0f78754b3f1bc85139625f1fe61e99f
--- /dev/null
+++ b/paddlevideo/modeling/framework/recognizers/recognizer2d.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from ...registry import RECOGNIZERS
+from .base import BaseRecognizer
+import paddle
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@RECOGNIZERS.register()
+class Recognizer2D(BaseRecognizer):
+ """2D recognizer model framework."""
+ def forward_net(self, imgs):
+ # NOTE: num_segs is a property of the dataset pipeline and is not known
+ # when the head is built, so recover it here from the input tensor
+ # before calling the head.
+ num_segs = imgs.shape[1] # imgs.shape=[N,T,C,H,W] in the most common case
+ imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:]))
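+ # e.g. a hypothetical [8, 16, 3, 224, 224] batch of 16-segment clips
+ # becomes [128, 3, 224, 224] here; the head folds the segments back
+ # together using num_segs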
+
+ if self.backbone is not None:
+ feature = self.backbone(imgs)
+ else:
+ feature = imgs
+
+ if self.head is not None:
+ cls_score = self.head(feature, num_segs)
+ else:
+ cls_score = None
+
+ return cls_score
+
+ def train_step(self, data_batch):
+ """Define how the model is going to train, from input to output.
+ """
+ imgs = data_batch[0]
+ labels = data_batch[1:]
+ cls_score = self.forward_net(imgs)
+ loss_metrics = self.head.loss(cls_score, labels)
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ imgs = data_batch[0]
+ labels = data_batch[1:]
+ cls_score = self.forward_net(imgs)
+ loss_metrics = self.head.loss(cls_score, labels, valid_mode=True)
+ return loss_metrics
+
+ def test_step(self, data_batch):
+ """Define how the model is going to test, from input to output."""
+ # NOTE: (shipping) during testing the network does not call head.loss;
+ # test-time processing is handled in /paddlevideo/metrics
+ imgs = data_batch[0]
+ cls_score = self.forward_net(imgs)
+ return cls_score
+
+ def infer_step(self, data_batch):
+ """Define how the model is going to test, from input to output."""
+ imgs = data_batch[0]
+ cls_score = self.forward_net(imgs)
+ return cls_score
diff --git a/paddlevideo/modeling/framework/recognizers/recognizer3d.py b/paddlevideo/modeling/framework/recognizers/recognizer3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fdabf58c65babc0ce896a0a25996c9f9dd3e240
--- /dev/null
+++ b/paddlevideo/modeling/framework/recognizers/recognizer3d.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from ...registry import RECOGNIZERS
+from .base import BaseRecognizer
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@RECOGNIZERS.register()
+class Recognizer3D(BaseRecognizer):
+ """3D Recognizer model framework.
+ """
+ def forward_net(self, imgs):
+ """Define how the model is going to run, from input to output.
+ """
+ feature = self.backbone(imgs)
+ cls_score = self.head(feature)
+ return cls_score
+
+ def train_step(self, data_batch):
+ """Training step.
+ """
+ imgs = data_batch[0:2]
+ labels = data_batch[2:]
+
+ # call forward
+ cls_score = self.forward_net(imgs)
+ loss_metrics = self.head.loss(cls_score, labels)
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ """Validating setp.
+ """
+ imgs = data_batch[0:2]
+ labels = data_batch[2:]
+
+ # call forward
+ cls_score = self.forward_net(imgs)
+ loss_metrics = self.head.loss(cls_score, labels, valid_mode=True)
+ return loss_metrics
+
+ def test_step(self, data_batch):
+ """Test step.
+ """
+ imgs = data_batch[0:2]
+ # call forward
+ cls_score = self.forward_net(imgs)
+
+ return cls_score
+
+ def infer_step(self, data_batch):
+ """Infer step.
+ """
+ imgs = data_batch[0:2]
+ # call forward
+ cls_score = self.forward_net(imgs)
+
+ return cls_score
diff --git a/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py b/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py
new file mode 100644
index 0000000000000000000000000000000000000000..9298491c0e9ab445be2dd56c853c9dfee12ec8af
--- /dev/null
+++ b/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from ...registry import RECOGNIZERS
+from .base import BaseRecognizer
+from paddlevideo.utils import get_logger
+import paddle
+
+logger = get_logger("paddlevideo")
+
+
+@RECOGNIZERS.register()
+class Recognizer3DMRI(BaseRecognizer):
+ """3D Recognizer model framework.
+ """
+ def forward_net(self, imgs):
+ """Define how the model is going to run, from input to output.
+ """
+
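+ # cast both inputs to float32 and add a singleton channel axis; the MRI
+ # volumes are assumed to arrive without a channel dimension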
+ imgs[0] = paddle.cast(imgs[0], "float32")
+ imgs[1] = paddle.cast(imgs[1], "float32")
+ imgs[0] = imgs[0].unsqueeze(1)
+ imgs[1] = imgs[1].unsqueeze(1)
+
+ feature = self.backbone(imgs)
+ cls_score = self.head(feature)
+ return cls_score
+
+ def train_step(self, data_batch):
+ """Training step.
+ """
+ imgs = data_batch[0:2]
+ labels = data_batch[2:]
+
+ # call forward
+ cls_score = self.forward_net(imgs)
+ cls_score = paddle.nn.functional.sigmoid(cls_score)
+ loss_metrics = self.head.loss(cls_score, labels, if_top5=False)
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ """Validating setp.
+ """
+ imgs = data_batch[0:2]
+ labels = data_batch[2:]
+
+ # call forward
+ cls_score = self.forward_net(imgs)
+ cls_score = paddle.nn.functional.sigmoid(cls_score)
+ loss_metrics = self.head.loss(cls_score,
+ labels,
+ valid_mode=True,
+ if_top5=False)
+ return loss_metrics
+
+ def test_step(self, data_batch):
+ """Test step.
+ """
+ imgs = data_batch[0:2]
+ # call forward
+ cls_score = self.forward_net(imgs)
+
+ return cls_score
+
+ def infer_step(self, data_batch):
+ """Infer step.
+ """
+ imgs = data_batch[0:2]
+ # call forward
+ cls_score = self.forward_net(imgs)
+
+ return cls_score
diff --git a/paddlevideo/modeling/framework/recognizers/recognizerMRI.py b/paddlevideo/modeling/framework/recognizers/recognizerMRI.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b1713e61bf23da23c0cc84174bbd211a4e19025
--- /dev/null
+++ b/paddlevideo/modeling/framework/recognizers/recognizerMRI.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from ...registry import RECOGNIZERS
+from .base import BaseRecognizer
+import paddle
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@RECOGNIZERS.register()
+class RecognizerMRI(BaseRecognizer):
+ """2D recognizer model framework."""
+ def forward_net(self, imgs):
+        # NOTE: num_segs is an attribute of the dataset phase and is not passed to the
+        # build_head phase, so it is recovered from imgs (a paddle.Tensor) here before
+        # calling self.head.
+        num_segs = imgs.shape[1]  # imgs.shape=[N,T,C,H,W] in the most common case
+ imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:]))
+        imgs = paddle.cast(imgs, "float32")  # ensure float input for the backbone
+ imgs = imgs.unsqueeze(1)
+
+        if self.backbone is not None:
+ feature = self.backbone(imgs)
+ else:
+ feature = imgs
+
+        if self.head is not None:
+ cls_score = self.head(feature, num_segs)
+ else:
+ cls_score = None
+
+ return cls_score
+
+ def train_step(self, data_batch):
+ """Define how the model is going to train, from input to output.
+ """
+ imgs = data_batch[0]
+ labels = data_batch[1:]
+ cls_score = self.forward_net(imgs)
+ cls_score = paddle.nn.functional.sigmoid(cls_score)
+ loss_metrics = self.head.loss(cls_score, labels, if_top5=False)
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ imgs = data_batch[0]
+ labels = data_batch[1:]
+ cls_score = self.forward_net(imgs)
+ cls_score = paddle.nn.functional.sigmoid(cls_score)
+ loss_metrics = self.head.loss(cls_score,
+ labels,
+ valid_mode=True,
+ if_top5=False)
+ return loss_metrics
+
+ def test_step(self, data_batch):
+ """Define how the model is going to test, from input to output."""
+        # NOTE: during testing the network does not call head.loss; test-time processing is handled in paddlevideo/metrics
+ imgs = data_batch[0]
+ cls_score = self.forward_net(imgs)
+ return cls_score
+
+ def infer_step(self, data_batch):
+ """Define how the model is going to test, from input to output."""
+ imgs = data_batch[0]
+ cls_score = self.forward_net(imgs)
+ return cls_score
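RecognizerMRI folds the segment dimension into the batch so a 2D backbone sees one single-channel slice at a time, while `num_segs` is recovered from the tensor so the head can regroup the per-slice features. A rough sketch of that reshaping under assumed shapes (the in-repo code uses the in-place `paddle.reshape_`):

```python
import paddle

imgs = paddle.rand([2, 8, 224, 224])                        # assumed [N, T, H, W] slice stack
num_segs = imgs.shape[1]                                    # T, read back from the tensor
folded = paddle.reshape(imgs, [-1] + list(imgs.shape[2:]))  # fold T into the batch
folded = folded.unsqueeze(1)                                # single channel -> [N*T, 1, H, W]
print(num_segs, folded.shape)                               # 8 [16, 1, 224, 224]
```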
diff --git a/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py b/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py
new file mode 100644
index 0000000000000000000000000000000000000000..281c5ac9e2bc9dd4341cc43548bd2fe14e04d518
--- /dev/null
+++ b/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from ...registry import RECOGNIZERS
+from .base import BaseRecognizer
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@RECOGNIZERS.register()
+class RecognizerGCN(BaseRecognizer):
+ """GCN Recognizer model framework.
+ """
+
+ def __init__(self,
+ backbone=None,
+ head=None,
+ runtime_cfg=None,
+ if_top5=True):
+ """
+ Args:
+ backbone (dict): Backbone modules to extract feature.
+ head (dict): Classification head to process feature.
+            if_top5 (bool): Whether to display top-5 accuracy during training/validation steps.
+ """
+ super(RecognizerGCN, self).__init__(backbone, head, runtime_cfg)
+ self.if_top5 = if_top5
+
+ def forward_net(self, data):
+ """Define how the model is going to run, from input to output.
+ """
+ feature = self.backbone(data)
+ cls_score = self.head(feature)
+ return cls_score
+
+ def train_step(self, data_batch):
+ """Training step.
+ """
+ data = data_batch[0]
+ label = data_batch[1:]
+
+ # call forward
+ cls_score = self.forward_net(data)
+ loss_metrics = self.head.loss(cls_score, label, if_top5=self.if_top5)
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ """Validating setp.
+ """
+ data = data_batch[0]
+ label = data_batch[1:]
+
+ # call forward
+ cls_score = self.forward_net(data)
+ loss_metrics = self.head.loss(cls_score,
+ label,
+ valid_mode=True,
+ if_top5=self.if_top5)
+ return loss_metrics
+
+ def test_step(self, data_batch):
+ """Test step.
+ """
+ data = data_batch[0]
+
+ # call forward
+ cls_score = self.forward_net(data)
+ return cls_score
+
+ def infer_step(self, data_batch):
+ """Infer step.
+ """
+ data = data_batch[0]
+
+ # call forward
+ cls_score = self.forward_net(data)
+ return cls_score
diff --git a/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py b/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ad2e149a3d67e85abf45c548034567a398e4ce8
--- /dev/null
+++ b/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import paddle
+
+from paddlevideo.utils import get_logger
+from .base import BaseRecognizer
+from ...registry import RECOGNIZERS
+
+logger = get_logger("paddlevideo")
+
+
+@RECOGNIZERS.register()
+class MoViNetRecognizerFrame(BaseRecognizer):
+
+ def forward_net(self, imgs):
+ """Define how the model is going to run, from input to output.
+ """
+ self.backbone.clean_activation_buffers()
+ outputs = self.backbone(imgs)
+ cls_score = self.head(outputs)
+ return cls_score
+
+ def train_step(self, data_batch):
+ """Training step.
+ """
+ imgs = data_batch[0]
+ labels = data_batch[1] #.astype("int64")
+ data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])
+ # call forward
+ cls_score = self.forward_net(data)
+ loss_metrics = self.head.loss_func(cls_score, labels)
+ top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1)
+ top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5)
+ output = {'loss': loss_metrics, 'top1': top1, 'top5': top5}
+ return output
+
+ def val_step(self, data_batch):
+ """Validating setp.
+ """
+ imgs = data_batch[0]
+ labels = data_batch[1] #.astype("int64")
+ data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])
+ # call forward
+ cls_score = self.forward_net(data)
+ loss_metrics = self.head.loss_func(cls_score, labels)
+ top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1)
+ top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5)
+ output = {'loss': loss_metrics, 'top1': top1, 'top5': top5}
+ return output
+
+ def test_step(self, data_batch):
+ """Test step.
+ """
+ imgs = data_batch[0]
+ data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])
+ # call forward
+ cls_score = self.forward_net(data)
+ return cls_score
+
+ def infer_step(self, data_batch):
+ """Infer step.
+ """
+ imgs = data_batch[0]
+ # call forward
+ data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])
+ cls_score = self.forward_net(data)
+
+ return cls_score
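MoViNetRecognizerFrame only reorders the clip layout before the forward pass and then reads top-1/top-5 accuracy straight from the class scores. A small illustration with assumed shapes; the random score tensor stands in for the real head output:

```python
import paddle

imgs = paddle.rand([2, 8, 3, 172, 172])               # assumed dataloader layout [N, T, C, H, W]
labels = paddle.randint(0, 400, [2, 1])               # [N, 1] int64 class ids
data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])   # MoViNet consumes [N, C, T, H, W]

cls_score = paddle.rand([2, 400])                     # stand-in for the head's logits
top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1)
top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5)
print(data.shape, float(top1), float(top5))
```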
diff --git a/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py b/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4144edacfbbf4ea569498f741b6c8b0ff07ed7b1
--- /dev/null
+++ b/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py
@@ -0,0 +1,98 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import paddle
+import paddle.nn.functional as F
+from paddlevideo.utils import get_logger
+
+from ...registry import RECOGNIZERS
+from .base import BaseRecognizer
+
+logger = get_logger("paddlevideo")
+
+
+@RECOGNIZERS.register()
+class RecognizerTransformer(BaseRecognizer):
+ """Transformer's recognizer model framework."""
+ def forward_net(self, imgs):
+ # imgs.shape=[N,C,T,H,W], for transformer case
+ if self.backbone is not None:
+ feature = self.backbone(imgs)
+ else:
+ feature = imgs
+
+ if self.head is not None:
+ cls_score = self.head(feature)
+ else:
+ cls_score = None
+
+ return cls_score
+
+ def train_step(self, data_batch):
+ """Define how the model is going to train, from input to output.
+ """
+ imgs = data_batch[0]
+ labels = data_batch[1:]
+ cls_score = self.forward_net(imgs)
+ loss_metrics = self.head.loss(cls_score, labels)
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ imgs = data_batch[0]
+ labels = data_batch[1:]
+ cls_score = self.forward_net(imgs)
+ loss_metrics = self.head.loss(cls_score, labels, valid_mode=True)
+ return loss_metrics
+
+ def test_step(self, data_batch):
+ """Define how the model is going to infer, from input to output."""
+ imgs = data_batch[0]
+ num_views = imgs.shape[2] // self.runtime_cfg.test.num_seg
+ cls_score = []
+ for i in range(num_views):
+ view = imgs[:, :, i * self.runtime_cfg.test.num_seg:(i + 1) *
+ self.runtime_cfg.test.num_seg]
+ cls_score.append(self.forward_net(view))
+ cls_score = self._average_view(cls_score,
+ self.runtime_cfg.test.avg_type)
+ return cls_score
+
+ def infer_step(self, data_batch):
+ """Define how the model is going to infer, from input to output."""
+ imgs = data_batch[0]
+ num_views = imgs.shape[2] // self.runtime_cfg.test.num_seg
+ cls_score = []
+ for i in range(num_views):
+ view = imgs[:, :, i * self.runtime_cfg.test.num_seg:(i + 1) *
+ self.runtime_cfg.test.num_seg]
+ cls_score.append(self.forward_net(view))
+ cls_score = self._average_view(cls_score,
+ self.runtime_cfg.test.avg_type)
+ return cls_score
+
+ def _average_view(self, cls_score, avg_type='score'):
+ """Combine the predicted results of different views
+
+ Args:
+ cls_score (list): results of multiple views
+ avg_type (str, optional): Average calculation method. Defaults to 'score'.
+ """
+ assert avg_type in ['score', 'prob'], \
+ f"Currently only the average of 'score' or 'prob' is supported, but got {avg_type}"
+ if avg_type == 'score':
+ return paddle.add_n(cls_score) / len(cls_score)
+ elif avg_type == 'prob':
+ return paddle.add_n(
+ [F.softmax(score, axis=-1)
+ for score in cls_score]) / len(cls_score)
+ else:
+ raise NotImplementedError
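At test time RecognizerTransformer splits the temporal axis into `num_views` clips of `num_seg` frames and fuses the per-view predictions with `_average_view`. A hedged sketch of that flow, with random tensors standing in for real frames and head outputs:

```python
import paddle
import paddle.nn.functional as F

num_seg, num_views = 8, 3
imgs = paddle.rand([1, 3, num_seg * num_views, 224, 224])    # assumed [N, C, T_total, H, W]

views = [imgs[:, :, i * num_seg:(i + 1) * num_seg] for i in range(num_views)]
scores = [paddle.rand([1, 400]) for _ in views]              # stand-ins for forward_net outputs

avg_score = paddle.add_n(scores) / len(scores)                                  # avg_type='score'
avg_prob = paddle.add_n([F.softmax(s, axis=-1) for s in scores]) / len(scores)  # avg_type='prob'
print(views[0].shape, avg_score.shape, avg_prob.shape)
```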
diff --git a/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py b/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8696b4da591d5d5f3bdedf1dd9fc29d8ad5710b
--- /dev/null
+++ b/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import paddle
+import paddle.nn.functional as F
+from paddlevideo.utils import get_logger
+
+from ...registry import RECOGNIZERS
+from .base import BaseRecognizer
+
+logger = get_logger("paddlevideo")
+
+
+@RECOGNIZERS.register()
+class RecognizerTransformer_MRI(BaseRecognizer):
+ """Transformer's recognizer model framework."""
+ def forward_net(self, imgs):
+ # imgs.shape=[N,C,T,H,W], for transformer case
+
+        imgs = paddle.cast(imgs, "float32")  # ensure float input for the backbone
+ imgs = imgs.unsqueeze(1)
+
+        if self.backbone is not None:
+ feature = self.backbone(imgs)
+ else:
+ feature = imgs
+
+        if self.head is not None:
+ cls_score = self.head(feature)
+ else:
+ cls_score = None
+
+ return cls_score
+
+ def train_step(self, data_batch):
+ """Define how the model is going to train, from input to output.
+ """
+ imgs = data_batch[0]
+ labels = data_batch[1:]
+ cls_score = self.forward_net(imgs)
+ cls_score = paddle.nn.functional.sigmoid(cls_score)
+ loss_metrics = self.head.loss(cls_score, labels, if_top5=False)
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ imgs = data_batch[0]
+ labels = data_batch[1:]
+ cls_score = self.forward_net(imgs)
+ cls_score = paddle.nn.functional.sigmoid(cls_score)
+ loss_metrics = self.head.loss(cls_score,
+ labels,
+ valid_mode=True,
+ if_top5=False)
+ return loss_metrics
+
+ def test_step(self, data_batch):
+ """Define how the model is going to infer, from input to output."""
+ imgs = data_batch[0]
+ num_views = imgs.shape[2] // self.backbone.seg_num
+ cls_score = []
+ for i in range(num_views):
+ view = imgs[:, :, i * self.backbone.seg_num:(i + 1) *
+ self.backbone.seg_num]
+ cls_score.append(self.forward_net(view))
+ cls_score = self.average_view(cls_score)
+ return cls_score
+
+ def infer_step(self, data_batch):
+ """Define how the model is going to infer, from input to output."""
+ imgs = data_batch[0]
+ num_views = imgs.shape[2] // self.backbone.seg_num
+ cls_score = []
+ for i in range(num_views):
+ view = imgs[:, :, i * self.backbone.seg_num:(i + 1) *
+ self.backbone.seg_num]
+ cls_score.append(self.forward_net(view))
+ cls_score = self.average_view(cls_score)
+ return cls_score
+
+ def average_view(self, cls_score, average_type='score'):
+ """Combine the scores of different views
+
+ Args:
+ cls_score (list): Scores of multiple views
+ average_type (str, optional): Average calculation method. Defaults to 'score'.
+ """
+ assert average_type in ['score', 'prob'], \
+ f"Currently only the average of 'score' or 'prob' is supported, but got {average_type}"
+ if average_type == 'score':
+ return paddle.add_n(cls_score) / len(cls_score)
+        elif average_type == 'prob':
+            return paddle.add_n([F.softmax(score, axis=-1)
+                                 for score in cls_score]) / len(cls_score)
+ else:
+ raise NotImplementedError
diff --git a/paddlevideo/modeling/framework/segment/__init__.py b/paddlevideo/modeling/framework/segment/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..28a1d2e15a23814cca03e21d515124fba25e3ae9
--- /dev/null
+++ b/paddlevideo/modeling/framework/segment/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from .base import BaseSegment
+from .cfbi import CFBI
+
+__all__ = ['BaseSegment', 'CFBI']
diff --git a/paddlevideo/modeling/framework/segment/base.py b/paddlevideo/modeling/framework/segment/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c5cb07f76a100958d21d88e8c0c795489f34497
--- /dev/null
+++ b/paddlevideo/modeling/framework/segment/base.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from abc import abstractmethod
+from ... import builder
+import paddle.nn as nn
+
+
+class BaseSegment(nn.Layer):
+ """Base class for semi-Video Object Segmentation.
+ All subclass should overwrite:
+
+ - Methods:``train_step``, supporting to forward when training.
+ - Methods:``valid_step``, supporting to forward when validating.
+ - Methods:``test_step``, supporting to forward when testing.
+
+ Args:
+ backbone (dict): Backbone modules to extract feature.
+ head (dict): Head to process feature.
+ loss(dict): Loss function.
+ """
+ def __init__(self, backbone=None, head=None, loss=None):
+ super().__init__()
+ if backbone is not None:
+ self.backbone = builder.build_backbone(backbone)
+ if hasattr(self.backbone, 'init_weights'):
+ self.backbone.init_weights()
+ else:
+ self.backbone = None
+ if head is not None:
+ self.head_name = head.name
+ self.head = builder.build_head(head)
+ if hasattr(self.head, 'init_weights'):
+ self.head.init_weights()
+ else:
+ self.head = None
+ if loss is not None:
+ self.loss = builder.build_loss(loss)
+ else:
+ self.loss = None
+
+ def forward(self, data_batch, mode='infer'):
+ """
+ 1. Define how the model is going to run, from input to output.
+        2. Dispatch to the train, valid, test or infer step according to `mode`.
+        3. mode='infer' is used when saving the inference model, refer to tools/export_model.py.
+ """
+ if mode == 'train':
+ return self.train_step(data_batch)
+ elif mode == 'valid':
+ return self.val_step(data_batch)
+ elif mode == 'test':
+ return self.test_step(data_batch)
+ elif mode == 'infer':
+ return self.infer_step(data_batch)
+ else:
+ raise NotImplementedError
+
+ @abstractmethod
+ def train_step(self, data_batch, **kwargs):
+ """Training step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def val_step(self, data_batch, **kwargs):
+ """Validating step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def test_step(self, data_batch, **kwargs):
+ """Test step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def infer_step(self, data_batch, **kwargs):
+ """Infer step.
+ """
+ raise NotImplementedError
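BaseSegment centralizes the mode dispatch: the training loop always calls the model with a `mode` string and the base class routes to the matching `*_step` hook. A self-contained toy subclass showing just that pattern (not the real class, which also builds the backbone, head and loss from config dicts):

```python
import paddle.nn as nn

class ToySegment(nn.Layer):
    """Toy stand-in illustrating the mode-based dispatch of BaseSegment."""

    def forward(self, data_batch, mode='infer'):
        if mode == 'train':
            return self.train_step(data_batch)
        elif mode == 'valid':
            return self.val_step(data_batch)
        elif mode == 'test':
            return self.test_step(data_batch)
        elif mode == 'infer':
            return self.infer_step(data_batch)
        raise NotImplementedError

    def train_step(self, data_batch):
        return {'loss': len(data_batch)}

    def val_step(self, data_batch):
        return {'loss': len(data_batch)}

    def test_step(self, data_batch):
        return data_batch

    def infer_step(self, data_batch):
        return data_batch

print(ToySegment()([0, 1, 2], mode='train'))   # {'loss': 3}
```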
diff --git a/paddlevideo/modeling/framework/segment/cfbi.py b/paddlevideo/modeling/framework/segment/cfbi.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcdc512f032a57ee11c5416aa0eb9a1b322e8bbc
--- /dev/null
+++ b/paddlevideo/modeling/framework/segment/cfbi.py
@@ -0,0 +1,286 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+
+from .utils import foreground2background, global_matching_for_eval, local_matching, calculate_attention_head_for_eval
+from ...registry import SEGMENT
+from .base import BaseSegment
+from paddlevideo.utils import get_logger
+
+logger = get_logger("paddlevideo")
+
+
+@SEGMENT.register()
+class CFBI(BaseSegment):
+ """CFBI model framework."""
+ def __init__(self, backbone=None, head=None, loss=None):
+ super().__init__(backbone, head, loss)
+ x1 = paddle.zeros([3, 1, 1, 1])
+ self.bg_bias = paddle.create_parameter(
+ shape=x1.shape,
+ dtype=x1.dtype,
+ default_initializer=nn.initializer.Assign(x1))
+ self.fg_bias = paddle.create_parameter(
+ shape=x1.shape,
+ dtype=x1.dtype,
+ default_initializer=nn.initializer.Assign(x1))
+ self.epsilon = 1e-05
+
+ def test_step(self, data_batch):
+ """Define how the model is going to test, from input to output.
+ """
+ self.test_mode = True
+ ref_embeddings, ref_masks, prev_embedding, prev_mask, current_frame, pred_size, gt_ids = data_batch
+ current_frame_embedding_4x, current_frame_embedding_8x, current_frame_embedding_16x, \
+ current_low_level = self.backbone(current_frame)
+
+ current_frame_embedding = [
+ current_frame_embedding_4x, current_frame_embedding_8x,
+ current_frame_embedding_16x
+ ]
+
+ if prev_embedding is None:
+ return None, current_frame_embedding
+ else:
+ bs, c, h, w = current_frame_embedding_4x.shape
+
+ tmp_dic, _ = self.before_seghead_process(
+ ref_embeddings,
+ prev_embedding,
+ current_frame_embedding,
+ ref_masks,
+ prev_mask,
+ gt_ids,
+ current_low_level=current_low_level,
+ )
+ all_pred = []
+ for i in range(bs):
+ pred = tmp_dic[i]
+
+ pred = F.interpolate(pred,
+ size=[pred_size[0], pred_size[1]],
+ mode='bilinear',
+ align_corners=True)
+ all_pred.append(pred)
+ all_pred = paddle.concat(all_pred, axis=0)
+ all_pred = F.softmax(all_pred, axis=1)
+ return all_pred, current_frame_embedding
+
+ def before_seghead_process(self,
+ ref_frame_embeddings=None,
+ previous_frame_embeddings=None,
+ current_frame_embeddings=None,
+ ref_frame_labels=None,
+ previous_frame_mask=None,
+ gt_ids=None,
+ current_low_level=None):
+ """ process befor segmentation head"""
+ TEST_GLOBAL_MATCHING_CHUNK = [4, 1, 1]
+ TEST_GLOBAL_ATROUS_RATE = [2, 1, 1]
+ TRAIN_LOCAL_ATROUS_RATE = [2, 1, 1]
+ TEST_LOCAL_ATROUS_RATE = [2, 1, 1]
+ MODEL_FLOAT16_MATCHING = False
+ TEST_GLOBAL_MATCHING_MIN_PIXEL = 100
+ MODEL_MULTI_LOCAL_DISTANCE = [[4, 8, 12, 16, 20, 24],
+ [2, 4, 6, 8, 10, 12], [2, 4, 6, 8, 10]]
+ TRAIN_LOCAL_PARALLEL = True
+ TEST_LOCAL_PARALLEL = True
+ MODEL_MATCHING_BACKGROUND = True
+ MODEL_SEMANTIC_MATCHING_DIM = [32, 64, 128]
+
+ dic_tmp = []
+ boards = {}
+ scale_ref_frame_labels = []
+ scale_previous_frame_labels = []
+ for current_frame_embedding in current_frame_embeddings:
+ bs, c, h, w = current_frame_embedding.shape
+ if not self.test_mode:
+ raise NotImplementedError
+ else:
+ ref_frame_embeddings = list(zip(*ref_frame_embeddings))
+ all_scale_ref_frame_label = []
+ for ref_frame_label in ref_frame_labels:
+ scale_ref_frame_label = paddle.cast(F.interpolate(
+ paddle.cast(ref_frame_label, dtype="float32"),
+ size=(h, w),
+ mode='nearest'),
+ dtype="int32")
+ all_scale_ref_frame_label.append(scale_ref_frame_label)
+ scale_ref_frame_labels.append(all_scale_ref_frame_label)
+ scale_previous_frame_label = paddle.cast(F.interpolate(
+ paddle.cast(previous_frame_mask, dtype="float32"),
+ size=(h, w),
+ mode='nearest'),
+ dtype="int32")
+ scale_previous_frame_labels.append(scale_previous_frame_label)
+ for n in range(bs):
+ ref_obj_ids = paddle.reshape(
+ paddle.cast(paddle.arange(0,
+ np.array(gt_ids)[n] + 1),
+ dtype="int32"), [-1, 1, 1, 1])
+ obj_num = ref_obj_ids.shape[0]
+ low_level_feat = paddle.unsqueeze(current_low_level[n], axis=0)
+ all_CE_input = []
+ all_attention_head = []
+ for scale_idx, current_frame_embedding, ref_frame_embedding, previous_frame_embedding, \
+ scale_ref_frame_label, scale_previous_frame_label in zip(range(3), \
+ current_frame_embeddings, ref_frame_embeddings, previous_frame_embeddings, \
+ scale_ref_frame_labels, scale_previous_frame_labels):
+ #Prepare
+ seq_current_frame_embedding = current_frame_embedding[n]
+ seq_prev_frame_embedding = previous_frame_embedding[n]
+ seq_previous_frame_label = paddle.cast(
+ (paddle.cast(scale_previous_frame_label[n], dtype="int32")
+ == ref_obj_ids),
+ dtype="float32")
+ if np.array(gt_ids)[n] > 0:
+ dis_bias = paddle.concat([
+ paddle.unsqueeze(self.bg_bias[scale_idx], axis=0),
+ paddle.expand(
+ paddle.unsqueeze(self.fg_bias[scale_idx], axis=0),
+ [np.array(gt_ids)[n], -1, -1, -1])
+ ],
+ axis=0)
+ else:
+ dis_bias = paddle.unsqueeze(self.bg_bias[scale_idx], axis=0)
+ #Global FG map
+ matching_dim = MODEL_SEMANTIC_MATCHING_DIM[scale_idx]
+ seq_current_frame_embedding_for_matching = paddle.transpose(
+ seq_current_frame_embedding[:matching_dim], [1, 2, 0])
+
+ if not self.test_mode:
+ raise NotImplementedError
+ else:
+ all_scale_ref_frame_label = scale_ref_frame_label
+ all_ref_frame_embedding = ref_frame_embedding
+ all_reference_embeddings = []
+ all_reference_labels = []
+ seq_ref_frame_labels = []
+ count = 0
+ for idx in range(len(all_scale_ref_frame_label)):
+
+ ref_frame_embedding = all_ref_frame_embedding[idx]
+ scale_ref_frame_label = all_scale_ref_frame_label[idx]
+
+ seq_ref_frame_embedding = ref_frame_embedding[n]
+ seq_ref_frame_embedding = paddle.transpose(
+ seq_ref_frame_embedding, [1, 2, 0])
+ seq_ref_frame_label = paddle.cast(
+ (paddle.cast(scale_ref_frame_label[n],
+ dtype="int32") == ref_obj_ids),
+ dtype="float32")
+ seq_ref_frame_labels.append(seq_ref_frame_label)
+ seq_ref_frame_label = paddle.transpose(
+ paddle.squeeze(seq_ref_frame_label, axis=1),
+ [1, 2, 0])
+ all_reference_embeddings.append(
+ seq_ref_frame_embedding[:, :, :matching_dim])
+ all_reference_labels.append(seq_ref_frame_label)
+ global_matching_fg = global_matching_for_eval(
+ all_reference_embeddings=all_reference_embeddings,
+ query_embeddings=
+ seq_current_frame_embedding_for_matching,
+ all_reference_labels=all_reference_labels,
+ n_chunks=TEST_GLOBAL_MATCHING_CHUNK[scale_idx],
+ dis_bias=dis_bias,
+ atrous_rate=TEST_GLOBAL_ATROUS_RATE[scale_idx],
+ use_float16=MODEL_FLOAT16_MATCHING,
+ atrous_obj_pixel_num=TEST_GLOBAL_MATCHING_MIN_PIXEL)
+
+ # Local FG map
+ seq_prev_frame_embedding_for_matching = paddle.transpose(
+ seq_prev_frame_embedding[:matching_dim], [1, 2, 0])
+ seq_previous_frame_label_for_matching = paddle.transpose(
+ paddle.squeeze(seq_previous_frame_label, axis=1), [1, 2, 0])
+ local_matching_fg = local_matching(
+ prev_frame_embedding=seq_prev_frame_embedding_for_matching,
+ query_embedding=seq_current_frame_embedding_for_matching,
+ prev_frame_labels=seq_previous_frame_label_for_matching,
+ multi_local_distance=MODEL_MULTI_LOCAL_DISTANCE[scale_idx],
+ dis_bias=dis_bias,
+ atrous_rate=TRAIN_LOCAL_ATROUS_RATE[scale_idx] if
+ not self.test_mode else TEST_LOCAL_ATROUS_RATE[scale_idx],
+ use_float16=MODEL_FLOAT16_MATCHING,
+ allow_downsample=False,
+ allow_parallel=TRAIN_LOCAL_PARALLEL
+ if not self.test_mode else TEST_LOCAL_PARALLEL)
+
+ #Aggregate Pixel-level Matching
+ to_cat_global_matching_fg = paddle.transpose(
+ paddle.squeeze(global_matching_fg, axis=0), [2, 3, 0, 1])
+ to_cat_local_matching_fg = paddle.transpose(
+ paddle.squeeze(local_matching_fg, axis=0), [2, 3, 0, 1])
+ all_to_cat = [
+ to_cat_global_matching_fg, to_cat_local_matching_fg,
+ seq_previous_frame_label
+ ]
+
+ #Global and Local BG map
+ if MODEL_MATCHING_BACKGROUND:
+ to_cat_global_matching_bg = foreground2background(
+ to_cat_global_matching_fg,
+ np.array(gt_ids)[n] + 1)
+ reshaped_prev_nn_feature_n = paddle.unsqueeze(
+ paddle.transpose(to_cat_local_matching_fg,
+ [0, 2, 3, 1]),
+ axis=1)
+ to_cat_local_matching_bg = foreground2background(
+ reshaped_prev_nn_feature_n,
+ np.array(gt_ids)[n] + 1)
+ to_cat_local_matching_bg = paddle.squeeze(paddle.transpose(
+ to_cat_local_matching_bg, [0, 4, 2, 3, 1]),
+ axis=-1)
+ all_to_cat += [
+ to_cat_local_matching_bg, to_cat_global_matching_bg
+ ]
+
+ to_cat_current_frame_embedding = paddle.expand(
+ paddle.unsqueeze(current_frame_embedding[n], axis=0),
+ [obj_num, -1, -1, -1])
+ to_cat_prev_frame_embedding = paddle.expand(
+ paddle.unsqueeze(previous_frame_embedding[n], axis=0),
+ [obj_num, -1, -1, -1])
+ to_cat_prev_frame_embedding_fg = to_cat_prev_frame_embedding * seq_previous_frame_label
+ to_cat_prev_frame_embedding_bg = to_cat_prev_frame_embedding * (
+ 1 - seq_previous_frame_label)
+ all_to_cat += [
+ to_cat_current_frame_embedding,
+ to_cat_prev_frame_embedding_fg,
+ to_cat_prev_frame_embedding_bg
+ ]
+
+ CE_input = paddle.concat(all_to_cat, axis=1)
+ #Instance-level Attention
+ if not self.test_mode:
+ raise NotImplementedError
+ else:
+ attention_head = calculate_attention_head_for_eval(
+ all_ref_frame_embedding,
+ seq_ref_frame_labels,
+ paddle.expand(
+ paddle.unsqueeze(previous_frame_embedding[n],
+ axis=0), [obj_num, -1, -1, -1]),
+ seq_previous_frame_label,
+ epsilon=self.epsilon)
+
+ all_CE_input.append(CE_input)
+ all_attention_head.append(attention_head)
+
+ #Collaborative Ensembler
+ pred = self.head(all_CE_input, all_attention_head, low_level_feat)
+ dic_tmp.append(pred)
+
+ return dic_tmp, boards
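A pattern that recurs throughout `before_seghead_process` is turning an integer label map into one binary mask per object by broadcasting it against a column of object ids (`ref_obj_ids`), with background id 0 included. A tiny hedged example of just that expansion, on toy data:

```python
import paddle

gt_id = 2                                                    # assumed number of foreground objects
label = paddle.to_tensor([[0, 1], [2, 1]], dtype="int32")    # toy [H, W] id map
label = paddle.reshape(label, [1, 1, 2, 2])                  # [1, 1, H, W]
ref_obj_ids = paddle.reshape(paddle.arange(0, gt_id + 1, dtype="int32"), [-1, 1, 1, 1])
masks = paddle.cast(label == ref_obj_ids, "float32")         # one binary map per object id
print(masks.shape)                                           # [3, 1, 2, 2]
```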
diff --git a/paddlevideo/modeling/framework/segment/utils.py b/paddlevideo/modeling/framework/segment/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ec3be4d2ec1c41058d75ed54bd5c72a7f894487
--- /dev/null
+++ b/paddlevideo/modeling/framework/segment/utils.py
@@ -0,0 +1,754 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+def foreground2background(dis, obj_num):
+ if obj_num == 1:
+ return dis
+ bg_dis = []
+ for i in range(obj_num):
+ obj_back = []
+ for j in range(obj_num):
+ if i == j:
+ continue
+ obj_back.append(paddle.unsqueeze(dis[j], axis=0))
+ obj_back = paddle.concat(x=obj_back, axis=1)
+ obj_back = paddle.min(x=obj_back, axis=1, keepdim=True)
+ bg_dis.append(obj_back)
+ bg_dis = paddle.concat(x=bg_dis, axis=0)
+ return bg_dis
+
+
+WRONG_LABEL_PADDING_DISTANCE = 5e4
+
+
+#GLOBAL_DIST_MAP
+def _pairwise_distances(x, x2, y, y2):
+ """
+ Computes pairwise squared l2 distances between tensors x and y.
+ Args:
+ x: [n, feature_dim].
+ y: [m, feature_dim].
+ Returns:
+ d: [n, m].
+ """
+ xs = x2
+ ys = y2
+
+ xs = paddle.unsqueeze(xs, axis=1)
+ ys = paddle.unsqueeze(ys, axis=0)
+ d = xs + ys - 2. * paddle.matmul(x, y, transpose_y=True)
+ return d
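The helper relies on the usual expansion of the squared Euclidean distance, ||q - r||^2 = ||q||^2 + ||r||^2 - 2 q.r, with the squared norms precomputed by the caller. A quick stand-alone check of that identity on random data:

```python
import paddle

q = paddle.rand([5, 8])                       # 5 query vectors, dim 8
r = paddle.rand([7, 8])                       # 7 reference vectors, dim 8
q2 = paddle.sum(q * q, axis=1)                # ||q||^2, shape [5]
r2 = paddle.sum(r * r, axis=1)                # ||r||^2, shape [7]
d = q2.unsqueeze(1) + r2.unsqueeze(0) - 2. * paddle.matmul(q, r, transpose_y=True)

brute = paddle.sum((q.unsqueeze(1) - r.unsqueeze(0)) ** 2, axis=-1)
print(float(paddle.max(paddle.abs(d - brute))))   # ~0 up to float error
```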
+
+
+def _flattened_pairwise_distances(reference_embeddings, ref_square,
+ query_embeddings, query_square):
+ """
+ Calculates flattened tensor of pairwise distances between ref and query.
+ Args:
+ reference_embeddings: [..., embedding_dim],
+ the embedding vectors for the reference frame
+ query_embeddings: [..., embedding_dim],
+ the embedding vectors for the query frames.
+ Returns:
+ dists: [reference_embeddings.size / embedding_dim, query_embeddings.size / embedding_dim]
+ """
+ dists = _pairwise_distances(query_embeddings, query_square,
+ reference_embeddings, ref_square)
+ return dists
+
+
+def _nn_features_per_object_for_chunk(reference_embeddings, ref_square,
+ query_embeddings, query_square,
+ wrong_label_mask):
+ """Extracts features for each object using nearest neighbor attention.
+ Args:
+ reference_embeddings: [n_chunk, embedding_dim],
+ the embedding vectors for the reference frame.
+ query_embeddings: [m_chunk, embedding_dim],
+ the embedding vectors for the query frames.
+ wrong_label_mask: [n_objects, n_chunk],
+ the mask for pixels not used for matching.
+ Returns:
+ nn_features: A float32 tensor of nearest neighbor features of shape
+ [m_chunk, n_objects, n_chunk].
+ """
+ if reference_embeddings.dtype == "float16":
+ wrong_label_mask = paddle.cast(wrong_label_mask, dtype="float16")
+ else:
+ wrong_label_mask = paddle.cast(wrong_label_mask, dtype="float32")
+
+ reference_embeddings_key = reference_embeddings
+ query_embeddings_key = query_embeddings
+ dists = _flattened_pairwise_distances(reference_embeddings_key, ref_square,
+ query_embeddings_key, query_square)
+ dists = (paddle.unsqueeze(dists, axis=1) +
+ paddle.unsqueeze(wrong_label_mask, axis=0) *
+ WRONG_LABEL_PADDING_DISTANCE)
+ features = paddle.min(dists, axis=2, keepdim=True)
+ return features
+
+
+def _nearest_neighbor_features_per_object_in_chunks(reference_embeddings_flat,
+ query_embeddings_flat,
+ reference_labels_flat,
+ n_chunks):
+ """Calculates the nearest neighbor features per object in chunks to save mem.
+ Uses chunking to bound the memory use.
+ Args:
+ reference_embeddings_flat: [n, embedding_dim],
+ the embedding vectors for the reference frame.
+ query_embeddings_flat: [m, embedding_dim],
+ the embedding vectors for the query frames.
+ reference_labels_flat: [n, n_objects],
+ the class labels of the reference frame.
+ n_chunks: Integer, the number of chunks to use to save memory
+ (set to 1 for no chunking).
+ Returns:
+ nn_features: [m, n_objects, n].
+ """
+
+ feature_dim, embedding_dim = query_embeddings_flat.shape
+ chunk_size = int(np.ceil(float(feature_dim) / n_chunks))
+ wrong_label_mask = reference_labels_flat < 0.1
+
+ wrong_label_mask = paddle.transpose(x=wrong_label_mask, perm=[1, 0])
+ ref_square = paddle.sum(paddle.pow(reference_embeddings_flat, 2), axis=1)
+ query_square = paddle.sum(paddle.pow(query_embeddings_flat, 2), axis=1)
+
+ all_features = []
+ for n in range(n_chunks):
+ if n_chunks == 1:
+ query_embeddings_flat_chunk = query_embeddings_flat
+ query_square_chunk = query_square
+ chunk_start = 0
+ else:
+ chunk_start = n * chunk_size
+ chunk_end = (n + 1) * chunk_size
+ query_square_chunk = query_square[chunk_start:chunk_end]
+ if query_square_chunk.shape[0] == 0:
+ continue
+ query_embeddings_flat_chunk = query_embeddings_flat[
+ chunk_start:chunk_end]
+ features = _nn_features_per_object_for_chunk(
+ reference_embeddings_flat, ref_square, query_embeddings_flat_chunk,
+ query_square_chunk, wrong_label_mask)
+ all_features.append(features)
+ if n_chunks == 1:
+ nn_features = all_features[0]
+ else:
+ nn_features = paddle.concat(all_features, axis=0)
+
+ return nn_features
+
+
+def global_matching(reference_embeddings,
+ query_embeddings,
+ reference_labels,
+ n_chunks=100,
+ dis_bias=0.,
+ ori_size=None,
+ atrous_rate=1,
+ use_float16=True,
+ atrous_obj_pixel_num=0):
+ """
+ Calculates the distance to the nearest neighbor per object.
+ For every pixel of query_embeddings calculate the distance to the
+ nearest neighbor in the (possibly subsampled) reference_embeddings per object.
+ Args:
+ reference_embeddings: [height, width, embedding_dim],
+ the embedding vectors for the reference frame.
+ query_embeddings: [height, width,
+ embedding_dim], the embedding vectors for the query frames.
+ reference_labels: [height, width, obj_nums],
+ the class labels of the reference frame.
+ n_chunks: Integer, the number of chunks to use to save memory
+ (set to 1 for no chunking).
+ dis_bias: [n_objects], foreground and background bias
+ ori_size: (ori_height, ori_width),
+ the original spatial size. If "None", (ori_height, ori_width) = (height, width).
+ atrous_rate: Integer, the atrous rate of reference_embeddings.
+ use_float16: Bool, if "True", use float16 type for matching.
+ Returns:
+ nn_features: [1, ori_height, ori_width, n_objects, feature_dim].
+ """
+
+ assert (reference_embeddings.shape[:2] == reference_labels.shape[:2])
+ if use_float16:
+ query_embeddings = paddle.cast(query_embeddings, dtype="float16")
+ reference_embeddings = paddle.cast(reference_embeddings,
+ dtype="float16")
+ h, w, embedding_dim = query_embeddings.shape
+ obj_nums = reference_labels.shape[2]
+
+ if atrous_rate > 1:
+ h_pad = (atrous_rate - h % atrous_rate) % atrous_rate
+ w_pad = (atrous_rate - w % atrous_rate) % atrous_rate
+ selected_points = paddle.zeros([h + h_pad, w + w_pad])
+        # paddle tensors have no torch-style .view(); mirror the reshape used in
+        # global_matching_for_eval below
+        selected_points = paddle.reshape(
+            selected_points, [(h + h_pad) // atrous_rate, atrous_rate,
+                              (w + w_pad) // atrous_rate, atrous_rate])
+ selected_points[:, 0, :, 0] = 1.
+ selected_points = paddle.reshape(selected_points,
+ [h + h_pad, w + w_pad, 1])[:h, :w]
+ is_big_obj = (paddle.sum(
+ reference_labels,
+ axis=(0, 1))) > (atrous_obj_pixel_num * atrous_rate**2)
+ reference_labels[:, :,
+ is_big_obj] = reference_labels[:, :,
+ is_big_obj] * selected_points
+
+ reference_embeddings_flat = paddle.reshape(reference_embeddings,
+ [-1, embedding_dim])
+ reference_labels_flat = paddle.reshape(reference_labels, [-1, obj_nums])
+ query_embeddings_flat = paddle.reshape(query_embeddings,
+ [-1, embedding_dim])
+
+ all_ref_fg = paddle.sum(reference_labels_flat, axis=1, keepdim=True) > 0.9
+ reference_labels_flat = paddle.reshape(
+ paddle.masked_select(reference_labels_flat,
+ paddle.expand(all_ref_fg, [-1, obj_nums])),
+ [-1, obj_nums])
+ if reference_labels_flat.shape[0] == 0:
+ return paddle.ones([1, h, w, obj_nums, 1])
+ reference_embeddings_flat = paddle.reshape(
+ paddle.masked_select(reference_embeddings_flat,
+ paddle.expand(all_ref_fg, [-1, embedding_dim])),
+ [-1, embedding_dim])
+
+ nn_features = _nearest_neighbor_features_per_object_in_chunks(
+ reference_embeddings_flat, query_embeddings_flat, reference_labels_flat,
+ n_chunks)
+
+ nn_features_reshape = paddle.reshape(nn_features, [1, h, w, obj_nums, 1])
+ nn_features_reshape = (
+ F.sigmoid(nn_features_reshape +
+ paddle.reshape(dis_bias, [1, 1, 1, -1, 1])) - 0.5) * 2
+
+ #TODO: ori_size is not None
+
+ if use_float16:
+ nn_features_reshape = paddle.cast(nn_features_reshape, dtype="float32")
+ return nn_features_reshape
+
+
+def global_matching_for_eval(all_reference_embeddings,
+ query_embeddings,
+ all_reference_labels,
+ n_chunks=20,
+ dis_bias=0.,
+ ori_size=None,
+ atrous_rate=1,
+ use_float16=True,
+ atrous_obj_pixel_num=0):
+ """
+ Calculates the distance to the nearest neighbor per object.
+ For every pixel of query_embeddings calculate the distance to the
+ nearest neighbor in the (possibly subsampled) reference_embeddings per object.
+ Args:
+ all_reference_embeddings: A list of reference_embeddings,
+ each with size [height, width, embedding_dim],
+ the embedding vectors for the reference frame.
+ query_embeddings: [n_query_images, height, width,
+ embedding_dim], the embedding vectors for the query frames.
+ all_reference_labels: A list of reference_labels,
+ each with size [height, width, obj_nums],
+ the class labels of the reference frame.
+ n_chunks: Integer, the number of chunks to use to save memory
+ (set to 1 for no chunking).
+ dis_bias: [n_objects], foreground and background bias
+ ori_size: (ori_height, ori_width),
+ the original spatial size. If "None", (ori_height, ori_width) = (height, width).
+ atrous_rate: Integer, the atrous rate of reference_embeddings.
+ use_float16: Bool, if "True", use float16 type for matching.
+ Returns:
+ nn_features: [n_query_images, ori_height, ori_width, n_objects, feature_dim].
+ """
+
+ h, w, embedding_dim = query_embeddings.shape
+ obj_nums = all_reference_labels[0].shape[2]
+ all_reference_embeddings_flat = []
+ all_reference_labels_flat = []
+ ref_num = len(all_reference_labels)
+ n_chunks *= ref_num
+ if atrous_obj_pixel_num > 0:
+ if atrous_rate > 1:
+ h_pad = (atrous_rate - h % atrous_rate) % atrous_rate
+ w_pad = (atrous_rate - w % atrous_rate) % atrous_rate
+ selected_points = paddle.zeros([h + h_pad, w + w_pad])
+ selected_points = paddle.reshape(
+ selected_points, [(h + h_pad) // atrous_rate, atrous_rate,
+ (w + w_pad) // atrous_rate, atrous_rate])
+ selected_points[:, 0, :, 0] = 1.
+ selected_points = paddle.reshape(selected_points,
+ [h + h_pad, w + w_pad, 1])[:h, :w]
+
+ for reference_embeddings, reference_labels, idx in zip(
+ all_reference_embeddings, all_reference_labels, range(ref_num)):
+ if atrous_rate > 1:
+ is_big_obj = paddle.sum(
+ reference_labels,
+ axis=(0, 1)) > (atrous_obj_pixel_num * atrous_rate**2)
+ is_big_obj = list(np.array(is_big_obj))
+ for j in range(len(is_big_obj)):
+                    if is_big_obj[j]:
+ reference_labels[:, :, j:j +
+ 1] = reference_labels[:, :, j:j +
+ 1] * selected_points
+
+ reference_embeddings_flat = paddle.reshape(reference_embeddings,
+ [-1, embedding_dim])
+ reference_labels_flat = paddle.reshape(reference_labels,
+ [-1, obj_nums])
+
+ all_reference_embeddings_flat.append(reference_embeddings_flat)
+ all_reference_labels_flat.append(reference_labels_flat)
+
+ reference_embeddings_flat = paddle.concat(
+ x=all_reference_embeddings_flat, axis=0)
+ reference_labels_flat = paddle.concat(x=all_reference_labels_flat,
+ axis=0)
+ else:
+ if ref_num == 1:
+ reference_embeddings, reference_labels = all_reference_embeddings[
+ 0], all_reference_labels[0]
+ if atrous_rate > 1:
+ h_pad = (atrous_rate - h % atrous_rate) % atrous_rate
+ w_pad = (atrous_rate - w % atrous_rate) % atrous_rate
+ if h_pad > 0 or w_pad > 0:
+ reference_embeddings = F.pad(reference_embeddings,
+ [0, h_pad, 0, w_pad, 0, 0])
+ reference_labels = F.pad(reference_labels,
+ [0, h_pad, 0, w_pad, 0, 0])
+ reference_embeddings = paddle.reshape(
+ reference_embeddings,
+ [(h + h_pad) // atrous_rate, atrous_rate,
+ (w + w_pad) // atrous_rate, atrous_rate, 32])
+ reference_labels = paddle.reshape(
+ reference_labels,
+ [(h + h_pad) // atrous_rate, atrous_rate,
+ (w + w_pad) // atrous_rate, atrous_rate, -1])
+ reference_embeddings = paddle.reshape(
+ reference_embeddings[:, 0, :, 0, :],
+ reference_embeddings[:, 0, :, 0, :].shape)
+ reference_labels = paddle.reshape(
+ reference_labels[:, 0, :, 0, :],
+ reference_labels[:, 0, :, 0, :].shape)
+ reference_embeddings_flat = paddle.reshape(reference_embeddings,
+ [-1, embedding_dim])
+ reference_labels_flat = paddle.reshape(reference_labels,
+ [-1, obj_nums])
+ else:
+ for reference_embeddings, reference_labels, idx in zip(
+ all_reference_embeddings, all_reference_labels,
+ range(ref_num)):
+ if atrous_rate > 1:
+ h_pad = (atrous_rate - h % atrous_rate) % atrous_rate
+ w_pad = (atrous_rate - w % atrous_rate) % atrous_rate
+ if h_pad > 0 or w_pad > 0:
+ reference_embeddings = F.pad(reference_embeddings,
+ [0, h_pad, 0, w_pad, 0, 0])
+ reference_labels = F.pad(reference_labels,
+ [0, h_pad, 0, w_pad, 0, 0])
+
+ reference_embeddings = paddle.reshape(
+ reference_embeddings,
+ [(h + h_pad) // atrous_rate, atrous_rate,
+ (w + w_pad) // atrous_rate, atrous_rate, -1])
+ reference_labels = paddle.reshape(
+ reference_labels,
+ [(h + h_pad) // atrous_rate, atrous_rate,
+ (w + w_pad) // atrous_rate, atrous_rate, -1])
+ reference_embeddings = paddle.reshape(
+ reference_embeddings[:, 0, :, 0, :],
+ reference_embeddings[:, 0, :, 0, :].shape)
+ reference_labels = paddle.reshape(
+ reference_labels[:, 0, :, 0, :],
+ reference_labels[:, 0, :, 0, :].shape)
+
+ reference_embeddings_flat = paddle.reshape(
+ reference_embeddings, [-1, embedding_dim])
+ reference_labels_flat = paddle.reshape(reference_labels,
+ [-1, obj_nums])
+
+ all_reference_embeddings_flat.append(reference_embeddings_flat)
+ all_reference_labels_flat.append(reference_labels_flat)
+
+ reference_embeddings_flat = paddle.concat(
+ all_reference_embeddings_flat, axis=0)
+ reference_labels_flat = paddle.concat(all_reference_labels_flat,
+ axis=0)
+
+ query_embeddings_flat = paddle.reshape(query_embeddings,
+ [-1, embedding_dim])
+
+ all_ref_fg = paddle.sum(reference_labels_flat, axis=1, keepdim=True) > 0.9
+ reference_labels_flat = paddle.reshape(
+ paddle.masked_select(reference_labels_flat,
+ paddle.expand(all_ref_fg, [-1, obj_nums])),
+ [-1, obj_nums])
+ if reference_labels_flat.shape[0] == 0:
+ return paddle.ones([1, h, w, obj_nums, 1])
+ reference_embeddings_flat = paddle.reshape(
+ paddle.masked_select(reference_embeddings_flat,
+ paddle.expand(all_ref_fg, [-1, embedding_dim])),
+ [-1, embedding_dim])
+ if use_float16:
+ query_embeddings_flat = paddle.cast(query_embeddings_flat,
+ dtype="float16")
+ reference_embeddings_flat = paddle.cast(reference_embeddings_flat,
+ dtype="float16")
+ nn_features = _nearest_neighbor_features_per_object_in_chunks(
+ reference_embeddings_flat, query_embeddings_flat, reference_labels_flat,
+ n_chunks)
+
+ nn_features_reshape = paddle.reshape(nn_features, [1, h, w, obj_nums, 1])
+ nn_features_reshape = (
+ F.sigmoid(nn_features_reshape +
+ paddle.reshape(dis_bias, [1, 1, 1, -1, 1])) - 0.5) * 2
+
+ # TODO: ori_size is not None
+
+ if use_float16:
+ nn_features_reshape = paddle.cast(nn_features_reshape, dtype="float32")
+ return nn_features_reshape
+
+
+#LOCAL_DIST_MAP
+def local_pairwise_distances(x,
+ y,
+ max_distance=9,
+ atrous_rate=1,
+ allow_downsample=False):
+ """Computes pairwise squared l2 distances using a local search window.
+ Use for-loop for saving memory.
+ Args:
+ x: Float32 tensor of shape [height, width, feature_dim].
+ y: Float32 tensor of shape [height, width, feature_dim].
+ max_distance: Integer, the maximum distance in pixel coordinates
+ per dimension which is considered to be in the search window.
+ atrous_rate: Integer, the atrous rate of local matching.
+ allow_downsample: Bool, if "True", downsample x and y
+ with a stride of 2.
+ Returns:
+ Float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2].
+ """
+ if allow_downsample:
+ ori_height = x.shape[0]
+ ori_width = x.shape[1]
+ x = paddle.unsqueeze(paddle.transpose(x, [2, 0, 1]), axis=0)
+ y = paddle.unsqueeze(paddle.transpose(y, [2, 0, 1]), axis=0)
+ down_size = (int(ori_height / 2) + 1, int(ori_width / 2) + 1)
+ x = F.interpolate(x,
+ size=down_size,
+ mode='bilinear',
+ align_corners=True)
+ y = F.interpolate(y,
+ size=down_size,
+ mode='bilinear',
+ align_corners=True)
+        # back to [height, width, channels] after the NCHW interpolation
+        x = paddle.squeeze(paddle.transpose(x, [0, 2, 3, 1]), axis=0)
+        y = paddle.squeeze(paddle.transpose(y, [0, 2, 3, 1]), axis=0)
+
+ pad_max_distance = max_distance - max_distance % atrous_rate
+ # no change pad
+ padded_y = F.pad(y, (0, 0, pad_max_distance, pad_max_distance,
+ pad_max_distance, pad_max_distance),
+ value=WRONG_LABEL_PADDING_DISTANCE)
+
+ height, width, _ = x.shape
+    dists = []
+    # dy/dx index the spatial offsets; the query embedding tensor x is kept for the
+    # channel-wise squared-distance computation
+    for dy in range(2 * pad_max_distance // atrous_rate + 1):
+        y_start = dy * atrous_rate
+        y_end = y_start + height
+        y_slice = padded_y[y_start:y_end]
+        for dx in range(2 * pad_max_distance // atrous_rate + 1):
+            x_start = dx * atrous_rate
+            x_end = x_start + width
+            offset_y = y_slice[:, x_start:x_end]
+            dist = paddle.sum(paddle.pow((x - offset_y), 2), axis=2)
+            dists.append(dist)
+    dists = paddle.stack(dists, axis=2)
+
+ return dists
+
+
+def local_pairwise_distances_parallel(x,
+ y,
+ max_distance=9,
+ atrous_rate=1,
+ allow_downsample=True):
+ """Computes pairwise squared l2 distances using a local search window.
+ Args:
+ x: Float32 tensor of shape [height, width, feature_dim].
+ y: Float32 tensor of shape [height, width, feature_dim].
+ max_distance: Integer, the maximum distance in pixel coordinates
+ per dimension which is considered to be in the search window.
+ atrous_rate: Integer, the atrous rate of local matching.
+ allow_downsample: Bool, if "True", downsample x and y
+ with a stride of 2.
+ Returns:
+ Float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2].
+ """
+
+ ori_height, ori_width, _ = x.shape
+ x = paddle.unsqueeze(paddle.transpose(x, [2, 0, 1]), axis=0)
+ y = paddle.unsqueeze(paddle.transpose(y, [2, 0, 1]), axis=0)
+ if allow_downsample:
+ down_size = (int(ori_height / 2) + 1, int(ori_width / 2) + 1)
+ x = F.interpolate(x,
+ size=down_size,
+ mode='bilinear',
+ align_corners=True)
+ y = F.interpolate(y,
+ size=down_size,
+ mode='bilinear',
+ align_corners=True)
+
+ _, channels, height, width = x.shape
+
+ x2 = paddle.reshape(paddle.sum(paddle.pow(x, 2), axis=1),
+ [height, width, 1])
+ y2 = paddle.reshape(paddle.sum(paddle.pow(y, 2), axis=1),
+ [1, 1, height, width])
+
+ pad_max_distance = max_distance - max_distance % atrous_rate
+ # no change pad
+ padded_y = F.pad(y, (pad_max_distance, pad_max_distance, pad_max_distance,
+ pad_max_distance))
+ padded_y2 = F.pad(y2, (pad_max_distance, pad_max_distance, pad_max_distance,
+ pad_max_distance),
+ value=WRONG_LABEL_PADDING_DISTANCE)
+
+ offset_y = paddle.transpose(
+ paddle.reshape(
+ F.unfold(x=padded_y,
+ kernel_sizes=[height, width],
+ strides=[atrous_rate, atrous_rate]),
+ [channels, height * width, -1]), [1, 0, 2])
+ offset_y2 = paddle.reshape(
+ F.unfold(padded_y2,
+ kernel_sizes=[height, width],
+ strides=[atrous_rate, atrous_rate]), [height, width, -1])
+ x = paddle.transpose(paddle.reshape(x, [channels, height * width, -1]),
+ [1, 2, 0])
+
+ dists = x2 + offset_y2 - 2. * paddle.reshape(paddle.matmul(x, offset_y),
+ [height, width, -1])
+
+ return dists
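`local_pairwise_distances_parallel` collects all spatial offsets in one shot: the previous-frame map is padded and then unfolded with a kernel as large as the feature map itself, so each unfolded column is the whole map shifted by one offset. A minimal illustration of that trick on toy sizes:

```python
import paddle
import paddle.nn.functional as F

y = paddle.rand([1, 4, 6, 6])                          # toy [N, C, H, W] feature map
pad = 2
padded = F.pad(y, [pad, pad, pad, pad])                # -> [1, 4, 10, 10]
cols = F.unfold(padded, kernel_sizes=[6, 6], strides=[1, 1])
print(cols.shape)                                      # [1, 144, 25] -> 25 spatial offsets
```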
+
+
+def local_matching(prev_frame_embedding,
+ query_embedding,
+ prev_frame_labels,
+ dis_bias=0.,
+ multi_local_distance=[15],
+ ori_size=None,
+ atrous_rate=1,
+ use_float16=True,
+ allow_downsample=True,
+ allow_parallel=True):
+ """Computes nearest neighbor features while only allowing local matches.
+ Args:
+ prev_frame_embedding: [height, width, embedding_dim],
+ the embedding vectors for the last frame.
+ query_embedding: [height, width, embedding_dim],
+ the embedding vectors for the query frames.
+ prev_frame_labels: [height, width, n_objects],
+ the class labels of the previous frame.
+ multi_local_distance: A list of Integer,
+ a list of maximum distance allowed for local matching.
+ ori_size: (ori_height, ori_width),
+ the original spatial size. If "None", (ori_height, ori_width) = (height, width).
+ atrous_rate: Integer, the atrous rate of local matching.
+ use_float16: Bool, if "True", use float16 type for matching.
+ allow_downsample: Bool, if "True", downsample prev_frame_embedding and query_embedding
+ with a stride of 2.
+ allow_parallel: Bool, if "True", do matching in a parallel way. If "False", do matching in
+ a for-loop way, which will save GPU memory.
+ Returns:
+ nn_features: A float32 np.array of nearest neighbor features of shape
+ [1, height, width, n_objects, 1].
+ """
+ max_distance = multi_local_distance[-1]
+
+ if ori_size is None:
+ height, width = prev_frame_embedding.shape[:2]
+ ori_size = (height, width)
+
+ obj_num = prev_frame_labels.shape[2]
+ pad = paddle.ones([1]) * WRONG_LABEL_PADDING_DISTANCE
+ if use_float16:
+ query_embedding = paddle.cast(query_embedding, dtype="float16")
+ prev_frame_embedding = paddle.cast(prev_frame_embedding,
+ dtype="float16")
+ pad = paddle.cast(pad, dtype="float16")
+
+ if allow_parallel:
+ d = local_pairwise_distances_parallel(query_embedding,
+ prev_frame_embedding,
+ max_distance=max_distance,
+ atrous_rate=atrous_rate,
+ allow_downsample=allow_downsample)
+ else:
+ d = local_pairwise_distances(query_embedding,
+ prev_frame_embedding,
+ max_distance=max_distance,
+ atrous_rate=atrous_rate,
+ allow_downsample=allow_downsample)
+
+ height, width = d.shape[:2]
+
+ labels = paddle.unsqueeze(paddle.transpose(prev_frame_labels, [2, 0, 1]),
+ axis=1)
+ if (height, width) != ori_size:
+ labels = F.interpolate(labels, size=(height, width), mode='nearest')
+
+ pad_max_distance = max_distance - max_distance % atrous_rate
+ atrous_max_distance = pad_max_distance // atrous_rate
+ #no change pad
+ padded_labels = F.pad(labels, (
+ pad_max_distance,
+ pad_max_distance,
+ pad_max_distance,
+ pad_max_distance,
+ ),
+ mode='constant',
+ value=0)
+
+ offset_masks = paddle.transpose(
+ paddle.reshape(
+ F.unfold(padded_labels,
+ kernel_sizes=[height, width],
+ strides=[atrous_rate, atrous_rate]),
+ [obj_num, height, width, -1]), [1, 2, 3, 0]) > 0.9
+
+ d_tiled = paddle.expand(paddle.unsqueeze(
+ d, axis=-1), [-1, -1, -1, obj_num]) # h, w, num_local_pos, obj_num
+
+ d_masked = paddle.where(offset_masks, d_tiled, pad)
+ dists = paddle.min(d_masked, axis=2)
+ multi_dists = [
+ paddle.unsqueeze(paddle.transpose(dists, [2, 0, 1]), axis=1)
+ ] # n_objects, num_multi_local, h, w
+
+ reshaped_d_masked = paddle.reshape(d_masked, [
+ height, width, 2 * atrous_max_distance + 1, 2 * atrous_max_distance + 1,
+ obj_num
+ ])
+ for local_dis in multi_local_distance[:-1]:
+ local_dis = local_dis // atrous_rate
+ start_idx = atrous_max_distance - local_dis
+ end_idx = atrous_max_distance + local_dis + 1
+ new_d_masked = paddle.reshape(
+ reshaped_d_masked[:, :, start_idx:end_idx, start_idx:end_idx, :],
+ reshaped_d_masked[:, :, start_idx:end_idx,
+ start_idx:end_idx, :].shape)
+ new_d_masked = paddle.reshape(new_d_masked,
+ [height, width, -1, obj_num])
+ new_dists = paddle.min(new_d_masked, axis=2)
+ new_dists = paddle.unsqueeze(paddle.transpose(new_dists, [2, 0, 1]),
+ axis=1)
+ multi_dists.append(new_dists)
+
+ multi_dists = paddle.concat(multi_dists, axis=1)
+ multi_dists = (F.sigmoid(multi_dists +
+ paddle.reshape(dis_bias, [-1, 1, 1, 1])) - 0.5) * 2
+
+ if use_float16:
+ multi_dists = paddle.cast(multi_dists, dtype="float32")
+
+ if (height, width) != ori_size:
+ multi_dists = F.interpolate(multi_dists,
+ size=ori_size,
+ mode='bilinear',
+ align_corners=True)
+ multi_dists = paddle.transpose(multi_dists, perm=[2, 3, 0, 1])
+ multi_dists = paddle.reshape(multi_dists,
+ [1, ori_size[0], ori_size[1], obj_num, -1])
+
+ return multi_dists
+
+
+def calculate_attention_head(ref_embedding,
+ ref_label,
+ prev_embedding,
+ prev_label,
+ epsilon=1e-5):
+
+ ref_head = ref_embedding * ref_label
+ ref_head_pos = paddle.sum(ref_head, axis=(2, 3))
+ ref_head_neg = paddle.sum(ref_embedding, axis=(2, 3)) - ref_head_pos
+ ref_pos_num = paddle.sum(ref_label, axis=(2, 3))
+ ref_neg_num = paddle.sum(1. - ref_label, axis=(2, 3))
+ ref_head_pos = ref_head_pos / (ref_pos_num + epsilon)
+ ref_head_neg = ref_head_neg / (ref_neg_num + epsilon)
+
+ prev_head = prev_embedding * prev_label
+ prev_head_pos = paddle.sum(prev_head, axis=(2, 3))
+ prev_head_neg = paddle.sum(prev_embedding, axis=(2, 3)) - prev_head_pos
+ prev_pos_num = paddle.sum(prev_label, axis=(2, 3))
+ prev_neg_num = paddle.sum(1. - prev_label, axis=(2, 3))
+ prev_head_pos = prev_head_pos / (prev_pos_num + epsilon)
+ prev_head_neg = prev_head_neg / (prev_neg_num + epsilon)
+
+ total_head = paddle.concat(
+ x=[ref_head_pos, ref_head_neg, prev_head_pos, prev_head_neg], axis=1)
+
+ return total_head
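`calculate_attention_head` reduces each embedding to masked channel means: a foreground average under the object mask and a background average under its complement, for both the reference and previous frames. A hedged sketch of that pooling for one embedding/mask pair with assumed shapes:

```python
import paddle

emb = paddle.rand([3, 16, 8, 8])                       # assumed [obj_num, C, H, W] embedding
mask = paddle.cast(paddle.rand([3, 1, 8, 8]) > 0.5, "float32")
eps = 1e-5

fg_sum = paddle.sum(emb * mask, axis=(2, 3))
fg = fg_sum / (paddle.sum(mask, axis=(2, 3)) + eps)                 # foreground channel mean
bg = (paddle.sum(emb, axis=(2, 3)) - fg_sum) / \
     (paddle.sum(1. - mask, axis=(2, 3)) + eps)                     # background channel mean
print(fg.shape, bg.shape)                              # [3, 16] [3, 16]
```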
+
+
+def calculate_attention_head_for_eval(ref_embeddings,
+ ref_labels,
+ prev_embedding,
+ prev_label,
+ epsilon=1e-5):
+ total_ref_head_pos = 0.
+ total_ref_head_neg = 0.
+ total_ref_pos_num = 0.
+ total_ref_neg_num = 0.
+
+ for idx in range(len(ref_embeddings)):
+ ref_embedding = ref_embeddings[idx]
+ ref_label = ref_labels[idx]
+ ref_head = ref_embedding * ref_label
+ ref_head_pos = paddle.sum(ref_head, axis=(2, 3))
+ ref_head_neg = paddle.sum(ref_embedding, axis=(2, 3)) - ref_head_pos
+ ref_pos_num = paddle.sum(ref_label, axis=(2, 3))
+ ref_neg_num = paddle.sum(1. - ref_label, axis=(2, 3))
+ total_ref_head_pos = total_ref_head_pos + ref_head_pos
+ total_ref_head_neg = total_ref_head_neg + ref_head_neg
+ total_ref_pos_num = total_ref_pos_num + ref_pos_num
+ total_ref_neg_num = total_ref_neg_num + ref_neg_num
+ ref_head_pos = total_ref_head_pos / (total_ref_pos_num + epsilon)
+ ref_head_neg = total_ref_head_neg / (total_ref_neg_num + epsilon)
+
+ prev_head = prev_embedding * prev_label
+ prev_head_pos = paddle.sum(prev_head, axis=(2, 3))
+ prev_head_neg = paddle.sum(prev_embedding, axis=(2, 3)) - prev_head_pos
+ prev_pos_num = paddle.sum(prev_label, axis=(2, 3))
+ prev_neg_num = paddle.sum(1. - prev_label, axis=(2, 3))
+ prev_head_pos = prev_head_pos / (prev_pos_num + epsilon)
+ prev_head_neg = prev_head_neg / (prev_neg_num + epsilon)
+
+ total_head = paddle.concat(
+ x=[ref_head_pos, ref_head_neg, prev_head_pos, prev_head_neg], axis=1)
+ return total_head
diff --git a/paddlevideo/modeling/framework/segmenters/__init__.py b/paddlevideo/modeling/framework/segmenters/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..de4bf573426a6b53831bbab6a5c59ab22df8e775
--- /dev/null
+++ b/paddlevideo/modeling/framework/segmenters/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from .base import BaseSegmenter
+from .ms_tcn import MSTCN
+from .asrf import ASRF
+
+__all__ = ['BaseSegmenter', 'MSTCN', 'ASRF']
diff --git a/paddlevideo/modeling/framework/segmenters/asrf.py b/paddlevideo/modeling/framework/segmenters/asrf.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d962c714e49e131078def93f6fd414d38af76c9
--- /dev/null
+++ b/paddlevideo/modeling/framework/segmenters/asrf.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from ...registry import SEGMENTERS
+from .base import BaseSegmenter
+
+import paddle
+import paddle.nn.functional as F
+from .utils import ASRFPostProcessing
+
+
+@SEGMENTERS.register()
+class ASRF(BaseSegmenter):
+ """ASRF model framework."""
+
+ def __init__(self,
+ postprocessing_method,
+ boundary_threshold,
+ backbone=None,
+ head=None,
+ loss=None):
+
+ super().__init__(backbone=backbone, head=head, loss=loss)
+ self.postprocessing_method = postprocessing_method
+ self.boundary_threshold = boundary_threshold
+
+ def forward_net(self, video_feature):
+ """Define how the model is going to train, from input to output.
+ """
+ if self.backbone is not None:
+ feature = self.backbone(video_feature)
+ else:
+ feature = video_feature
+
+ if self.head is not None:
+ network_outputs = self.head(feature)
+ else:
+ network_outputs = None
+
+ return network_outputs
+
+ def train_step(self, data_batch):
+ """Training step.
+ """
+ feature, label, boundary = data_batch
+ # call forward
+ outputs_cls, outputs_boundary = self.forward_net(feature)
+
+ # transfer data
+ outputs_cls_np = outputs_cls[-1].numpy()
+ outputs_boundary_np = outputs_boundary[-1].numpy()
+
+ # calculate loss
+ if self.loss is not None:
+ output_loss = self.loss(feature, outputs_cls, label,
+ outputs_boundary, boundary)
+ else:
+ output_loss = None
+
+ # predict post process
+ predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np,
+ self.postprocessing_method)
+ predicted = paddle.squeeze(predicted)
+
+ loss_metrics = dict()
+ loss_metrics['loss'] = output_loss
+ loss_metrics['F1@0.50'] = self.head.get_F1_score(predicted, label)
+
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ """Validating setp.
+ """
+ feature, label, boundary = data_batch
+
+ # call forward
+ outputs_cls, outputs_boundary = self.forward_net(feature)
+
+ # transfer data
+ outputs_cls_np = outputs_cls[-1].numpy()
+ outputs_boundary_np = outputs_boundary[-1].numpy()
+
+ # calculate loss
+ if self.loss is not None:
+ output_loss = self.loss(feature, outputs_cls, label,
+ outputs_boundary, boundary)
+ else:
+ output_loss = None
+
+ # predict post process
+ predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np,
+ self.postprocessing_method)
+ predicted = paddle.squeeze(predicted)
+
+ outputs_dict = dict()
+ outputs_dict['loss'] = output_loss
+ outputs_dict['F1@0.50'] = self.head.get_F1_score(predicted, label)
+ return outputs_dict
+
+ def test_step(self, data_batch):
+ """Testing setp.
+ """
+ feature, _, _ = data_batch
+
+ outputs_dict = dict()
+ # call forward
+ outputs_cls, outputs_boundary = self.forward_net(feature)
+ # transfer data
+ outputs_cls_np = outputs_cls[-1].numpy()
+ outputs_boundary_np = outputs_boundary[-1].numpy()
+
+ # predict post process
+ predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np,
+ self.postprocessing_method)
+ outputs_dict['predict'] = paddle.to_tensor(predicted[0, :])
+ outputs_dict['output_np'] = F.sigmoid(outputs_cls[-1])
+ return outputs_dict
+
+ def infer_step(self, data_batch):
+ """Infering setp.
+ """
+ feature = data_batch[0]
+
+ # call forward
+ outputs_cls, outputs_boundary = self.forward_net(feature)
+ # keep the last-stage outputs (no numpy conversion at inference time)
+ outputs_cls_np = outputs_cls[-1]
+ outputs_boundary_np = outputs_boundary[-1]
+
+ outputs = [
+ outputs_cls_np, outputs_boundary_np,
+ F.sigmoid(outputs_cls[-1])
+ ]
+ return outputs
diff --git a/paddlevideo/modeling/framework/segmenters/base.py b/paddlevideo/modeling/framework/segmenters/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0856d9ad24af228479eed7f30ec7ab6dd81172f
--- /dev/null
+++ b/paddlevideo/modeling/framework/segmenters/base.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from abc import abstractmethod
+from ... import builder
+import paddle.nn as nn
+
+
+class BaseSegmenter(nn.Layer):
+ """Base class for segementers.
+
+ All segementers should subclass it.
+ All subclass should overwrite:
+
+ - Methods:``train_step``, supporting to forward when training.
+ - Methods:``valid_step``, supporting to forward when validating.
+ - Methods:``test_step``, supporting to forward when testing.
+
+ Args:
+ backbone (dict): Backbone modules to extract feature.
+ head (dict): Classification head to process feature.
+
+ """
+
+ def __init__(self, backbone=None, head=None, loss=None):
+
+ super().__init__()
+ # build backbone
+ if backbone is not None:
+ self.backbone = builder.build_backbone(backbone)
+ if hasattr(self.backbone, 'init_weights'):
+ self.backbone.init_weights()
+ else:
+ self.backbone = None
+ # build head
+ if head is not None:
+ self.head_name = head.name
+ self.head = builder.build_head(head)
+ if hasattr(self.head, 'init_weights'):
+ self.head.init_weights()
+ else:
+ self.head = None
+ # build loss
+ if loss is not None:
+ self.loss_name = loss.name
+ self.loss = builder.build_loss(loss)
+ if hasattr(self.loss, 'init_weights'):
+ self.loss.init_weights()
+ else:
+ self.loss = None
+
+ def forward(self, data_batch, mode='infer'):
+ """
+ 1. Define how the model is going to run, from input to output.
+ 2. Console of train, valid, test or infer step
+ 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py
+ """
+ if mode == 'train':
+ return self.train_step(data_batch)
+ elif mode == 'valid':
+ return self.val_step(data_batch)
+ elif mode == 'test':
+ return self.test_step(data_batch)
+ elif mode == 'infer':
+ return self.infer_step(data_batch)
+ else:
+ raise NotImplementedError
+
+ @abstractmethod
+ def train_step(self, data_batch, **kwargs):
+ """Training step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def val_step(self, data_batch, **kwargs):
+ """Validating step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def test_step(self, data_batch, **kwargs):
+ """Test step.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def infer_step(self, data_batch, **kwargs):
+ """Infer step.
+ """
+ raise NotImplementedError
diff --git a/paddlevideo/modeling/framework/segmenters/ms_tcn.py b/paddlevideo/modeling/framework/segmenters/ms_tcn.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5982a7c990c2b505ceeea9f75a544ffcac9ba19
--- /dev/null
+++ b/paddlevideo/modeling/framework/segmenters/ms_tcn.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from ...registry import SEGMENTERS
+from .base import BaseSegmenter
+
+import paddle
+import paddle.nn.functional as F
+
+
+@SEGMENTERS.register()
+class MSTCN(BaseSegmenter):
+ """MS-TCN model framework."""
+
+ def forward_net(self, video_feature):
+ """Define how the model is going to train, from input to output.
+ """
+ if self.backbone is not None:
+ feature = self.backbone(video_feature)
+ else:
+ feature = video_feature
+
+ if self.head is not None:
+ cls_score = self.head(feature)
+ else:
+ cls_score = None
+
+ return cls_score
+
+ def train_step(self, data_batch):
+ """Training step.
+ """
+ video_feat, video_gt = data_batch
+
+ # call forward
+ output = self.forward_net(video_feat)
+ loss = 0.
+ for i in range(len(output)):
+ loss += self.head.loss(output[i], video_gt)
+
+ predicted = paddle.argmax(output[-1], axis=1)
+ predicted = paddle.squeeze(predicted)
+
+ loss_metrics = dict()
+ loss_metrics['loss'] = loss
+ loss_metrics['F1@0.50'] = self.head.get_F1_score(predicted, video_gt)
+ return loss_metrics
+
+ def val_step(self, data_batch):
+ """Validating setp.
+ """
+ video_feat, video_gt = data_batch
+
+ # call forward
+ output = self.forward_net(video_feat)
+ loss = 0.
+ for i in range(len(output)):
+ loss += self.head.loss(output[i], video_gt)
+
+ predicted = paddle.argmax(output[-1], axis=1)
+ predicted = paddle.squeeze(predicted)
+
+ outputs_dict = dict()
+ outputs_dict['loss'] = loss
+ outputs_dict['F1@0.50'] = self.head.get_F1_score(predicted, video_gt)
+ return outputs_dict
+
+ def test_step(self, data_batch):
+ """Testing setp.
+ """
+ video_feat, _ = data_batch
+
+ outputs_dict = dict()
+ # call forward
+ output = self.forward_net(video_feat)
+ predicted = paddle.argmax(output[-1], axis=1)
+ predicted = paddle.squeeze(predicted)
+ outputs_dict['predict'] = predicted
+ outputs_dict['output_np'] = F.sigmoid(output[-1])
+ return outputs_dict
+
+ def infer_step(self, data_batch):
+ """Infering setp.
+ """
+ video_feat = data_batch[0]
+
+ # call forward
+ output = self.forward_net(video_feat)
+ predicted = paddle.argmax(output[-1], axis=1)
+ predicted = paddle.squeeze(predicted)
+ output_np = F.sigmoid(output[-1])
+ return predicted, output_np
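+
+ # I/O sketch (shapes are assumptions for illustration): MS-TCN consumes a
+ # (N, in_channels, T) feature sequence; `infer_step` returns the squeezed
+ # argmax labels, i.e. shape (T,) for a single video, together with sigmoid
+ # scores of shape (N, num_classes, T) from the last refinement stage.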
diff --git a/paddlevideo/modeling/framework/segmenters/utils.py b/paddlevideo/modeling/framework/segmenters/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c21cbb869a0f598a1a35e5267f5187a0a1249d6
--- /dev/null
+++ b/paddlevideo/modeling/framework/segmenters/utils.py
@@ -0,0 +1,343 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+# https://github.com/yiskw713/asrf/libs/postprocess.py
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+import math
+
+
+class GaussianSmoothing(nn.Layer):
+ """
+ Apply gaussian smoothing on a 1d tensor.
+ Filtering is performed separately for each channel
+ in the input using a depthwise convolution; the number of
+ channels is inferred from the input at call time.
+ Arguments:
+ kernel_size (int): Size of the gaussian kernel.
+ sigma (float): Standard deviation of the gaussian kernel.
+ """
+
+ def __init__(self, kernel_size=15, sigma=1.0):
+ super().__init__()
+ self.kernel_size = kernel_size
+
+ # The gaussian kernel is the product of the
+ # gaussian function of each dimension.
+ kernel = 1
+ meshgrid = paddle.arange(kernel_size)
+
+ meshgrid = paddle.cast(meshgrid, dtype='float32')
+
+ mean = (kernel_size - 1) / 2
+ kernel = kernel / (sigma * math.sqrt(2 * math.pi))
+ kernel = kernel * paddle.exp(-(((meshgrid - mean) / sigma)**2) / 2)
+
+ # Make sure sum of values in gaussian kernel equals 1.
+ # kernel = kernel / paddle.max(kernel)
+
+ self.kernel = paddle.reshape(kernel, [1, 1, -1])
+
+ def forward(self, inputs):
+ """
+ Apply gaussian filter to input.
+ Arguments:
+ input (paddle.Tensor): Input to apply gaussian filter on.
+ Returns:
+ filtered (paddle.Tensor): Filtered output.
+ """
+ _, c, _ = inputs.shape
+ inputs = F.pad(inputs,
+ pad=((self.kernel_size - 1) // 2,
+ (self.kernel_size - 1) // 2),
+ mode="reflect",
+ data_format='NCL')
+
+ kernel = paddle.expand(self.kernel, shape=[c, 1, self.kernel_size])
+ return F.conv1d(inputs, weight=kernel, groups=c)
+
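+# Usage sketch (illustrative shapes only): smooth frame-wise class probabilities
+# of shape (N, C, T) with the depthwise gaussian filter; the reflect padding
+# keeps the temporal length unchanged.
+#
+#   smoother = GaussianSmoothing(kernel_size=15, sigma=1.0)
+#   probs = paddle.rand([1, 4, 100])   # assumed (N, C, T) input
+#   smoothed = smoother(probs)         # -> shape [1, 4, 100]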
+
+def argrelmax(prob, threshold=0.7):
+ """
+ Calculate arguments of relative maxima.
+ prob: np.array. boundary probability map distributed in [0, 1]
+ prob shape is (T)
+ peaks whose value is under the threshold are ignored
+
+ Return:
+ Indices of the peaks
+ """
+ # ignore the values under threshold
+ prob[prob < threshold] = 0.0
+
+ # calculate the relative maxima of boundary maps
+ # treat the first frame as boundary
+ peak = np.concatenate(
+ [
+ np.ones((1), dtype=bool),
+ (prob[:-2] < prob[1:-1]) & (prob[2:] < prob[1:-1]),
+ np.zeros((1), dtype=bool),
+ ],
+ axis=0,
+ )
+
+ peak_idx = np.where(peak)[0].tolist()
+
+ return peak_idx
+
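+# Example (illustrative): relative maxima of a toy boundary-probability curve.
+# Note that argrelmax modifies its input in place, hence the copy.
+#
+#   prob = np.array([0.9, 0.2, 0.8, 0.3, 0.1, 0.75, 0.1])
+#   argrelmax(prob.copy(), threshold=0.7)   # -> [0, 2, 5]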
+
+def is_probability(x):
+ assert x.ndim == 3
+
+ if x.shape[1] == 1:
+ # sigmoid
+ if x.min() >= 0 and x.max() <= 1:
+ return True
+ else:
+ return False
+ else:
+ # softmax
+ _sum = np.sum(x, axis=1).astype(np.float32)
+ _ones = np.ones_like(_sum, dtype=np.float32)
+ return np.allclose(_sum, _ones)
+
+
+def convert2probability(x):
+ """
+ Args: x (N, C, T)
+ """
+ assert x.ndim == 3
+
+ if is_probability(x):
+ return x
+ else:
+ if x.shape[1] == 1:
+ # sigmoid
+ prob = 1 / (1 + np.exp(-x))
+ else:
+ # softmax
+ prob = np.exp(x) / np.sum(np.exp(x), axis=1)
+ return prob.astype(np.float32)
+
+
+def convert2label(x):
+ assert x.ndim == 2 or x.ndim == 3
+
+ if x.ndim == 2:
+ return x.astype(np.int64)
+ else:
+ if not is_probability(x):
+ x = convert2probability(x)
+
+ label = np.argmax(x, axis=1)
+ return label.astype(np.int64)
+
+
+def refinement_with_boundary(outputs, boundaries, boundary_threshold):
+ """
+ Get segments, which are defined as the spans between two boundaries,
+ and decide their classes by majority vote.
+ Args:
+ outputs: numpy array. shape (N, C, T)
+ the model output for frame-level class prediction.
+ boundaries: numpy array. shape (N, 1, T)
+ boundary prediction.
+ boundary_threshold: threshold on the boundary probability used for peak picking. float(default=0.7)
+ Return:
+ preds: np.array. shape (N, T)
+ final class prediction considering boundaries.
+ """
+
+ preds = convert2label(outputs)
+ boundaries = convert2probability(boundaries)
+
+ for i, (output, pred, boundary) in enumerate(zip(outputs, preds,
+ boundaries)):
+ idx = argrelmax(boundary[0, :], threshold=boundary_threshold)
+
+ # add the index of the last action ending
+ T = pred.shape[0]
+ idx.append(T)
+
+ # majority vote
+ for j in range(len(idx) - 1):
+ count = np.bincount(pred[idx[j]:idx[j + 1]])
+ modes = np.where(count == count.max())[0]
+ if len(modes) == 1:
+ mode = modes
+ else:
+ if outputs.ndim == 3:
+ # if more than one majority class exist
+ prob_sum_max = 0
+ for m in modes:
+ prob_sum = output[m, idx[j]:idx[j + 1]].sum()
+ if prob_sum_max < prob_sum:
+ mode = m
+ prob_sum_max = prob_sum
+ else:
+ # decide first mode when more than one majority class
+ # have the same number during oracle experiment
+ mode = modes[0]
+
+ preds[i, idx[j]:idx[j + 1]] = mode
+ return preds
+
+
+def relabeling(outputs, theta_t):
+ """
+ Relabeling small action segments with their previous action segment
+ Args:
+ outputs: the results of action segmentation. (N, T) or (N, C, T)
+ theta_t: the threshold of the size of action segments.
+ Return:
+ relabeled output. (N, T)
+ """
+
+ preds = convert2label(outputs)
+
+ for i in range(preds.shape[0]):
+ # shape (T,)
+ last = preds[i][0]
+ cnt = 1
+ for j in range(1, preds.shape[1]):
+ if last == preds[i][j]:
+ cnt += 1
+ else:
+ if cnt > theta_t:
+ cnt = 1
+ last = preds[i][j]
+ else:
+ preds[i][j - cnt:j] = preds[i][j - cnt - 1]
+ cnt = 1
+ last = preds[i][j]
+
+ if cnt <= theta_t:
+ preds[i][j - cnt:j] = preds[i][j - cnt - 1]
+
+ return preds
+
+
+def smoothing(outputs, filter_func):
+ """
+ Smoothing action probabilities with gaussian filter.
+ Args:
+ outputs: frame-wise action probabilities. (N, C, T)
+ Return:
+ predictions: final prediction. (N, T)
+ """
+
+ outputs = convert2probability(outputs)
+ outputs = filter_func(paddle.to_tensor(outputs)).numpy()
+
+ preds = convert2label(outputs)
+ return preds
+
+
+def ASRFPostProcessing(outputs_cls,
+ outputs_boundary,
+ refinement_method,
+ boundary_threshold=0.7,
+ theta_t=15,
+ kernel_size=15):
+ """
+ ASRF post-processing refines the frame-level predictions with the boundary branch.
+ Args:
+ outputs_cls: the results of action segmentation. (N, T) or (N, C, T)
+ outputs_boundary: action boundary probability. (N, 1, T)
+ refinement_method: how to refine the predicted boundaries and classification. str
+ boundary_threshold: threshold on the boundary probability used for peak picking. float(default=0.7)
+ theta_t: the threshold of the size of action segments. int(default=15)
+ kernel_size: Size of the gaussian kernel. int(default=15)
+ Return:
+ preds: np.array. shape (N, T), the refined prediction.
+ """
+ func = [
+ "refinement_with_boundary",
+ "relabeling",
+ "smoothing",
+ ]
+
+ if refinement_method == "smoothing":
+ filter_func = GaussianSmoothing(kernel_size)
+ preds = smoothing(outputs_cls, filter_func)
+ elif refinement_method == "relabeling":
+ preds = relabeling(outputs_cls, theta_t)
+ elif refinement_method == "refinement_with_boundary":
+ preds = refinement_with_boundary(outputs_cls, outputs_boundary,
+ boundary_threshold)
+ else:
+ preds = np.zeros((1, 1))
+ assert refinement_method in func
+
+ return paddle.to_tensor(preds)
+
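+# Usage sketch (illustrative shapes): refine (N, C, T) class scores with the
+# (N, 1, T) boundary branch; the result is an integer label map of shape (N, T).
+#
+#   cls_np = np.random.randn(1, 11, 200).astype(np.float32)
+#   bdy_np = np.random.rand(1, 1, 200).astype(np.float32)
+#   refined = ASRFPostProcessing(cls_np, bdy_np, "refinement_with_boundary")
+#   # refined is a paddle tensor of shape [1, 200]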
+
+def _calculate_fan_in_and_fan_out(tensor):
+ dimensions = len(tensor.shape)
+ if dimensions < 2:
+ raise ValueError("Fan in and fan out can not be computed "
+ "for tensor with fewer than 2 dimensions")
+
+ if dimensions == 2: # Linear
+ fan_in = tensor.shape[1]
+ fan_out = tensor.shape[0]
+ else:
+ num_input_fmaps = tensor.shape[1]
+ num_output_fmaps = tensor.shape[0]
+ receptive_field_size = 1
+ if tensor.dim() > 2:
+ receptive_field_size = tensor[0][0].numel()
+ fan_in = num_input_fmaps * receptive_field_size
+ fan_out = num_output_fmaps * receptive_field_size
+
+ return fan_in, fan_out
+
+
+def calculate_gain(nonlinearity=None, a=None):
+ if nonlinearity == 'tanh':
+ return 5.0 / 3
+ elif nonlinearity == 'relu':
+ return math.sqrt(2.0)
+ elif nonlinearity == 'leaky_relu':
+ if a is not None:
+ return math.sqrt(2.0 / (1 + a**2))
+ else:
+ return math.sqrt(2.0 / (1 + 0.01**2))
+ elif nonlinearity == 'selu':
+ return 3.0 / 4
+ else:
+ return 1
+
+
+def KaimingUniform_like_torch(weight_npy,
+ mode='fan_in',
+ nonlinearity='leaky_relu'):
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy)
+ if mode == 'fan_in':
+ fan_mode = fan_in
+ else:
+ fan_mode = fan_out
+ a = math.sqrt(5.0)
+ gain = calculate_gain(nonlinearity=nonlinearity, a=a)
+ std = gain / math.sqrt(fan_mode)
+ bound = math.sqrt(3.0) * std
+ return np.random.uniform(-bound, bound, weight_npy.shape)
+
+
+def init_bias(weight_npy, bias_npy):
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy)
+ bound = 1.0 / math.sqrt(fan_in)
+ return np.random.uniform(-bound, bound, bias_npy.shape)
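+
+
+# Usage sketch: re-initialise a Conv1D layer so that its weight and bias follow
+# PyTorch's default kaiming-uniform scheme, mirroring how ASRFHead.init_weights
+# uses these helpers (the layer below is only an example).
+#
+#   conv = nn.Conv1D(64, 32, kernel_size=3)
+#   conv.weight.set_value(
+#       KaimingUniform_like_torch(conv.weight).astype('float32'))
+#   conv.bias.set_value(init_bias(conv.weight, conv.bias).astype('float32'))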
diff --git a/paddlevideo/modeling/heads/__init__.py b/paddlevideo/modeling/heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..79ff4fd81a2b3d0338779bf970a22f113283f7b6
--- /dev/null
+++ b/paddlevideo/modeling/heads/__init__.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .adds_head import AddsHead
+from .asrf_head import ASRFHead
+from .attention_lstm_head import AttentionLstmHead
+from .base import BaseHead
+from .bbox_head import BBoxHeadAVA
+from .cfbi_head import CollaborativeEnsemblerMS
+from .i3d_head import I3DHead
+from .movinet_head import MoViNetHead
+from .ms_tcn_head import MSTCNHead
+from .pptimesformer_head import ppTimeSformerHead
+from .pptsm_head import ppTSMHead
+from .pptsn_head import ppTSNHead
+from .roi_head import AVARoIHead
+from .single_straight3d import SingleRoIExtractor3D
+from .slowfast_head import SlowFastHead
+from .stgcn_head import STGCNHead
+from .timesformer_head import TimeSformerHead
+from .transnetv2_head import TransNetV2Head
+from .tsm_head import TSMHead
+from .tsn_head import TSNHead
+from .ctrgcn_head import CTRGCNHead
+
+__all__ = [
+ 'BaseHead', 'TSNHead', 'TSMHead', 'ppTSMHead', 'ppTSNHead', 'SlowFastHead',
+ 'AttentionLstmHead', 'TimeSformerHead', 'STGCNHead', 'TransNetV2Head',
+ 'I3DHead', 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'AddsHead',
+ 'ppTimeSformerHead', 'CollaborativeEnsemblerMS', 'MSTCNHead', 'ASRFHead',
+ 'MoViNetHead', 'CTRGCNHead'
+]
diff --git a/paddlevideo/modeling/heads/adds_head.py b/paddlevideo/modeling/heads/adds_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b1cd2462320484dd1d65e13939f55c876c5bbd6
--- /dev/null
+++ b/paddlevideo/modeling/heads/adds_head.py
@@ -0,0 +1,146 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cv2
+import numpy as np
+import paddle.nn as nn
+from paddlevideo.utils import get_dist_info
+import paddle
+from ..builder import build_loss
+from ..registry import HEADS
+
+MIN_DEPTH = 1e-3
+MAX_DEPTH = 80
+
+
+@HEADS.register()
+class AddsHead(nn.Layer):
+ """TimeSformerHead Head.
+
+ Args:
+ num_classes (int): The number of classes to be classified.
+ in_channels (int): The number of channles in input feature.
+ loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').
+ std(float): Std(Scale) value in normal initilizar. Default: 0.01.
+ kwargs (dict, optional): Any keyword argument to initialize.
+
+ """
+ def __init__(self,
+ avg_reprojection,
+ disparity_smoothness,
+ no_ssim,
+ loss_cfg=dict(name='ADDSLoss'),
+ max_gt_depth=60,
+ pred_depth_scale_factor=1):
+
+ super(AddsHead, self).__init__()
+ loss_cfg['avg_reprojection'] = avg_reprojection
+ loss_cfg['disparity_smoothness'] = disparity_smoothness
+ loss_cfg['no_ssim'] = no_ssim
+ self.max_gt_depth = max_gt_depth
+ self.pred_depth_scale_factor = pred_depth_scale_factor
+ self.loss_func = build_loss(loss_cfg)
+
+ def forward(self):
+ raise NotImplementedError
+
+ def loss(self, inputs, outputs):
+ if self.training:
+ return self.loss_func(inputs, outputs)
+ else:
+ abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = self.get_metrics(
+ outputs['pred_disp'], outputs['gt'])
+ outputs['abs_rel'] = abs_rel
+ outputs['sq_rel'] = sq_rel
+ outputs['rmse'] = rmse
+ outputs['rmse_log'] = rmse_log
+ outputs['a1'] = a1
+ outputs['a2'] = a2
+ outputs['a3'] = a3
+ return outputs
+
+ def get_metrics(self, pred_disp, gt_depth):
+ gt_height, gt_width = gt_depth.shape[:2]
+
+ pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))
+ pred_depth = 1 / pred_disp
+
+ mask = gt_depth > 0
+
+ pred_depth = pred_depth[mask]
+ gt_depth = gt_depth[mask]
+
+ pred_depth *= self.pred_depth_scale_factor
+ ratio = np.median(gt_depth) / np.median(pred_depth)
+ pred_depth *= ratio
+
+ pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH
+ pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH
+
+ mask2 = gt_depth <= self.max_gt_depth
+ pred_depth = pred_depth[mask2]
+ gt_depth = gt_depth[mask2]
+
+ abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = self.compute_errors(
+ gt_depth, pred_depth)
+
+ _, world_size = get_dist_info()
+ if world_size > 1:
+ # reduce sum when valid
+ # TODO: there are some problems with multi gpu gather code.
+ abs_rel = paddle.to_tensor(abs_rel)
+ sq_rel = paddle.to_tensor(sq_rel)
+ rmse = paddle.to_tensor(rmse)
+ rmse_log = paddle.to_tensor(rmse_log)
+ a1 = paddle.to_tensor(a1)
+ a2 = paddle.to_tensor(a2)
+ a3 = paddle.to_tensor(a3)
+ abs_rel = paddle.distributed.all_reduce(
+ abs_rel, op=paddle.distributed.ReduceOp.SUM) / world_size
+ sq_rel = paddle.distributed.all_reduce(
+ sq_rel, op=paddle.distributed.ReduceOp.SUM) / world_size
+ rmse = paddle.distributed.all_reduce(
+ rmse, op=paddle.distributed.ReduceOp.SUM) / world_size
+ rmse_log = paddle.distributed.all_reduce(
+ rmse_log, op=paddle.distributed.ReduceOp.SUM) / world_size
+ a1 = paddle.distributed.all_reduce(
+ a1, op=paddle.distributed.ReduceOp.SUM) / world_size
+ a2 = paddle.distributed.all_reduce(
+ a2, op=paddle.distributed.ReduceOp.SUM) / world_size
+ a3 = paddle.distributed.all_reduce(
+ a3, op=paddle.distributed.ReduceOp.SUM) / world_size
+ return abs_rel.item(), sq_rel.item(), rmse.item(), rmse_log.item(
+ ), a1.item(), a2.item(), a3.item()
+
+ return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3
+
+ def compute_errors(self, gt, pred):
+ """Computation of error metrics between predicted and ground truth depths
+ """
+ thresh = np.maximum((gt / pred), (pred / gt))
+ a1 = (thresh < 1.25).mean()
+ a2 = (thresh < 1.25**2).mean()
+ a3 = (thresh < 1.25**3).mean()
+
+ rmse = (gt - pred)**2
+ rmse = np.sqrt(rmse.mean())
+
+ rmse_log = (np.log(gt) - np.log(pred))**2
+ rmse_log = np.sqrt(rmse_log.mean())
+
+ abs_rel = np.mean(np.abs(gt - pred) / gt)
+
+ sq_rel = np.mean(((gt - pred)**2) / gt)
+
+ return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3
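+
+ # Example (illustrative values, assuming `head` is an already-built AddsHead):
+ #
+ #   gt = np.array([2.0, 4.0, 8.0])
+ #   pred = np.array([2.2, 3.6, 8.4])
+ #   abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = head.compute_errors(gt, pred)
+ #   # every ratio max(gt/pred, pred/gt) is below 1.25, so a1 == a2 == a3 == 1.0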
diff --git a/paddlevideo/modeling/heads/asrf_head.py b/paddlevideo/modeling/heads/asrf_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3aab77add49e2d8f5853b3483505cadf1c8a43a
--- /dev/null
+++ b/paddlevideo/modeling/heads/asrf_head.py
@@ -0,0 +1,212 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# https://github.com/yiskw713/asrf/libs/models/tcn.py
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+
+from paddle import ParamAttr
+
+from ..backbones.ms_tcn import SingleStageModel
+
+from .base import BaseHead
+from ..registry import HEADS
+from ..weight_init import weight_init_
+from ..framework.segmenters.utils import init_bias, KaimingUniform_like_torch
+
+
+@HEADS.register()
+class ASRFHead(BaseHead):
+
+ def __init__(self,
+ num_classes,
+ num_features,
+ num_stages,
+ num_layers,
+ num_stages_asb=None,
+ num_stages_brb=None):
+ super().__init__(num_classes=num_classes, in_channels=num_features)
+ if not isinstance(num_stages_asb, int):
+ num_stages_asb = num_stages
+
+ if not isinstance(num_stages_brb, int):
+ num_stages_brb = num_stages
+
+ self.num_layers = num_layers
+ self.num_stages_asb = num_stages_asb
+ self.num_stages_brb = num_stages_brb
+ self.num_features = num_features
+
+ # cls score
+ self.overlap = 0.5
+
+ self.conv_cls = nn.Conv1D(self.num_features, self.num_classes, 1)
+ self.conv_boundary = nn.Conv1D(self.num_features, 1, 1)
+
+ # action segmentation branch
+ asb = [
+ SingleStageModel(self.num_layers, self.num_features,
+ self.num_classes, self.num_classes)
+ for _ in range(self.num_stages_asb - 1)
+ ]
+
+ # boundary regression branch
+ brb = [
+ SingleStageModel(self.num_layers, self.num_features, 1, 1)
+ for _ in range(self.num_stages_brb - 1)
+ ]
+ self.brb = nn.LayerList(brb)
+ self.asb = nn.LayerList(asb)
+
+ self.activation_asb = nn.Softmax(axis=1)
+ self.activation_brb = nn.Sigmoid()
+
+ def init_weights(self):
+ """
+ initialize model layers' weight
+ """
+ # init weight
+ for layer in self.sublayers():
+ if isinstance(layer, nn.Conv1D):
+ layer.weight.set_value(
+ KaimingUniform_like_torch(layer.weight).astype('float32'))
+ if layer.bias is not None:
+ layer.bias.set_value(
+ init_bias(layer.weight, layer.bias).astype('float32'))
+
+ def forward(self, x):
+ """
+ ASRF head
+ """
+ out_cls = self.conv_cls(x)
+ out_boundary = self.conv_boundary(x)
+
+ outputs_cls = [out_cls]
+ outputs_boundary = [out_boundary]
+
+ for as_stage in self.asb:
+ out_cls = as_stage(self.activation_asb(out_cls))
+ outputs_cls.append(out_cls)
+
+ for br_stage in self.brb:
+ out_boundary = br_stage(self.activation_brb(out_boundary))
+ outputs_boundary.append(out_boundary)
+
+ return outputs_cls, outputs_boundary
+
+ def get_F1_score(self, predicted, groundTruth):
+ recog_content = list(predicted.numpy())
+ gt_content = list(groundTruth[0].numpy())
+
+ # cls score
+ correct = 0
+ total = 0
+ edit = 0
+
+ for i in range(len(gt_content)):
+ total += 1
+
+ if gt_content[i] == recog_content[i]:
+ correct += 1
+
+ edit_num = self.edit_score(recog_content, gt_content)
+ edit += edit_num
+
+ tp, fp, fn = self.f_score(recog_content, gt_content, self.overlap)
+
+ # cls metric
+
+ precision = tp / float(tp + fp)
+ recall = tp / float(tp + fn)
+
+ if precision + recall > 0.0:
+ f1 = 2.0 * (precision * recall) / (precision + recall)
+ else:
+ f1 = 0.0
+ f1 = np.nan_to_num(f1)
+ return f1
+
+ def get_labels_start_end_time(self, frame_wise_labels):
+ labels = []
+ starts = []
+ ends = []
+ last_label = frame_wise_labels[0]
+ labels.append(frame_wise_labels[0])
+ starts.append(0)
+ for i in range(len(frame_wise_labels)):
+ if frame_wise_labels[i] != last_label:
+ labels.append(frame_wise_labels[i])
+ starts.append(i)
+ ends.append(i)
+ last_label = frame_wise_labels[i]
+ ends.append(i + 1)
+ return labels, starts, ends
+
+ def levenstein(self, p, y, norm=False):
+ m_row = len(p)
+ n_col = len(y)
+ D = np.zeros([m_row + 1, n_col + 1], np.float64)
+ for i in range(m_row + 1):
+ D[i, 0] = i
+ for i in range(n_col + 1):
+ D[0, i] = i
+
+ for j in range(1, n_col + 1):
+ for i in range(1, m_row + 1):
+ if y[j - 1] == p[i - 1]:
+ D[i, j] = D[i - 1, j - 1]
+ else:
+ D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1,
+ D[i - 1, j - 1] + 1)
+
+ if norm:
+ score = (1 - D[-1, -1] / max(m_row, n_col)) * 100
+ else:
+ score = D[-1, -1]
+
+ return score
+
+ def edit_score(self, recognized, ground_truth, norm=True):
+ P, _, _ = self.get_labels_start_end_time(recognized)
+ Y, _, _ = self.get_labels_start_end_time(ground_truth)
+ return self.levenstein(P, Y, norm)
+
+ def f_score(self, recognized, ground_truth, overlap):
+ p_label, p_start, p_end = self.get_labels_start_end_time(recognized)
+ y_label, y_start, y_end = self.get_labels_start_end_time(ground_truth)
+
+ tp = 0
+ fp = 0
+
+ hits = np.zeros(len(y_label))
+
+ for j in range(len(p_label)):
+ intersection = np.minimum(p_end[j], y_end) - np.maximum(
+ p_start[j], y_start)
+ union = np.maximum(p_end[j], y_end) - np.minimum(
+ p_start[j], y_start)
+ IoU = (1.0 * intersection / union) * (
+ [p_label[j] == y_label[x] for x in range(len(y_label))])
+ # Get the best scoring segment
+ idx = np.array(IoU).argmax()
+
+ if IoU[idx] >= overlap and not hits[idx]:
+ tp += 1
+ hits[idx] = 1
+ else:
+ fp += 1
+ fn = len(y_label) - sum(hits)
+ return float(tp), float(fp), float(fn)
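+
+ # Example (illustrative): segmental F1@0.5 on toy frame-label sequences,
+ # assuming `head` is an already-built ASRFHead instance.
+ #
+ #   pred = paddle.to_tensor([0, 0, 0, 1, 1, 2, 2, 2])
+ #   gt = paddle.to_tensor([[0, 0, 1, 1, 1, 2, 2, 2]])
+ #   head.get_F1_score(pred, gt)   # -> 1.0, every segment overlaps its match by >= 50%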
diff --git a/paddlevideo/modeling/heads/attention_lstm_head.py b/paddlevideo/modeling/heads/attention_lstm_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3415a307db2a4467e9f4d3a9abab6648feefcc7
--- /dev/null
+++ b/paddlevideo/modeling/heads/attention_lstm_head.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import ParamAttr
+from paddle.nn.initializer import Normal
+from paddle.regularizer import L2Decay
+
+from ...metrics.youtube8m import eval_util as youtube8m_metrics
+from ..registry import HEADS
+from ..weight_init import weight_init_
+from .base import BaseHead
+
+
+@HEADS.register()
+class AttentionLstmHead(BaseHead):
+ """AttentionLstmHead.
+ Args: TODO
+ """
+ def __init__(self,
+ num_classes=3862,
+ feature_num=2,
+ feature_dims=[1024, 128],
+ embedding_size=512,
+ lstm_size=1024,
+ in_channels=2048,
+ loss_cfg=dict(name='CrossEntropyLoss')):
+ super(AttentionLstmHead, self).__init__(num_classes, in_channels,
+ loss_cfg)
+ self.num_classes = num_classes
+ self.feature_dims = feature_dims
+ self.embedding_size = embedding_size
+ self.lstm_size = lstm_size
+ self.feature_num = len(self.feature_dims)
+ for i in range(self.feature_num): # 0:rgb, 1:audio
+ fc_feature = paddle.nn.Linear(in_features=self.feature_dims[i],
+ out_features=self.embedding_size)
+ self.add_sublayer("fc_feature{}".format(i), fc_feature)
+
+ bi_lstm = paddle.nn.LSTM(input_size=self.embedding_size,
+ hidden_size=self.lstm_size,
+ direction="bidirectional")
+ self.add_sublayer("bi_lstm{}".format(i), bi_lstm)
+
+ drop_rate = 0.5
+ self.dropout = paddle.nn.Dropout(drop_rate)
+
+ att_fc = paddle.nn.Linear(in_features=self.lstm_size * 2,
+ out_features=1)
+ self.add_sublayer("att_fc{}".format(i), att_fc)
+ self.softmax = paddle.nn.Softmax()
+
+ self.fc_out1 = paddle.nn.Linear(in_features=self.lstm_size * 4,
+ out_features=8192,
+ bias_attr=ParamAttr(
+ regularizer=L2Decay(0.0),
+ initializer=Normal()))
+ self.relu = paddle.nn.ReLU()
+ self.fc_out2 = paddle.nn.Linear(in_features=8192,
+ out_features=4096,
+ bias_attr=ParamAttr(
+ regularizer=L2Decay(0.0),
+ initializer=Normal()))
+ self.fc_logit = paddle.nn.Linear(in_features=4096,
+ out_features=self.num_classes,
+ bias_attr=ParamAttr(
+ regularizer=L2Decay(0.0),
+ initializer=Normal()))
+ self.sigmoid = paddle.nn.Sigmoid()
+
+ def init_weights(self):
+ pass
+
+ def forward(self, inputs):
+ # inputs = [(rgb_data, rgb_len, rgb_mask), (audio_data, audio_len, audio_mask)]
+ # deal with features of different lengths:
+ # 1. pad them to the same length and build a tensor
+ # 2. build a mask tensor of the same shape, filled with 1 at valid positions
+ # 3. compute the output with the mask so that padding does not affect it
+ assert (len(inputs) == self.feature_num
+ ), "Input tensor does not contain {} features".format(
+ self.feature_num)
+ att_outs = []
+ for i in range(len(inputs)):
+ # 1. fc
+ m = getattr(self, "fc_feature{}".format(i))
+ output_fc = m(inputs[i][0])
+ output_fc = paddle.tanh(output_fc)
+
+ # 2. bi_lstm
+ m = getattr(self, "bi_lstm{}".format(i))
+ lstm_out, _ = m(inputs=output_fc, sequence_length=inputs[i][1])
+
+ lstm_dropout = self.dropout(lstm_out)
+
+ # 3. att_fc
+ m = getattr(self, "att_fc{}".format(i))
+ lstm_weight = m(lstm_dropout)
+
+ # 4. softmax replace start, for it's relevant to sum in time step
+ lstm_exp = paddle.exp(lstm_weight)
+ lstm_mask = paddle.mean(inputs[i][2], axis=2)
+ lstm_mask = paddle.unsqueeze(lstm_mask, axis=2)
+ lstm_exp_with_mask = paddle.multiply(x=lstm_exp, y=lstm_mask)
+ lstm_sum_with_mask = paddle.sum(lstm_exp_with_mask, axis=1)
+ exponent = -1
+ lstm_denominator = paddle.pow(lstm_sum_with_mask, exponent)
+ lstm_denominator = paddle.unsqueeze(lstm_denominator, axis=2)
+ lstm_softmax = paddle.multiply(x=lstm_exp, y=lstm_denominator)
+ lstm_weight = lstm_softmax
+ # softmax replace end
+
+ lstm_scale = paddle.multiply(x=lstm_dropout, y=lstm_weight)
+
+ # 5. sequence_pool's replace start, for it's relevant to sum in time step
+ lstm_scale_with_mask = paddle.multiply(x=lstm_scale, y=lstm_mask)
+ fea_lens = inputs[i][1]
+ fea_len = int(fea_lens[0])
+ lstm_pool = paddle.sum(lstm_scale_with_mask, axis=1)
+ # sequence_pool's replace end
+ att_outs.append(lstm_pool)
+ att_out = paddle.concat(att_outs, axis=1)
+ fc_out1 = self.fc_out1(att_out)
+ fc_out1_act = self.relu(fc_out1)
+ fc_out2 = self.fc_out2(fc_out1_act)
+ fc_out2_act = paddle.tanh(fc_out2)
+ fc_logit = self.fc_logit(fc_out2_act)
+ output = self.sigmoid(fc_logit)
+ return fc_logit, output
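+
+ # Input-format sketch (illustrative shapes only): two modalities, each given
+ # as a (data, seq_len, mask) triple padded to a common number of time steps.
+ # With the default feature_dims=[1024, 128]:
+ #
+ #   rgb = (paddle.rand([8, 100, 1024]), paddle.to_tensor([100] * 8),
+ #          paddle.ones([8, 100, 1]))
+ #   audio = (paddle.rand([8, 100, 128]), paddle.to_tensor([100] * 8),
+ #            paddle.ones([8, 100, 1]))
+ #   logits, probs = head([rgb, audio])   # each of shape [8, num_classes]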
+
+ def loss(self, lstm_logit, labels, **kwargs):
+ labels.stop_gradient = True
+ losses = dict()
+ bce_logit_loss = paddle.nn.BCEWithLogitsLoss(reduction='sum')
+ sum_cost = bce_logit_loss(lstm_logit, labels)
+ return sum_cost
+
+ def metric(self, lstm_output, labels):
+ pred = lstm_output.numpy()
+ label = labels.numpy()
+ hit_at_one = youtube8m_metrics.calculate_hit_at_one(pred, label)
+ perr = youtube8m_metrics.calculate_precision_at_equal_recall_rate(
+ pred, label)
+ gap = youtube8m_metrics.calculate_gap(pred, label)
+ return hit_at_one, perr, gap
diff --git a/paddlevideo/modeling/heads/base.py b/paddlevideo/modeling/heads/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..887fceef56d2c7b8083889c851428e33b5fb1bc1
--- /dev/null
+++ b/paddlevideo/modeling/heads/base.py
@@ -0,0 +1,176 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from abc import abstractmethod
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ..builder import build_loss
+from paddlevideo.utils import get_logger, get_dist_info
+
+logger = get_logger("paddlevideo")
+
+
+class BaseHead(nn.Layer):
+ """Base class for head part.
+
+ All head should subclass it.
+ All subclass should overwrite:
+
+ - Methods: ```init_weights```, initializing weights.
+ - Methods: ```forward```, forward function.
+
+ Args:
+ num_classes (int): The number of classes to be classified.
+ in_channels (int): The number of channels in input feature.
+ loss_cfg (dict): Config for building loss. Default: dict(type='CrossEntropyLoss').
+ ls_eps (float): label smoothing epsilon. Default: 0.0.
+
+ """
+
+ def __init__(
+ self,
+ num_classes=None,
+ in_channels=None,
+ loss_cfg=dict(
+ name="CrossEntropyLoss"
+ ), #TODO(shipping): only pass a name or standard build cfg format.
+ #multi_class=False, NOTE(shipping): not supported now.
+ ls_eps=0.):
+
+ super().__init__()
+ self.num_classes = num_classes
+ self.in_channels = in_channels
+ self.loss_func = build_loss(loss_cfg)
+ #self.multi_class = multi_class NOTE(shipping): not supported now
+ self.ls_eps = ls_eps
+
+ @abstractmethod
+ def forward(self, x):
+ """Define how the head is going to run.
+ """
+ raise NotImplementedError
+
+ def loss(self, scores, labels, valid_mode=False, if_top5=True, **kwargs):
+ """Calculate the loss accroding to the model output ```scores```,
+ and the target ```labels```.
+
+ Args:
+ scores (paddle.Tensor): The output of the model.
+ labels (paddle.Tensor): The target output of the model.
+
+ Returns:
+ losses (dict): A dict containing field 'loss'(mandatory) and 'top1_acc', 'top5_acc'(optional).
+
+ """
+ if len(labels) == 1:  # common case
+ labels = labels[0]
+ losses = dict()
+ if self.ls_eps != 0. and not valid_mode: # label_smooth
+ loss = self.label_smooth_loss(scores, labels, **kwargs)
+ else:
+ loss = self.loss_func(scores, labels, **kwargs)
+ if if_top5:
+ top1, top5 = self.get_acc(scores, labels, valid_mode)
+ losses['top1'] = top1
+ losses['top5'] = top5
+ losses['loss'] = loss
+ else:
+ top1 = self.get_acc(scores, labels, valid_mode, if_top5)
+ losses['top1'] = top1
+ losses['loss'] = loss
+ return losses
+ # MRI is currently a binary-classification task, so there is no top-5 metric
+ elif len(labels) == 3: # mix_up
+ labels_a, labels_b, lam = labels
+ lam = lam[0] # get lam value
+ losses = dict()
+ if self.ls_eps != 0:
+ loss_a = self.label_smooth_loss(scores, labels_a, **kwargs)
+ loss_b = self.label_smooth_loss(scores, labels_b, **kwargs)
+ else:
+ loss_a = self.loss_func(scores, labels_a, **kwargs)
+ loss_b = self.loss_func(scores, labels_b, **kwargs)
+ loss = lam * loss_a + (1 - lam) * loss_b
+
+ if if_top5:
+ top1a, top5a = self.get_acc(scores, labels_a, valid_mode)
+ top1b, top5b = self.get_acc(scores, labels_b, valid_mode)
+ top1 = lam * top1a + (1 - lam) * top1b
+ top5 = lam * top5a + (1 - lam) * top5b
+ losses['top1'] = top1
+ losses['top5'] = top5
+ losses['loss'] = loss
+
+ else:
+ top1a = self.get_acc(scores, labels_a, valid_mode, if_top5)
+ top1b = self.get_acc(scores, labels_b, valid_mode, if_top5)
+ top1 = lam * top1a + (1 - lam) * top1b
+ losses['top1'] = top1
+ losses['loss'] = loss
+
+ return losses
+ else:
+ raise NotImplementedError
+
+ def label_smooth_loss(self, scores, labels, **kwargs):
+ """
+ Args:
+ scores (paddle.Tensor): [N, num_classes]
+ labels (paddle.Tensor): [N, ]
+ Returns:
+ paddle.Tensor: [1,]
+ """
+ if paddle.fluid.core.is_compiled_with_npu():
+ """
+ Designed for the lack of temporary operators of NPU,
+ main idea is to split smooth loss into uniform distribution loss
+ and hard label calculation
+ """
+ hard_loss = (1.0 - self.ls_eps) * F.cross_entropy(scores, labels)
+ uniform_loss = (self.ls_eps / self.num_classes) * (
+ -F.log_softmax(scores, -1).sum(-1).mean(0))
+ loss = hard_loss + uniform_loss
+ else:
+ labels = F.one_hot(labels, self.num_classes)
+ labels = F.label_smooth(labels, epsilon=self.ls_eps)
+ labels = paddle.squeeze(labels, axis=1)
+ loss = self.loss_func(scores, labels, soft_label=True, **kwargs)
+ return loss
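+
+ # Why this split is equivalent (a sketch): with a one-hot target y, smoothing
+ # factor eps and K classes, cross-entropy on the smoothed labels expands to
+ #   (1 - eps) * CE(scores, y) + (eps / K) * sum_c(-log_softmax(scores)_c),
+ # which is exactly hard_loss + uniform_loss in the NPU branch above.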
+
+ def get_acc(self, scores, labels, valid_mode, if_top5=True):
+ if if_top5:
+ top1 = paddle.metric.accuracy(input=scores, label=labels, k=1)
+ top5 = paddle.metric.accuracy(input=scores, label=labels, k=5)
+ _, world_size = get_dist_info()
+ #NOTE(shipping): deal with multi cards validate
+ if world_size > 1 and valid_mode: #reduce sum when valid
+ top1 = paddle.distributed.all_reduce(
+ top1, op=paddle.distributed.ReduceOp.SUM) / world_size
+ top5 = paddle.distributed.all_reduce(
+ top5, op=paddle.distributed.ReduceOp.SUM) / world_size
+
+ return top1, top5
+ else:
+ top1 = paddle.metric.accuracy(input=scores, label=labels, k=1)
+ _, world_size = get_dist_info()
+ #NOTE(shipping): deal with multi cards validate
+ if world_size > 1 and valid_mode: #reduce sum when valid
+ top1 = paddle.distributed.all_reduce(
+ top1, op=paddle.distributed.ReduceOp.SUM) / world_size
+
+ return top1
diff --git a/paddlevideo/modeling/heads/bbox_head.py b/paddlevideo/modeling/heads/bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..688251ebb8499df88a2e544428a5af03fb74ff10
--- /dev/null
+++ b/paddlevideo/modeling/heads/bbox_head.py
@@ -0,0 +1,225 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+from .. import builder
+
+from ..registry import HEADS
+
+@HEADS.register()
+class BBoxHeadAVA(nn.Layer):
+ """Simplest RoI head, with only two fc layers for classification and
+ regression respectively. """
+
+ def __init__(
+ self,
+ temporal_pool_type='avg',
+ spatial_pool_type='max',
+ in_channels=2048,
+ num_classes=81,  # the first class is reserved to classify bbox as pos / neg
+ dropout_ratio=0,
+ dropout_before_pool=True,
+ topk=(3, 5),
+ multilabel=True):
+
+ super(BBoxHeadAVA, self).__init__()
+ assert temporal_pool_type in ['max', 'avg']
+ assert spatial_pool_type in ['max', 'avg']
+ self.temporal_pool_type = temporal_pool_type
+ self.spatial_pool_type = spatial_pool_type
+
+ self.in_channels = in_channels
+ self.num_classes = num_classes
+
+ self.dropout_ratio = dropout_ratio
+ self.dropout_before_pool = dropout_before_pool
+
+ self.multilabel = multilabel
+ if topk is None:
+ self.topk = ()
+ elif isinstance(topk, int):
+ self.topk = (topk, )
+ elif isinstance(topk, tuple):
+ assert all([isinstance(k, int) for k in topk])
+ self.topk = topk
+ else:
+ raise TypeError('topk should be int or tuple[int], '
+ f'but get {type(topk)}')
+ # Class 0 is ignored when calculating multilabel accuracy,
+ # so topk cannot be equal to num_classes
+ assert all([k < num_classes for k in self.topk])
+ assert self.multilabel
+
+ in_channels = self.in_channels
+ if self.temporal_pool_type == 'avg':
+ self.temporal_pool = nn.AdaptiveAvgPool3D((1, None, None))
+ else:
+ self.temporal_pool = nn.AdaptiveMaxPool3D((1, None, None))
+ if self.spatial_pool_type == 'avg':
+ self.spatial_pool = nn.AdaptiveAvgPool3D((None, 1, 1))
+ else:
+ self.spatial_pool = nn.AdaptiveMaxPool3D((None, 1, 1))
+
+ if dropout_ratio > 0:
+ self.dropout = nn.Dropout(dropout_ratio)
+
+ weight_attr = paddle.framework.ParamAttr(name="weight",
+ initializer=paddle.nn.initializer.Normal(mean=0.0, std=0.01))
+ bias_attr = paddle.ParamAttr(name="bias",
+ initializer=paddle.nn.initializer.Constant(value=0.0))
+
+ self.fc_cls = nn.Linear(in_channels, num_classes, weight_attr=weight_attr, bias_attr=bias_attr)
+
+ self.debug_imgs = None
+
+ def forward(self, x, rois, rois_num):
+ roi = paddle.concat(rois)
+ roi_x1 = paddle.index_select(roi, index=paddle.to_tensor(0), axis=1)
+ roi_x2 = paddle.index_select(roi, index=paddle.to_tensor(2), axis=1)
+ roi_w = roi_x2 - roi_x1
+ roi_y1 = paddle.index_select(roi, index=paddle.to_tensor(1), axis=1)
+ roi_y2 = paddle.index_select(roi, index=paddle.to_tensor(3), axis=1)
+ roi_h = roi_y2 - roi_y1
+ roi_area = paddle.multiply(roi_w, roi_h)
+ A = roi_area
+ A1 = paddle.full(A.shape, 1, dtype='int32')
+ A2 = paddle.where(A == 0, paddle.zeros_like(A1), A1)
+ AE = paddle.expand(A2, [A.shape[0], x.shape[1]])
+ rois_num = paddle.to_tensor(rois_num, dtype='int32')
+ if self.dropout_before_pool and self.dropout_ratio > 0:
+ x = self.dropout(x)
+ x = self.temporal_pool(x)
+ x = self.spatial_pool(x)
+ if not self.dropout_before_pool and self.dropout_ratio > 0:
+ x = self.dropout(x)
+ x = paddle.reshape(x, [x.shape[0], -1])
+ x = paddle.multiply(x, paddle.cast(AE,"float32"))
+ cls_score = self.fc_cls(x)
+ # We do not predict bbox, so return None
+ return cls_score, None
+
+ def get_targets(self, sampling_results, gt_bboxes, gt_labels, pos_weight):
+ pos_proposals = [res.pos_bboxes for res in sampling_results]
+ neg_proposals = [res.neg_bboxes for res in sampling_results]
+ pos_gt_labels = [res.pos_gt_labels for res in sampling_results]
+ cls_reg_targets = self.bbox_target(pos_proposals, neg_proposals,
+ pos_gt_labels, pos_weight)
+ return cls_reg_targets
+
+ def bbox_target(self, pos_bboxes_list, neg_bboxes_list, gt_labels, pos_weight):
+ """Generate classification targets for bboxes. """
+ labels, label_weights = [], []
+ pos_weight = 1.0 if pos_weight <= 0 else pos_weight
+
+ assert len(pos_bboxes_list) == len(neg_bboxes_list) == len(gt_labels)
+ length = len(pos_bboxes_list)
+
+ for i in range(length):
+ pos_bboxes = pos_bboxes_list[i]
+ neg_bboxes = neg_bboxes_list[i]
+ gt_label = gt_labels[i]
+ num_pos = pos_bboxes.shape[0]
+ if neg_bboxes is not None:
+ num_neg = neg_bboxes.shape[0]
+ else:
+ num_neg = 0
+ num_samples = num_pos + num_neg
+ neg_label = paddle.zeros([num_neg, gt_label.shape[1]])
+ label = paddle.concat([gt_label,neg_label])
+ labels.append(label)
+
+ labels = paddle.concat(labels, 0)
+ return labels
+
+ def recall_prec(self, pred_vec, target_vec):
+ correct = paddle.to_tensor(np.logical_and(pred_vec.numpy(), target_vec.numpy()))
+ correct = paddle.where(correct,
+ paddle.full(correct.shape,1,dtype='int32'),
+ paddle.full(correct.shape,0,dtype='int32'))
+ recall_correct = paddle.cast(paddle.sum(correct, axis=1), 'float32')
+ target_vec = paddle.where(target_vec,
+ paddle.full(target_vec.shape,1,dtype='int32'),
+ paddle.full(target_vec.shape,0,dtype='int32'))
+ recall_target = paddle.cast(paddle.sum(target_vec, axis=1),'float32')
+ recall = recall_correct / recall_target
+ pred_vec = paddle.where(pred_vec,
+ paddle.full(pred_vec.shape,1,dtype='int32'),
+ paddle.full(pred_vec.shape,0,dtype='int32'))
+ prec_target = paddle.cast(paddle.sum(pred_vec, axis=1) + 1e-6, 'float32')
+ prec = recall_correct / prec_target
+ recall_mean = paddle.mean(recall)
+ prec_mean = paddle.mean(prec)
+ return recall_mean, prec_mean
+
+ def multilabel_accuracy(self, pred, target, thr=0.5):
+ pred = paddle.nn.functional.sigmoid(pred)
+ pred_vec = pred > thr
+ target_vec = target > 0.5
+ recall_thr, prec_thr = self.recall_prec(pred_vec, target_vec)
+ recalls, precs = [], []
+ for k in self.topk:
+ _, pred_label = paddle.topk(pred, k, 1, True, True)
+ pred_vec = paddle.full(pred.shape,0,dtype='bool')
+ num_sample = pred.shape[0]
+ for i in range(num_sample):
+ pred_vec[i, pred_label[i].numpy()] = 1
+ recall_k, prec_k = self.recall_prec(pred_vec, target_vec)
+ recalls.append(recall_k)
+ precs.append(prec_k)
+ return recall_thr, prec_thr, recalls, precs
+
+ def loss(self,
+ cls_score,
+ labels):
+ losses = dict()
+ if cls_score is not None:
+ # Only use the cls_score
+ labels = labels[:, 1:]
+ pos_inds_bool = paddle.sum(labels, axis=-1) > 0
+ pos_inds = paddle.where(paddle.sum(labels, axis=-1) > 0,
+ paddle.full([labels.shape[0]],1,dtype='int32'),
+ paddle.full([labels.shape[0]],0,dtype='int32'))
+ pos_inds = paddle.nonzero(pos_inds, as_tuple=False)
+ cls_score = paddle.index_select(cls_score, pos_inds, axis=0)
+ cls_score = cls_score[:, 1:]
+ labels = paddle.index_select(labels, pos_inds, axis=0)
+ bce_loss = F.binary_cross_entropy_with_logits
+ loss = bce_loss(cls_score, labels, reduction='none')
+ losses['loss'] = paddle.mean(loss)
+ recall_thr, prec_thr, recall_k, prec_k = self.multilabel_accuracy(
+ cls_score, labels, thr=0.5)
+ losses['recall@thr=0.5'] = recall_thr
+ losses['prec@thr=0.5'] = prec_thr
+ for i, k in enumerate(self.topk):
+ losses[f'recall@top{k}'] = recall_k[i]
+ losses[f'prec@top{k}'] = prec_k[i]
+ return losses
+
+ def get_det_bboxes(self,
+ rois,
+ cls_score,
+ img_shape,
+ flip=False,
+ crop_quadruple=None,
+ cfg=None):
+ if isinstance(cls_score, list):
+ cls_score = sum(cls_score) / float(len(cls_score))
+ assert self.multilabel
+ m = paddle.nn.Sigmoid()
+ scores = m(cls_score)
+ bboxes = rois
+ return bboxes, scores
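+
+ # Shape sketch (illustrative): `loss` expects multilabel targets of shape
+ # (num_rois, num_classes) with class 0 reserved as the pos/neg indicator;
+ # only rows with at least one positive action label contribute to the BCE loss.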
diff --git a/paddlevideo/modeling/heads/cfbi_head.py b/paddlevideo/modeling/heads/cfbi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7cbd910ef3c233eb4313d9506cee2cefd1e9746
--- /dev/null
+++ b/paddlevideo/modeling/heads/cfbi_head.py
@@ -0,0 +1,448 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from .base import BaseHead
+from ..registry import HEADS
+from ..weight_init import weight_init_
+
+
+class IA_gate(nn.Layer):
+ def __init__(self, in_dim, out_dim):
+ super(IA_gate, self).__init__()
+ self.IA = nn.Linear(in_dim, out_dim)
+
+ def forward(self, x, IA_head):
+ a = self.IA(IA_head)
+ a = 1. + paddle.tanh(a)
+ a = paddle.unsqueeze(paddle.unsqueeze(a, axis=-1), axis=-1)
+ x = a * x
+ return x
+
+
+class GCT(nn.Layer):
+ def __init__(self, num_channels, epsilon=1e-5, mode='l2', after_relu=False):
+ super(GCT, self).__init__()
+ x1 = paddle.zeros([1, num_channels, 1, 1])
+ x2 = paddle.ones([1, num_channels, 1, 1])
+ self.alpha = paddle.create_parameter(
+ shape=x2.shape,
+ dtype=x2.dtype,
+ default_initializer=nn.initializer.Assign(x2))
+ self.alpha.stop_gradient = False
+ self.gamma = paddle.create_parameter(
+ shape=x1.shape,
+ dtype=x1.dtype,
+ default_initializer=nn.initializer.Assign(x1))
+ self.gamma.stop_gradient = False
+ self.beta = paddle.create_parameter(
+ shape=x1.shape,
+ dtype=x1.dtype,
+ default_initializer=nn.initializer.Assign(x1))
+ self.beta.stop_gradient = False
+
+ self.epsilon = epsilon
+ self.mode = mode
+ self.after_relu = after_relu
+
+ def forward(self, x):
+
+ if self.mode == 'l2':
+ embedding = paddle.pow(
+ paddle.sum(paddle.pow(x, 2), axis=[2, 3], keepdim=True) +
+ self.epsilon, 0.5) * self.alpha
+ norm = self.gamma / paddle.pow(
+ (paddle.mean(paddle.pow(embedding, 2), axis=1, keepdim=True) +
+ self.epsilon), 0.5)
+ elif self.mode == 'l1':
+ if not self.after_relu:
+ _x = paddle.abs(x)
+ else:
+ _x = x
+ embedding = paddle.sum(_x, axis=(2, 3), keepdim=True) * self.alpha
+ norm = self.gamma / (paddle.mean(
+ paddle.abs(embedding), axis=1, keepdim=True) + self.epsilon)
+ else:
+ raise ValueError('Unknown GCT mode: {}'.format(self.mode))
+
+ gate = 1. + paddle.tanh(embedding * norm + self.beta)
+
+ return x * gate
+
+
+class Bottleneck(nn.Layer):
+ def __init__(self, inplanes, outplanes, stride=1, dilation=1):
+ super(Bottleneck, self).__init__()
+ expansion = 4
+ planes = int(outplanes / expansion)
+
+ self.GCT1 = GCT(inplanes)
+ self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)
+ self.bn1 = nn.GroupNorm(num_groups=32, num_channels=planes)
+
+ self.conv2 = nn.Conv2D(planes,
+ planes,
+ kernel_size=3,
+ stride=stride,
+ dilation=dilation,
+ padding=dilation,
+ bias_attr=False)
+ self.bn2 = nn.GroupNorm(num_groups=32, num_channels=planes)
+
+ self.conv3 = nn.Conv2D(planes,
+ planes * expansion,
+ kernel_size=1,
+ bias_attr=False)
+ self.bn3 = nn.GroupNorm(num_groups=32, num_channels=planes * expansion)
+ self.relu = nn.ReLU()
+ if stride != 1 or inplanes != planes * expansion:
+ downsample = nn.Sequential(
+ nn.Conv2D(inplanes,
+ planes * expansion,
+ kernel_size=1,
+ stride=stride,
+ bias_attr=False),
+ nn.GroupNorm(num_groups=32, num_channels=planes * expansion),
+ )
+ else:
+ downsample = None
+ self.downsample = downsample
+
+ self.stride = stride
+ self.dilation = dilation
+
+ for m in self.sublayers():
+ if isinstance(m, nn.Conv2D):
+ nn.initializer.KaimingNormal()
+
+ def forward(self, x):
+ residual = x
+
+ out = self.GCT1(x)
+ out = self.conv1(out)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class _ASPPModule(nn.Layer):
+ def __init__(self, inplanes, planes, kernel_size, padding, dilation):
+ super(_ASPPModule, self).__init__()
+ self.GCT = GCT(inplanes)
+ self.atrous_conv = nn.Conv2D(inplanes,
+ planes,
+ kernel_size=kernel_size,
+ stride=1,
+ padding=padding,
+ dilation=dilation,
+ bias_attr=False)
+ self.bn = nn.GroupNorm(num_groups=int(planes / 4), num_channels=planes)
+ self.relu = nn.ReLU()
+
+ self._init_weight()
+
+ def forward(self, x):
+ x = self.GCT(x)
+ x = self.atrous_conv(x)
+ x = self.bn(x)
+
+ return self.relu(x)
+
+ def _init_weight(self):
+ for m in self.sublayers():
+ if isinstance(m, nn.Conv2D):
+ nn.initializer.KaimingNormal()
+ elif isinstance(m, nn.GroupNorm):
+ m.weight.data = nn.initializer.Constant(1)
+ m.bias.data = nn.initializer.Constant(0)
+
+
+class ASPP(nn.Layer):
+ def __init__(self):
+ super(ASPP, self).__init__()
+
+ inplanes = 512
+ dilations = [1, 6, 12, 18]
+
+ self.aspp1 = _ASPPModule(inplanes,
+ 128,
+ 1,
+ padding=0,
+ dilation=dilations[0])
+ self.aspp2 = _ASPPModule(inplanes,
+ 128,
+ 3,
+ padding=dilations[1],
+ dilation=dilations[1])
+ self.aspp3 = _ASPPModule(inplanes,
+ 128,
+ 3,
+ padding=dilations[2],
+ dilation=dilations[2])
+ self.aspp4 = _ASPPModule(inplanes,
+ 128,
+ 3,
+ padding=dilations[3],
+ dilation=dilations[3])
+
+ self.global_avg_pool = nn.Sequential(
+ nn.AdaptiveAvgPool2D((1, 1)),
+ nn.Conv2D(inplanes, 128, 1, stride=1, bias_attr=False), nn.ReLU())
+
+ self.GCT = GCT(640)
+ self.conv1 = nn.Conv2D(640, 256, 1, bias_attr=False)
+ self.bn1 = nn.GroupNorm(num_groups=32, num_channels=256)
+ self.relu = nn.ReLU()
+ self._init_weight()
+
+ def forward(self, x):
+ x1 = self.aspp1(x)
+ x2 = self.aspp2(x)
+ x3 = self.aspp3(x)
+ x4 = self.aspp4(x)
+ x5 = self.global_avg_pool(x)
+ x5 = F.interpolate(x5,
+ size=x4.shape[2:],
+ mode='bilinear',
+ align_corners=True)
+ x = paddle.concat([x1, x2, x3, x4, x5], axis=1)
+
+ x = self.GCT(x)
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+
+ return x
+
+ def _init_weight(self):
+ # apply the initializers in place; constructing them alone is a no-op
+ for m in self.sublayers():
+ if isinstance(m, nn.Conv2D):
+ nn.initializer.KaimingNormal()(m.weight)
+ elif isinstance(m, nn.GroupNorm):
+ nn.initializer.Constant(1.0)(m.weight)
+ nn.initializer.Constant(0.0)(m.bias)
+
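+# Illustrative sketch (hypothetical input size): ASPP above is hard-wired to 512
+# input channels; the four dilated branches plus global pooling are concatenated
+# to 640 channels and fused back to 256 at the same spatial resolution.
+def _aspp_shape_sketch():
+    aspp = ASPP()
+    y = aspp(paddle.randn([1, 512, 30, 30]))  # -> [1, 256, 30, 30]
+    return y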
+
+@HEADS.register()
+class CollaborativeEnsemblerMS(nn.Layer):
+ def __init__(
+ self,
+ model_semantic_embedding_dim=256,
+ model_multi_local_distance=[[4, 8, 12, 16, 20, 24],
+ [2, 4, 6, 8, 10, 12], [2, 4, 6, 8, 10]],
+ model_head_embedding_dim=256,
+ model_refine_channels=64,
+ model_low_level_inplanes=256,
+ ):
+ super(CollaborativeEnsemblerMS, self).__init__()
+ in_dim_4x = model_semantic_embedding_dim * 3 + 3 + 2 * len(
+ model_multi_local_distance[0])
+ in_dim_8x = model_semantic_embedding_dim * 3 + 3 + 2 * len(
+ model_multi_local_distance[1])
+ in_dim_16x = model_semantic_embedding_dim * 3 + 3 + 2 * len(
+ model_multi_local_distance[2])
+ attention_dim = model_semantic_embedding_dim * 4
+ embed_dim = model_head_embedding_dim
+ refine_dim = model_refine_channels
+ low_level_dim = model_low_level_inplanes
+
+ IA_in_dim = attention_dim
+
+ self.relu = nn.ReLU()
+
+ # stage 1
+
+ self.S1_IA1 = IA_gate(IA_in_dim, in_dim_4x)
+ self.S1_layer1 = Bottleneck(in_dim_4x, embed_dim)
+
+ self.S1_IA2 = IA_gate(IA_in_dim, embed_dim)
+ self.S1_layer2 = Bottleneck(embed_dim, embed_dim, 1, 2)
+
+ # stage2
+ self.S2_IA1 = IA_gate(IA_in_dim, embed_dim)
+ self.S2_layer1 = Bottleneck(embed_dim, embed_dim * 2, 2)
+
+ self.S2_IA2 = IA_gate(IA_in_dim, embed_dim * 2 + in_dim_8x)
+ self.S2_layer2 = Bottleneck(embed_dim * 2 + in_dim_8x, embed_dim * 2, 1,
+ 2)
+
+ self.S2_IA3 = IA_gate(IA_in_dim, embed_dim * 2)
+ self.S2_layer3 = Bottleneck(embed_dim * 2, embed_dim * 2, 1, 4)
+
+ # stage3
+ self.S3_IA1 = IA_gate(IA_in_dim, embed_dim * 2)
+ self.S3_layer1 = Bottleneck(embed_dim * 2, embed_dim * 2, 2)
+
+ self.S3_IA2 = IA_gate(IA_in_dim, embed_dim * 2 + in_dim_16x)
+ self.S3_layer2 = Bottleneck(embed_dim * 2 + in_dim_16x, embed_dim * 2,
+ 1, 2)
+
+ self.S3_IA3 = IA_gate(IA_in_dim, embed_dim * 2)
+ self.S3_layer3 = Bottleneck(embed_dim * 2, embed_dim * 2, 1, 4)
+
+ self.ASPP_IA = IA_gate(IA_in_dim, embed_dim * 2)
+ self.ASPP = ASPP()
+
+ # Decoder
+ self.GCT_sc = GCT(low_level_dim + embed_dim)
+ self.conv_sc = nn.Conv2D(low_level_dim + embed_dim,
+ refine_dim,
+ 1,
+ bias_attr=False)
+ self.bn_sc = nn.GroupNorm(num_groups=int(refine_dim / 4),
+ num_channels=refine_dim)
+ self.relu = nn.ReLU()
+
+ self.IA10 = IA_gate(IA_in_dim, embed_dim + refine_dim)
+ self.conv1 = nn.Conv2D(embed_dim + refine_dim,
+ int(embed_dim / 2),
+ kernel_size=3,
+ padding=1,
+ bias_attr=False)
+ self.bn1 = nn.GroupNorm(num_groups=32, num_channels=int(embed_dim / 2))
+
+ self.IA11 = IA_gate(IA_in_dim, int(embed_dim / 2))
+ self.conv2 = nn.Conv2D(int(embed_dim / 2),
+ int(embed_dim / 2),
+ kernel_size=3,
+ padding=1,
+ bias_attr=False)
+ self.bn2 = nn.GroupNorm(num_groups=32, num_channels=int(embed_dim / 2))
+
+ # Output
+ self.IA_final_fg = nn.Linear(IA_in_dim, int(embed_dim / 2) + 1)
+ self.IA_final_bg = nn.Linear(IA_in_dim, int(embed_dim / 2) + 1)
+
+ # apply Kaiming initialization to the decoder convolutions
+ nn.initializer.KaimingNormal()(self.conv_sc.weight)
+ nn.initializer.KaimingNormal()(self.conv1.weight)
+ nn.initializer.KaimingNormal()(self.conv2.weight)
+
+ def forward(self, all_x, all_IA_head=None, low_level_feat=None):
+ x_4x, x_8x, x_16x = all_x
+ IA_head = all_IA_head[0]
+
+ # stage 1
+ x = self.S1_IA1(x_4x, IA_head)
+ x = self.S1_layer1(x)
+
+ x = self.S1_IA2(x, IA_head)
+ x = self.S1_layer2(x)
+
+ low_level_feat = paddle.concat(
+ [paddle.expand(low_level_feat, [x.shape[0], -1, -1, -1]), x],
+ axis=1)
+
+ # stage 2
+ x = self.S2_IA1(x, IA_head)
+ x = self.S2_layer1(x)
+
+ x = paddle.concat([x, x_8x], axis=1)
+ x = self.S2_IA2(x, IA_head)
+ x = self.S2_layer2(x)
+
+ x = self.S2_IA3(x, IA_head)
+ x = self.S2_layer3(x)
+
+ # stage 3
+ x = self.S3_IA1(x, IA_head)
+ x = self.S3_layer1(x)
+
+ x = paddle.concat([x, x_16x], axis=1)
+ x = self.S3_IA2(x, IA_head)
+ x = self.S3_layer2(x)
+
+ x = self.S3_IA3(x, IA_head)
+ x = self.S3_layer3(x)
+
+ # ASPP + Decoder
+ x = self.ASPP_IA(x, IA_head)
+ x = self.ASPP(x)
+
+ x = self.decoder(x, low_level_feat, IA_head)
+
+ fg_logit = self.IA_logit(x, IA_head, self.IA_final_fg)
+ bg_logit = self.IA_logit(x, IA_head, self.IA_final_bg)
+
+ pred = self.augment_background_logit(fg_logit, bg_logit)
+
+ return pred
+
+ def IA_logit(self, x, IA_head, IA_final):
+ n, c, h, w = x.shape
+ x = paddle.reshape(x, [1, n * c, h, w])
+ IA_output = IA_final(IA_head)
+ IA_weight = IA_output[:, :c]
+ IA_bias = IA_output[:, -1]
+ IA_weight = paddle.reshape(IA_weight, [n, c, 1, 1])
+
+ IA_bias = paddle.reshape(IA_bias, [-1])
+ logit = paddle.reshape(
+ F.conv2d(x, weight=IA_weight, bias=IA_bias, groups=n), [n, 1, h, w])
+ return logit
+
+ def decoder(self, x, low_level_feat, IA_head):
+ x = F.interpolate(x,
+ size=low_level_feat.shape[2:],
+ mode='bicubic',
+ align_corners=True)
+
+ low_level_feat = self.GCT_sc(low_level_feat)
+ low_level_feat = self.conv_sc(low_level_feat)
+ low_level_feat = self.bn_sc(low_level_feat)
+ low_level_feat = self.relu(low_level_feat)
+
+ x = paddle.concat([x, low_level_feat], axis=1)
+ x = self.IA10(x, IA_head)
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+
+ x = self.IA11(x, IA_head)
+ x = self.conv2(x)
+ x = self.bn2(x)
+ x = self.relu(x)
+
+ return x
+
+ def augment_background_logit(self, fg_logit, bg_logit):
+ # We augment the logit of absolute background by using the relative background logit of all the
+ # foreground objects.
+ obj_num = fg_logit.shape[0]
+ pred = fg_logit
+ if obj_num > 1:
+ bg_logit = bg_logit[1:obj_num, :, :, :]
+ aug_bg_logit = paddle.min(bg_logit, axis=0, keepdim=True)
+ pad = paddle.expand(paddle.zeros(aug_bg_logit.shape),
+ [obj_num - 1, -1, -1, -1])
+ aug_bg_logit = paddle.concat([aug_bg_logit, pad], axis=0)
+ pred = pred + aug_bg_logit
+ pred = paddle.transpose(pred, [1, 0, 2, 3])
+ return pred
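+
+
+# Illustrative sketch (toy tensors): how augment_background_logit combines the
+# per-object logits. The default CollaborativeEnsemblerMS construction is only
+# used here to reach the method; the fg/bg shapes are hypothetical.
+def _augment_background_sketch():
+    head = CollaborativeEnsemblerMS()
+    fg = paddle.randn([3, 1, 8, 8])  # absolute background + two objects
+    bg = paddle.randn([3, 1, 8, 8])
+    pred = head.augment_background_logit(fg, bg)
+    # only the first object-axis entry is augmented, then the result is
+    # transposed to [1, obj_num, H, W] = [1, 3, 8, 8]
+    return pred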
diff --git a/paddlevideo/modeling/heads/ctrgcn_head.py b/paddlevideo/modeling/heads/ctrgcn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..c551d0d3ea9e8bde7f33b90079f6e94ffcb5a433
--- /dev/null
+++ b/paddlevideo/modeling/heads/ctrgcn_head.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import paddle
+import paddle.nn as nn
+
+from .base import BaseHead
+from ..registry import HEADS
+from ..weight_init import weight_init_
+
+
+@HEADS.register()
+class CTRGCNHead(BaseHead):
+ """
+ Head for CTR-GCN model.
+ Args:
+ in_channels: int, input feature channels. Default: 64.
+ num_classes: int, number of output classes. Default: 10.
+ drop_out: float, dropout ratio of layer. Default: 0.
+ """
+
+ def __init__(self, in_channels=64, num_classes=10, drop_out=0, **kwargs):
+ super().__init__(num_classes, in_channels, **kwargs)
+ self.in_channels = in_channels
+ self.drop_out = drop_out
+
+ self.fc = nn.Linear(self.in_channels * 4, self.num_classes)
+ if drop_out:
+ self.drop_out = nn.Dropout(self.drop_out)
+ else:
+ self.drop_out = lambda x: x
+
+ def init_weights(self):
+ """Initiate the parameters.
+ """
+ for layer in self.sublayers():
+ if isinstance(layer, nn.Conv2D):
+ weight_init_(layer.weight,
+ 'Normal',
+ mean=0.0,
+ std=math.sqrt(2. / self.num_classes))
+
+ def forward(self, output_patch):
+ """Define how the head is going to run.
+ """
+ x, N, M = output_patch
+ # N*M,C,T,V
+ _, c_new, T, V = x.shape
+ x = paddle.reshape(x, shape=[N, M, c_new, T * V])
+ x = x.mean(3).mean(1)
+ x = self.drop_out(x)
+
+ return self.fc(x)
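+
+
+# Illustrative sketch (hypothetical sizes): CTRGCNHead expects the backbone
+# feature with C = in_channels * 4 together with the batch size N and the
+# number of persons M, pools over joints/frames/persons, then classifies.
+def _ctrgcn_head_sketch():
+    head = CTRGCNHead(in_channels=64, num_classes=10)
+    N, M, T, V = 2, 2, 16, 25  # batch, persons, frames, joints
+    feat = paddle.randn([N * M, 64 * 4, T, V])
+    scores = head((feat, N, M))  # -> [N, num_classes] = [2, 10]
+    return scores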
diff --git a/paddlevideo/modeling/heads/i3d_head.py b/paddlevideo/modeling/heads/i3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..269c8184e46952292be4a849597afa9d24e723e7
--- /dev/null
+++ b/paddlevideo/modeling/heads/i3d_head.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+
+from ..registry import HEADS
+from ..weight_init import weight_init_
+from .base import BaseHead
+
+
+@HEADS.register()
+class I3DHead(BaseHead):
+ """Classification head for I3D.
+
+ Args:
+ num_classes (int): Number of classes to be classified.
+ in_channels (int): Number of channels in input feature.
+ loss_cfg (dict): Config for building loss.
+ Default: dict(name='CrossEntropyLoss')
+ spatial_type (str): Pooling type in spatial dimension. Default: 'avg'.
+ drop_ratio (float): Probability of dropout layer. Default: 0.5.
+ std (float): Std value for weight initialization. Default: 0.01.
+ kwargs (dict, optional): Any keyword argument to be used to initialize
+ the head.
+ """
+ def __init__(self,
+ num_classes,
+ in_channels,
+ loss_cfg=dict(name='CrossEntropyLoss'),
+ spatial_type='avg',
+ drop_ratio=0.5,
+ std=0.01,
+ **kwargs):
+
+ super().__init__(num_classes, in_channels, loss_cfg, **kwargs)
+
+ self.spatial_type = spatial_type
+ self.drop_ratio = drop_ratio
+ self.stdv = std
+ if self.drop_ratio != 0:
+ self.dropout = nn.Dropout(p=self.drop_ratio)
+ else:
+ self.dropout = None
+ self.fc = nn.Linear(
+ self.in_channels,
+ self.num_classes,
+ weight_attr=ParamAttr(learning_rate=10.0),
+ bias_attr=ParamAttr(learning_rate=10.0),
+ )
+
+ if self.spatial_type == 'avg':
+ # use `nn.AdaptiveAvgPool3D` to pool the feature map to [N, C, 1, 1, 1].
+ self.avg_pool = nn.AdaptiveAvgPool3D((1, 1, 1))
+ else:
+ self.avg_pool = None
+
+ def init_weights(self):
+ """Initiate the parameters from scratch."""
+ weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv)
+
+ def forward(self, x):
+ """Defines the computation performed at every call.
+
+ Args:
+ x (paddle.Tensor): The input data.
+
+ Returns:
+ paddle.Tensor: The classification scores for input samples.
+ """
+ # [N, in_channels, 4, 7, 7]
+ if self.avg_pool is not None:
+ x = self.avg_pool(x)
+ # [N, in_channels, 1, 1, 1]
+ if self.dropout is not None:
+ x = self.dropout(x)
+ # [N, in_channels, 1, 1, 1]
+ N = paddle.shape(x)[0]
+ x = x.reshape([N, -1])
+ # [N, in_channels]
+ cls_score = self.fc(x)
+ # [N, num_classes]
+ return cls_score
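+
+
+# Illustrative sketch (hypothetical sizes): I3DHead pools a 5D backbone feature
+# down to [N, in_channels] before the final projection.
+def _i3d_head_sketch():
+    head = I3DHead(num_classes=400, in_channels=2048)
+    feat = paddle.randn([2, 2048, 4, 7, 7])  # [N, C, T, H, W]
+    scores = head(feat)  # -> [2, 400]
+    return scores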
diff --git a/paddlevideo/modeling/heads/movinet_head.py b/paddlevideo/modeling/heads/movinet_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7db42e807fa0064fa8582a9b256e0996fbe85b0
--- /dev/null
+++ b/paddlevideo/modeling/heads/movinet_head.py
@@ -0,0 +1,16 @@
+import collections.abc
+
+container_abcs = collections.abc
+from ..registry import HEADS
+from .base import BaseHead
+from ..builder import build_loss
+
+
+@HEADS.register()
+class MoViNetHead(BaseHead):
+
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, x):
+ return x
diff --git a/paddlevideo/modeling/heads/ms_tcn_head.py b/paddlevideo/modeling/heads/ms_tcn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0f435f2a5c2e14ab1747fa8d9d81b144ccf01e4
--- /dev/null
+++ b/paddlevideo/modeling/heads/ms_tcn_head.py
@@ -0,0 +1,165 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+
+from paddle import ParamAttr
+
+from .base import BaseHead
+from ..registry import HEADS
+from ..weight_init import weight_init_
+
+
+@HEADS.register()
+class MSTCNHead(BaseHead):
+
+ def __init__(self, num_classes, in_channels):
+ super().__init__(num_classes, in_channels)
+ self.ce = nn.CrossEntropyLoss(ignore_index=-100)
+ self.mse = nn.MSELoss(reduction='none')
+ self.num_classes = num_classes
+
+ # cls score
+ self.overlap = 0.5
+
+ def forward(self, x):
+ """MS-TCN no head
+ """
+ return x
+
+ def loss(self, output, video_gt):
+ """calculate loss
+ """
+ output_transpose = paddle.transpose(output, [2, 0, 1])
+ ce_x = paddle.reshape(output_transpose,
+ (output_transpose.shape[0] *
+ output_transpose.shape[1], self.num_classes))
+ ce_y = video_gt[0, :]
+ ce_loss = self.ce(ce_x, ce_y)
+ loss = ce_loss
+
+ mse = self.mse(F.log_softmax(output[:, :, 1:], axis=1),
+ F.log_softmax(output.detach()[:, :, :-1], axis=1))
+ mse = paddle.clip(mse, min=0, max=16)
+ mse_loss = 0.15 * paddle.mean(mse)
+ loss += mse_loss
+
+ return loss
+
+ def get_F1_score(self, predicted, groundTruth):
+ recog_content = list(predicted.numpy())
+ gt_content = list(groundTruth[0].numpy())
+
+ # cls score
+ correct = 0
+ total = 0
+ edit = 0
+
+ for i in range(len(gt_content)):
+ total += 1
+
+ if gt_content[i] == recog_content[i]:
+ correct += 1
+
+ edit_num = self.edit_score(recog_content, gt_content)
+ edit += edit_num
+
+ tp, fp, fn = self.f_score(recog_content, gt_content, self.overlap)
+
+ # cls metric
+
+ precision = tp / float(tp + fp)
+ recall = tp / float(tp + fn)
+
+ if precision + recall > 0.0:
+ f1 = 2.0 * (precision * recall) / (precision + recall)
+ else:
+ f1 = 0.0
+ f1 = np.nan_to_num(f1)
+ return f1
+
+ def get_labels_start_end_time(self, frame_wise_labels):
+ labels = []
+ starts = []
+ ends = []
+ last_label = frame_wise_labels[0]
+ labels.append(frame_wise_labels[0])
+ starts.append(0)
+ for i in range(len(frame_wise_labels)):
+ if frame_wise_labels[i] != last_label:
+ labels.append(frame_wise_labels[i])
+ starts.append(i)
+ ends.append(i)
+ last_label = frame_wise_labels[i]
+ ends.append(i + 1)
+ return labels, starts, ends
+
+ def levenstein(self, p, y, norm=False):
+ m_row = len(p)
+ n_col = len(y)
+ D = np.zeros([m_row + 1, n_col + 1], np.float64)
+ for i in range(m_row + 1):
+ D[i, 0] = i
+ for i in range(n_col + 1):
+ D[0, i] = i
+
+ for j in range(1, n_col + 1):
+ for i in range(1, m_row + 1):
+ if y[j - 1] == p[i - 1]:
+ D[i, j] = D[i - 1, j - 1]
+ else:
+ D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1,
+ D[i - 1, j - 1] + 1)
+
+ if norm:
+ score = (1 - D[-1, -1] / max(m_row, n_col)) * 100
+ else:
+ score = D[-1, -1]
+
+ return score
+
+ def edit_score(self, recognized, ground_truth, norm=True):
+ P, _, _ = self.get_labels_start_end_time(recognized)
+ Y, _, _ = self.get_labels_start_end_time(ground_truth)
+ return self.levenstein(P, Y, norm)
+
+ def f_score(self, recognized, ground_truth, overlap):
+ p_label, p_start, p_end = self.get_labels_start_end_time(recognized)
+ y_label, y_start, y_end = self.get_labels_start_end_time(ground_truth)
+
+ tp = 0
+ fp = 0
+
+ hits = np.zeros(len(y_label))
+
+ for j in range(len(p_label)):
+ intersection = np.minimum(p_end[j], y_end) - np.maximum(
+ p_start[j], y_start)
+ union = np.maximum(p_end[j], y_end) - np.minimum(
+ p_start[j], y_start)
+ IoU = (1.0 * intersection / union) * (
+ [p_label[j] == y_label[x] for x in range(len(y_label))])
+ # Get the best scoring segment
+ idx = np.array(IoU).argmax()
+
+ if IoU[idx] >= overlap and not hits[idx]:
+ tp += 1
+ hits[idx] = 1
+ else:
+ fp += 1
+ fn = len(y_label) - sum(hits)
+ return float(tp), float(fp), float(fn)
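+
+
+# Illustrative sketch (toy inputs): loss() combines frame-wise cross entropy
+# with the truncated MSE smoothing term, while get_F1_score() compares
+# frame-wise label sequences at a segment IoU threshold of 0.5.
+def _mstcn_head_sketch():
+    head = MSTCNHead(num_classes=4, in_channels=64)
+
+    logits = paddle.randn([1, 4, 12])  # [N=1, num_classes, T]
+    gt = paddle.randint(0, 4, shape=[1, 12])  # [1, T] frame-wise labels
+    loss = head.loss(logits, gt)
+
+    pred_labels = paddle.to_tensor([0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3])
+    f1 = head.get_F1_score(pred_labels, gt)
+    return loss, f1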
diff --git a/paddlevideo/modeling/heads/ops.py b/paddlevideo/modeling/heads/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c357fa707836c56fdc51cf008df2ca919e05468
--- /dev/null
+++ b/paddlevideo/modeling/heads/ops.py
@@ -0,0 +1,1583 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+import paddle.nn as nn
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+
+from paddle.fluid.framework import Variable, in_dygraph_mode
+from paddle.fluid import core
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.dygraph import layers
+from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
+import math
+import six
+import numpy as np
+from functools import reduce
+
+__all__ = [
+ 'roi_pool',
+ 'roi_align',
+ 'prior_box',
+ 'generate_proposals',
+ 'iou_similarity',
+ 'box_coder',
+ 'yolo_box',
+ 'multiclass_nms',
+ 'distribute_fpn_proposals',
+ 'collect_fpn_proposals',
+ 'matrix_nms',
+ 'batch_norm',
+ 'mish',
+]
+
+
+def mish(x):
+ return x * paddle.tanh(F.softplus(x))
+
+
+def batch_norm(ch,
+ norm_type='bn',
+ norm_decay=0.,
+ initializer=None,
+ data_format='NCHW'):
+ if norm_type == 'sync_bn':
+ batch_norm = nn.SyncBatchNorm
+ else:
+ batch_norm = nn.BatchNorm2D
+
+ return batch_norm(
+ ch,
+ weight_attr=ParamAttr(
+ initializer=initializer, regularizer=L2Decay(norm_decay)),
+ bias_attr=ParamAttr(regularizer=L2Decay(norm_decay)),
+ data_format=data_format)
+
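+# Illustrative sketch: mish() is an element-wise activation, while batch_norm()
+# builds a (Sync)BatchNorm layer with L2Decay-regularized scale and bias for a
+# given channel count.
+def _ops_helpers_sketch():
+    act = mish(paddle.randn([2, 8, 4, 4]))
+    bn = batch_norm(8, norm_type='bn', norm_decay=0.)
+    return bn(act)  # same shape as the input, [2, 8, 4, 4]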
+
+@paddle.jit.not_to_static
+def roi_pool(input,
+ rois,
+ output_size,
+ spatial_scale=1.0,
+ rois_num=None,
+ name=None):
+ """
+
+ This operator implements the roi_pooling layer.
+ Region of interest pooling (also known as RoI pooling) is to perform max pooling on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7).
+
+ The operator has three steps:
+
+ 1. Dividing each region proposal into equal-sized sections with output_size(h, w);
+ 2. Finding the largest value in each section;
+ 3. Copying these max values to the output buffer.
+
+ For more information, please refer to https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
+
+ Args:
+ input (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W],
+ where N is the batch size, C is the input channel, H is height, W is width.
+ The data type is float32 or float64.
+ rois (Tensor): ROIs (Regions of Interest) to pool over.
+ 2D-Tensor or 2D-LoDTensor with the shape of [num_rois,4], the lod level is 1.
+ Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates,
+ and (x2, y2) is the bottom right coordinates.
+ output_size (int or tuple[int, int]): The pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size.
+ spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0
+ rois_num (Tensor): The number of RoIs in each image. Default: None
+ name(str, optional): For detailed information, please refer
+ to :ref:`api_guide_Name`. Usually name is no need to set and
+ None by default.
+
+
+ Returns:
+ Tensor: The pooled feature, 4D-Tensor with the shape of [num_rois, C, output_size[0], output_size[1]].
+
+
+ Examples:
+
+ .. code-block:: python
+
+ import paddle
+ from ppdet.modeling import ops
+ paddle.enable_static()
+
+ x = paddle.static.data(
+ name='data', shape=[None, 256, 32, 32], dtype='float32')
+ rois = paddle.static.data(
+ name='rois', shape=[None, 4], dtype='float32')
+ rois_num = paddle.static.data(name='rois_num', shape=[None], dtype='int32')
+
+ pool_out = ops.roi_pool(
+ input=x,
+ rois=rois,
+ output_size=(1, 1),
+ spatial_scale=1.0,
+ rois_num=rois_num)
+ """
+ check_type(output_size, 'output_size', (int, tuple), 'roi_pool')
+ if isinstance(output_size, int):
+ output_size = (output_size, output_size)
+
+ pooled_height, pooled_width = output_size
+ if in_dygraph_mode():
+ assert rois_num is not None, "rois_num should not be None in dygraph mode."
+ pool_out, argmaxes = core.ops.roi_pool(
+ input, rois, rois_num, "pooled_height", pooled_height,
+ "pooled_width", pooled_width, "spatial_scale", spatial_scale)
+ return pool_out, argmaxes
+
+ else:
+ check_variable_and_dtype(input, 'input', ['float32'], 'roi_pool')
+ check_variable_and_dtype(rois, 'rois', ['float32'], 'roi_pool')
+ helper = LayerHelper('roi_pool', **locals())
+ dtype = helper.input_dtype()
+ pool_out = helper.create_variable_for_type_inference(dtype)
+ argmaxes = helper.create_variable_for_type_inference(dtype='int32')
+
+ inputs = {
+ "X": input,
+ "ROIs": rois,
+ }
+ if rois_num is not None:
+ inputs['RoisNum'] = rois_num
+ helper.append_op(
+ type="roi_pool",
+ inputs=inputs,
+ outputs={"Out": pool_out,
+ "Argmax": argmaxes},
+ attrs={
+ "pooled_height": pooled_height,
+ "pooled_width": pooled_width,
+ "spatial_scale": spatial_scale
+ })
+ return pool_out, argmaxes
+
+
+@paddle.jit.not_to_static
+def roi_align(input,
+ rois,
+ output_size,
+ spatial_scale=1.0,
+ sampling_ratio=-1,
+ rois_num=None,
+ aligned=True,
+ name=None):
+ """
+
+ Region of interest align (also known as RoI align) is to perform
+ bilinear interpolation on inputs of nonuniform sizes to obtain
+ fixed-size feature maps (e.g. 7*7)
+
+ Dividing each region proposal into equal-sized sections with
+ the pooled_width and pooled_height. Location remains the origin
+ result.
+
+ In each ROI bin, the value of the four regularly sampled locations
+ are computed directly through bilinear interpolation. The output is
+ the mean of four locations.
+ Thus avoid the misaligned problem.
+
+ Args:
+ input (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W],
+ where N is the batch size, C is the input channel, H is height, W is width.
+ The data type is float32 or float64.
+ rois (Tensor): ROIs (Regions of Interest) to pool over.It should be
+ a 2-D Tensor or 2-D LoDTensor of shape (num_rois, 4), the lod level is 1.
+ The data type is float32 or float64. Given as [[x1, y1, x2, y2], ...],
+ (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates.
+ output_size (int or tuple[int, int]): The pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size.
+ spatial_scale (float32, optional): Multiplicative spatial scale factor to translate ROI coords
+ from their input scale to the scale used when pooling. Default: 1.0
+ sampling_ratio(int32, optional): number of sampling points in the interpolation grid.
+ If <=0, then grid points are adaptive to roi_width and pooled_w, likewise for height. Default: -1
+ rois_num (Tensor): The number of RoIs in each image. Default: None
+ name(str, optional): For detailed information, please refer
+ to :ref:`api_guide_Name`. Usually name is no need to set and
+ None by default.
+
+ Returns:
+ Tensor:
+
+ Output: The output of ROIAlignOp is a 4-D tensor with shape (num_rois, channels, pooled_h, pooled_w). The data type is float32 or float64.
+
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ from ppdet.modeling import ops
+ paddle.enable_static()
+
+ x = paddle.static.data(
+ name='data', shape=[None, 256, 32, 32], dtype='float32')
+ rois = paddle.static.data(
+ name='rois', shape=[None, 4], dtype='float32')
+ rois_num = paddle.static.data(name='rois_num', shape=[None], dtype='int32')
+ align_out = ops.roi_align(input=x,
+ rois=rois,
+ output_size=(7, 7),
+ spatial_scale=0.5,
+ sampling_ratio=-1,
+ rois_num=rois_num)
+ """
+ check_type(output_size, 'output_size', (int, tuple), 'roi_align')
+ if isinstance(output_size, int):
+ output_size = (output_size, output_size)
+
+ pooled_height, pooled_width = output_size
+
+ if in_dygraph_mode():
+ assert rois_num is not None, "rois_num should not be None in dygraph mode."
+ align_out = core.ops.roi_align(
+ input, rois, rois_num, "pooled_height", pooled_height,
+ "pooled_width", pooled_width, "spatial_scale", spatial_scale,
+ "sampling_ratio", sampling_ratio, "aligned", aligned)
+ return align_out
+
+ else:
+ check_variable_and_dtype(input, 'input', ['float32', 'float64'],
+ 'roi_align')
+ check_variable_and_dtype(rois, 'rois', ['float32', 'float64'],
+ 'roi_align')
+ helper = LayerHelper('roi_align', **locals())
+ dtype = helper.input_dtype()
+ align_out = helper.create_variable_for_type_inference(dtype)
+ inputs = {
+ "X": input,
+ "ROIs": rois,
+ }
+ if rois_num is not None:
+ inputs['RoisNum'] = rois_num
+ helper.append_op(
+ type="roi_align",
+ inputs=inputs,
+ outputs={"Out": align_out},
+ attrs={
+ "pooled_height": pooled_height,
+ "pooled_width": pooled_width,
+ "spatial_scale": spatial_scale,
+ "sampling_ratio": sampling_ratio,
+ "aligned": aligned,
+ })
+ return align_out
+
+
+@paddle.jit.not_to_static
+def iou_similarity(x, y, box_normalized=True, name=None):
+ """
+ Computes intersection-over-union (IOU) between two box lists.
+ Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
+ boxes in 'Y' are shared by all instance of the batched inputs of X.
+ Given two boxes A and B, the calculation of IOU is as follows:
+
+ $$
+ IOU(A, B) =
+ \\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)}
+ $$
+
+ Args:
+ x (Tensor): Box list X is a 2-D Tensor with shape [N, 4] holds N
+ boxes, each box is represented as [xmin, ymin, xmax, ymax],
+ the shape of X is [N, 4]. [xmin, ymin] is the left top
+ coordinate of the box if the input is image feature map, they
+ are close to the origin of the coordinate system.
+ [xmax, ymax] is the right bottom coordinate of the box.
+ The data type is float32 or float64.
+ y (Tensor): Box list Y holds M boxes, each box is represented as
+ [xmin, ymin, xmax, ymax], the shape of Y is [M, 4].
+ [xmin, ymin] is the left top coordinate of the box if the
+ input is image feature map, and [xmax, ymax] is the right
+ bottom coordinate of the box. The data type is float32 or float64.
+ box_normalized(bool): Whether treat the priorbox as a normalized box.
+ Set true by default.
+ name(str, optional): For detailed information, please refer
+ to :ref:`api_guide_Name`. Usually name is no need to set and
+ None by default.
+
+ Returns:
+ Tensor: The output of iou_similarity op, a tensor with shape [N, M]
+ representing pairwise iou scores. The data type is same with x.
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ from ppdet.modeling import ops
+ paddle.enable_static()
+
+ x = paddle.static.data(name='x', shape=[None, 4], dtype='float32')
+ y = paddle.static.data(name='y', shape=[None, 4], dtype='float32')
+ iou = ops.iou_similarity(x=x, y=y)
+ """
+
+ if in_dygraph_mode():
+ out = core.ops.iou_similarity(x, y, 'box_normalized', box_normalized)
+ return out
+ else:
+ helper = LayerHelper("iou_similarity", **locals())
+ out = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+ helper.append_op(
+ type="iou_similarity",
+ inputs={"X": x,
+ "Y": y},
+ attrs={"box_normalized": box_normalized},
+ outputs={"Out": out})
+ return out
+
+
+@paddle.jit.not_to_static
+def collect_fpn_proposals(multi_rois,
+ multi_scores,
+ min_level,
+ max_level,
+ post_nms_top_n,
+ rois_num_per_level=None,
+ name=None):
+ """
+
+ **This OP only supports LoDTensor as input**. Concat multi-level RoIs
+ (Region of Interest) and select N RoIs with respect to multi_scores.
+ This operation performs the following steps:
+
+ 1. Choose num_level RoIs and scores as input: num_level = max_level - min_level
+ 2. Concat multi-level RoIs and scores
+ 3. Sort scores and select post_nms_top_n scores
+ 4. Gather RoIs by selected indices from scores
+ 5. Re-sort RoIs by corresponding batch_id
+
+ Args:
+ multi_rois(list): List of RoIs to collect. Element in list is 2-D
+ LoDTensor with shape [N, 4] and data type is float32 or float64,
+ N is the number of RoIs.
+ multi_scores(list): List of scores of RoIs to collect. Element in list
+ is 2-D LoDTensor with shape [N, 1] and data type is float32 or
+ float64, N is the number of RoIs.
+ min_level(int): The lowest level of FPN layer to collect
+ max_level(int): The highest level of FPN layer to collect
+ post_nms_top_n(int): The number of selected RoIs
+ rois_num_per_level(list, optional): The List of RoIs' numbers.
+ Each element is 1-D Tensor which contains the RoIs' number of each
+ image on each level and the shape is [B] and data type is
+ int32, B is the number of images. If it is not None then return
+ a 1-D Tensor contains the output RoIs' number of each image and
+ the shape is [B]. Default: None
+ name(str, optional): For detailed information, please refer
+ to :ref:`api_guide_Name`. Usually name is no need to set and
+ None by default.
+
+ Returns:
+ Variable:
+
+ fpn_rois(Variable): 2-D LoDTensor with shape [N, 4] and data type is
+ float32 or float64. Selected RoIs.
+
+ rois_num(Tensor): 1-D Tensor contains the RoIs's number of each
+ image. The shape is [B] and data type is int32. B is the number of
+ images.
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ from ppdet.modeling import ops
+ paddle.enable_static()
+ multi_rois = []
+ multi_scores = []
+ for i in range(4):
+ multi_rois.append(paddle.static.data(
+ name='roi_'+str(i), shape=[None, 4], dtype='float32', lod_level=1))
+ for i in range(4):
+ multi_scores.append(paddle.static.data(
+ name='score_'+str(i), shape=[None, 1], dtype='float32', lod_level=1))
+
+ fpn_rois = ops.collect_fpn_proposals(
+ multi_rois=multi_rois,
+ multi_scores=multi_scores,
+ min_level=2,
+ max_level=5,
+ post_nms_top_n=2000)
+ """
+ check_type(multi_rois, 'multi_rois', list, 'collect_fpn_proposals')
+ check_type(multi_scores, 'multi_scores', list, 'collect_fpn_proposals')
+ num_lvl = max_level - min_level + 1
+ input_rois = multi_rois[:num_lvl]
+ input_scores = multi_scores[:num_lvl]
+
+ if in_dygraph_mode():
+ assert rois_num_per_level is not None, "rois_num_per_level should not be None in dygraph mode."
+ attrs = ('post_nms_topN', post_nms_top_n)
+ output_rois, rois_num = core.ops.collect_fpn_proposals(
+ input_rois, input_scores, rois_num_per_level, *attrs)
+ return output_rois, rois_num
+
+ else:
+ helper = LayerHelper('collect_fpn_proposals', **locals())
+ dtype = helper.input_dtype('multi_rois')
+ check_dtype(dtype, 'multi_rois', ['float32', 'float64'],
+ 'collect_fpn_proposals')
+ output_rois = helper.create_variable_for_type_inference(dtype)
+ output_rois.stop_gradient = True
+
+ inputs = {
+ 'MultiLevelRois': input_rois,
+ 'MultiLevelScores': input_scores,
+ }
+ outputs = {'FpnRois': output_rois}
+ if rois_num_per_level is not None:
+ inputs['MultiLevelRoIsNum'] = rois_num_per_level
+ rois_num = helper.create_variable_for_type_inference(dtype='int32')
+ rois_num.stop_gradient = True
+ outputs['RoisNum'] = rois_num
+ helper.append_op(
+ type='collect_fpn_proposals',
+ inputs=inputs,
+ outputs=outputs,
+ attrs={'post_nms_topN': post_nms_top_n})
+ return output_rois, rois_num
+
+
+@paddle.jit.not_to_static
+def distribute_fpn_proposals(fpn_rois,
+ min_level,
+ max_level,
+ refer_level,
+ refer_scale,
+ pixel_offset=False,
+ rois_num=None,
+ name=None):
+ """
+
+ **This op only takes LoDTensor as input.** In Feature Pyramid Networks
+ (FPN) models, it is needed to distribute all proposals into different FPN
+ level, with respect to scale of the proposals, the referring scale and the
+ referring level. Besides, to restore the order of proposals, we return an
+ array which indicates the original index of rois in current proposals.
+ To compute FPN level for each roi, the formula is given as follows:
+
+ .. math::
+
+ roi\_scale &= \sqrt{BBoxArea(fpn\_roi)}
+
+ level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level)
+
+ where BBoxArea is a function to compute the area of each roi.
+
+ Args:
+
+ fpn_rois(Variable): 2-D Tensor with shape [N, 4] and data type is
+ float32 or float64. The input fpn_rois.
+ min_level(int32): The lowest level of FPN layer where the proposals come
+ from.
+ max_level(int32): The highest level of FPN layer where the proposals
+ come from.
+ refer_level(int32): The referring level of FPN layer with specified scale.
+ refer_scale(int32): The referring scale of FPN layer with specified level.
+ rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image.
+ The shape is [B] and data type is int32. B is the number of images.
+ If it is not None then return a list of 1-D Tensor. Each element
+ is the output RoIs' number of each image on the corresponding level
+ and the shape is [B]. None by default.
+ name(str, optional): For detailed information, please refer
+ to :ref:`api_guide_Name`. Usually name is no need to set and
+ None by default.
+
+ Returns:
+ Tuple:
+
+ multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4]
+ and data type of float32 and float64. The length is
+ max_level-min_level+1. The proposals in each FPN level.
+
+ restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is
+ the number of total rois. The data type is int32. It is
+ used to restore the order of fpn_rois.
+
+ rois_num_per_level(List): A list of 1-D Tensor and each Tensor is
+ the RoIs' number in each image on the corresponding level. The shape
+ is [B] and data type of int32. B is the number of images
+
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ from ppdet.modeling import ops
+ paddle.enable_static()
+ fpn_rois = paddle.static.data(
+ name='data', shape=[None, 4], dtype='float32', lod_level=1)
+ multi_rois, restore_ind = ops.distribute_fpn_proposals(
+ fpn_rois=fpn_rois,
+ min_level=2,
+ max_level=5,
+ refer_level=4,
+ refer_scale=224)
+ """
+ num_lvl = max_level - min_level + 1
+
+ if in_dygraph_mode():
+ assert rois_num is not None, "rois_num should not be None in dygraph mode."
+ attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level',
+ refer_level, 'refer_scale', refer_scale, 'pixel_offset',
+ pixel_offset)
+ multi_rois, restore_ind, rois_num_per_level = core.ops.distribute_fpn_proposals(
+ fpn_rois, rois_num, num_lvl, num_lvl, *attrs)
+ return multi_rois, restore_ind, rois_num_per_level
+
+ else:
+ check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'],
+ 'distribute_fpn_proposals')
+ helper = LayerHelper('distribute_fpn_proposals', **locals())
+ dtype = helper.input_dtype('fpn_rois')
+ multi_rois = [
+ helper.create_variable_for_type_inference(dtype)
+ for i in range(num_lvl)
+ ]
+
+ restore_ind = helper.create_variable_for_type_inference(dtype='int32')
+
+ inputs = {'FpnRois': fpn_rois}
+ outputs = {
+ 'MultiFpnRois': multi_rois,
+ 'RestoreIndex': restore_ind,
+ }
+
+ if rois_num is not None:
+ inputs['RoisNum'] = rois_num
+ rois_num_per_level = [
+ helper.create_variable_for_type_inference(dtype='int32')
+ for i in range(num_lvl)
+ ]
+ outputs['MultiLevelRoIsNum'] = rois_num_per_level
+
+ helper.append_op(
+ type='distribute_fpn_proposals',
+ inputs=inputs,
+ outputs=outputs,
+ attrs={
+ 'min_level': min_level,
+ 'max_level': max_level,
+ 'refer_level': refer_level,
+ 'refer_scale': refer_scale,
+ 'pixel_offset': pixel_offset
+ })
+ return multi_rois, restore_ind, rois_num_per_level
+
+
+@paddle.jit.not_to_static
+def yolo_box(
+ x,
+ origin_shape,
+ anchors,
+ class_num,
+ conf_thresh,
+ downsample_ratio,
+ clip_bbox=True,
+ scale_x_y=1.,
+ name=None, ):
+ """
+
+ This operator generates YOLO detection boxes from output of YOLOv3 network.
+
+ The output of previous network is in shape [N, C, H, W], while H and W
+ should be the same, H and W specify the grid size, each grid point predict
+ given number boxes, this given number, which following will be represented as S,
+ is specified by the number of anchors. In the second dimension(the channel
+ dimension), C should be equal to S * (5 + class_num), class_num is the object
+ category number of source dataset(such as 80 in coco dataset), so the
+ second(channel) dimension, apart from 4 box location coordinates x, y, w, h,
+ also includes confidence score of the box and class one-hot key of each anchor
+ box.
+ Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box
+ predictions should be as follows:
+ $$
+ b_x = \\sigma(t_x) + c_x
+ $$
+ $$
+ b_y = \\sigma(t_y) + c_y
+ $$
+ $$
+ b_w = p_w e^{t_w}
+ $$
+ $$
+ b_h = p_h e^{t_h}
+ $$
+ in the equation above, :math:`c_x, c_y` is the left top corner of current grid
+ and :math:`p_w, p_h` is specified by anchors.
+ The logistic regression value of the 5th channel of each anchor prediction boxes
+ represents the confidence score of each prediction box, and the logistic
+ regression value of the last :attr:`class_num` channels of each anchor prediction
+ boxes represents the classification scores. Boxes with confidence scores less than
+ :attr:`conf_thresh` should be ignored, and box final scores is the product of
+ confidence scores and classification scores.
+ $$
+ score_{pred} = score_{conf} * score_{class}
+ $$
+
+ Args:
+ x (Tensor): The input tensor of YoloBox operator is a 4-D tensor with shape of [N, C, H, W].
+ The second dimension(C) stores box locations, confidence score and
+ classification one-hot keys of each anchor box. Generally, X should be the output of YOLOv3 network.
+ The data type is float32 or float64.
+ origin_shape (Tensor): The image size tensor of YoloBox operator, This is a 2-D tensor with shape of [N, 2].
+ This tensor holds height and width of each input image used for resizing output box in input image
+ scale. The data type is int32.
+ anchors (list|tuple): The anchor width and height, it will be parsed pair by pair.
+ class_num (int): The number of classes to predict.
+ conf_thresh (float): The confidence scores threshold of detection boxes. Boxes with confidence scores
+ under threshold should be ignored.
+ downsample_ratio (int): The downsample ratio from network input to YoloBox operator input,
+ so 32, 16, 8 should be set for the first, second, and third YoloBox operators.
+ clip_bbox (bool): Whether to clip the output bounding box within the Input(ImgSize) boundary. Default: True.
+ scale_x_y (float): Scale the center point of decoded bounding box. Default 1.0.
+ name (string): The default value is None. Normally there is no need
+ for user to set this property. For more information,
+ please refer to :ref:`api_guide_Name`
+
+ Returns:
+ boxes Tensor: A 3-D tensor with shape [N, M, 4], the coordinates of boxes, N is the batch num,
+ M is output box number, and the 3rd dimension stores [xmin, ymin, xmax, ymax] coordinates of boxes.
+ scores Tensor: A 3-D tensor with shape [N, M, :attr:`class_num`], the classification scores of boxes, N is the batch num,
+ M is output box number.
+
+ Raises:
+ TypeError: Attr anchors of yolo box must be list or tuple
+ TypeError: Attr class_num of yolo box must be an integer
+ TypeError: Attr conf_thresh of yolo box must be a float number
+
+ Examples:
+
+ .. code-block:: python
+
+ import paddle
+ from ppdet.modeling import ops
+
+ paddle.enable_static()
+ x = paddle.static.data(name='x', shape=[None, 255, 13, 13], dtype='float32')
+ img_size = paddle.static.data(name='img_size',shape=[None, 2],dtype='int64')
+ anchors = [10, 13, 16, 30, 33, 23]
+ boxes, scores = ops.yolo_box(x=x, origin_shape=img_size, class_num=80, anchors=anchors,
+ conf_thresh=0.01, downsample_ratio=32)
+ """
+ helper = LayerHelper('yolo_box', **locals())
+
+ if not isinstance(anchors, list) and not isinstance(anchors, tuple):
+ raise TypeError("Attr anchors of yolo_box must be list or tuple")
+ if not isinstance(class_num, int):
+ raise TypeError("Attr class_num of yolo_box must be an integer")
+ if not isinstance(conf_thresh, float):
+ raise TypeError("Attr ignore_thresh of yolo_box must be a float number")
+
+ if in_dygraph_mode():
+ attrs = ('anchors', anchors, 'class_num', class_num, 'conf_thresh',
+ conf_thresh, 'downsample_ratio', downsample_ratio, 'clip_bbox',
+ clip_bbox, 'scale_x_y', scale_x_y)
+ boxes, scores = core.ops.yolo_box(x, origin_shape, *attrs)
+ return boxes, scores
+ else:
+ boxes = helper.create_variable_for_type_inference(dtype=x.dtype)
+ scores = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+ attrs = {
+ "anchors": anchors,
+ "class_num": class_num,
+ "conf_thresh": conf_thresh,
+ "downsample_ratio": downsample_ratio,
+ "clip_bbox": clip_bbox,
+ "scale_x_y": scale_x_y,
+ }
+
+ helper.append_op(
+ type='yolo_box',
+ inputs={
+ "X": x,
+ "ImgSize": origin_shape,
+ },
+ outputs={
+ 'Boxes': boxes,
+ 'Scores': scores,
+ },
+ attrs=attrs)
+ return boxes, scores
+
+
+@paddle.jit.not_to_static
+def prior_box(input,
+ image,
+ min_sizes,
+ max_sizes=None,
+ aspect_ratios=[1.],
+ variance=[0.1, 0.1, 0.2, 0.2],
+ flip=False,
+ clip=False,
+ steps=[0.0, 0.0],
+ offset=0.5,
+ min_max_aspect_ratios_order=False,
+ name=None):
+ """
+
+ This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
+ Each position of the input produce N prior boxes, N is determined by
+ the count of min_sizes, max_sizes and aspect_ratios, The size of the
+ box is in range(min_size, max_size) interval, which is generated in
+ sequence according to the aspect_ratios.
+
+ Parameters:
+ input(Tensor): 4-D tensor(NCHW), the data type should be float32 or float64.
+ image(Tensor): 4-D tensor(NCHW), the input image data of PriorBoxOp,
+ the data type should be float32 or float64.
+ min_sizes(list|tuple|float): the min sizes of generated prior boxes.
+ max_sizes(list|tuple|None): the max sizes of generated prior boxes.
+ Default: None.
+ aspect_ratios(list|tuple|float): the aspect ratios of generated
+ prior boxes. Default: [1.].
+ variance(list|tuple): the variances to be encoded in prior boxes.
+ Default:[0.1, 0.1, 0.2, 0.2].
+ flip(bool): Whether to flip aspect ratios. Default:False.
+ clip(bool): Whether to clip out-of-boundary boxes. Default: False.
+ steps(list|tuple): Prior boxes step across width and height. If
+ steps[0] equals 0.0 or steps[1] equals 0.0, the prior boxes step across
+ height or width of the input will be automatically calculated.
+ Default: [0., 0.]
+ offset(float): Prior boxes center offset. Default: 0.5
+ min_max_aspect_ratios_order(bool): If set True, the output prior box is
+ in order of [min, max, aspect_ratios], which is consistent with
+ Caffe. Please note, this order affects the weights order of
+ convolution layer followed by and does not affect the final
+ detection results. Default: False.
+ name(str, optional): The default value is None. Normally there is no need for
+ user to set this property. For more information, please refer to :ref:`api_guide_Name`
+
+ Returns:
+ Tuple: A tuple with two Variable (boxes, variances)
+
+ boxes(Tensor): the output prior boxes of PriorBox.
+ 4-D tensor, the layout is [H, W, num_priors, 4].
+ H is the height of input, W is the width of input,
+ num_priors is the total box count of each position of input.
+
+ variances(Tensor): the expanded variances of PriorBox.
+ 4-D tensor, the layout is [H, W, num_priors, 4].
+ H is the height of input, W is the width of input
+ num_priors is the total box count of each position of input
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ from ppdet.modeling import ops
+
+ paddle.enable_static()
+ input = paddle.static.data(name="input", shape=[None,3,6,9])
+ image = paddle.static.data(name="image", shape=[None,3,9,12])
+ box, var = ops.prior_box(
+ input=input,
+ image=image,
+ min_sizes=[100.],
+ clip=True,
+ flip=True)
+ """
+ helper = LayerHelper("prior_box", **locals())
+ dtype = helper.input_dtype()
+ check_variable_and_dtype(
+ input, 'input', ['uint8', 'int8', 'float32', 'float64'], 'prior_box')
+
+ def _is_list_or_tuple_(data):
+ return (isinstance(data, list) or isinstance(data, tuple))
+
+ if not _is_list_or_tuple_(min_sizes):
+ min_sizes = [min_sizes]
+ if not _is_list_or_tuple_(aspect_ratios):
+ aspect_ratios = [aspect_ratios]
+ if not (_is_list_or_tuple_(steps) and len(steps) == 2):
+ raise ValueError('steps should be a list or tuple ',
+ 'with length 2, (step_width, step_height).')
+
+ min_sizes = list(map(float, min_sizes))
+ aspect_ratios = list(map(float, aspect_ratios))
+ steps = list(map(float, steps))
+
+ cur_max_sizes = None
+ if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0:
+ if not _is_list_or_tuple_(max_sizes):
+ max_sizes = [max_sizes]
+ cur_max_sizes = max_sizes
+
+ if in_dygraph_mode():
+ attrs = ('min_sizes', min_sizes, 'aspect_ratios', aspect_ratios,
+ 'variances', variance, 'flip', flip, 'clip', clip, 'step_w',
+ steps[0], 'step_h', steps[1], 'offset', offset,
+ 'min_max_aspect_ratios_order', min_max_aspect_ratios_order)
+ if cur_max_sizes is not None:
+ attrs += ('max_sizes', cur_max_sizes)
+ box, var = core.ops.prior_box(input, image, *attrs)
+ return box, var
+ else:
+ attrs = {
+ 'min_sizes': min_sizes,
+ 'aspect_ratios': aspect_ratios,
+ 'variances': variance,
+ 'flip': flip,
+ 'clip': clip,
+ 'step_w': steps[0],
+ 'step_h': steps[1],
+ 'offset': offset,
+ 'min_max_aspect_ratios_order': min_max_aspect_ratios_order
+ }
+
+ if cur_max_sizes is not None:
+ attrs['max_sizes'] = cur_max_sizes
+
+ box = helper.create_variable_for_type_inference(dtype)
+ var = helper.create_variable_for_type_inference(dtype)
+ helper.append_op(
+ type="prior_box",
+ inputs={"Input": input,
+ "Image": image},
+ outputs={"Boxes": box,
+ "Variances": var},
+ attrs=attrs, )
+ box.stop_gradient = True
+ var.stop_gradient = True
+ return box, var
+
+
+@paddle.jit.not_to_static
+def multiclass_nms(bboxes,
+ scores,
+ score_threshold,
+ nms_top_k,
+ keep_top_k,
+ nms_threshold=0.3,
+ normalized=True,
+ nms_eta=1.,
+ background_label=-1,
+ return_index=False,
+ return_rois_num=True,
+ rois_num=None,
+ name=None):
+ """
+ This operator is to do multi-class non maximum suppression (NMS) on
+ boxes and scores.
+ In the NMS step, this operator greedily selects a subset of detection bounding
+ boxes that have high scores larger than score_threshold, if providing this
+ threshold, then selects the largest nms_top_k confidences scores if nms_top_k
+ is larger than -1. Then this operator prunes away boxes that have high IOU
+ (intersection over union) overlap with already selected boxes by adaptive
+ threshold NMS based on parameters of nms_threshold and nms_eta.
+ After the NMS step, at most keep_top_k number of total bboxes are to be kept
+ per image if keep_top_k is larger than -1.
+ Args:
+ bboxes (Tensor): Two types of bboxes are supported:
+ 1. (Tensor) A 3-D Tensor with shape
+ [N, M, 4 or 8 16 24 32] represents the
+ predicted locations of M bounding bboxes,
+ N is the batch size. Each bounding box has four
+ coordinate values and the layout is
+ [xmin, ymin, xmax, ymax], when box size equals to 4.
+ 2. (LoDTensor) A 3-D Tensor with shape [M, C, 4]
+ M is the number of bounding boxes, C is the
+ class number
+ scores (Tensor): Two types of scores are supported:
+ 1. (Tensor) A 3-D Tensor with shape [N, C, M]
+ represents the predicted confidence predictions.
+ N is the batch size, C is the class number, M is
+ number of bounding boxes. For each category there
+ are total M scores which corresponding M bounding
+ boxes. Please note, M is equal to the 2nd dimension
+ of BBoxes.
+ 2. (LoDTensor) A 2-D LoDTensor with shape [M, C].
+ M is the number of bbox, C is the class number.
+ In this case, input BBoxes should be the second
+ case with shape [M, C, 4].
+ background_label (int): The index of background label, the background
+ label will be ignored. If set to -1, then all
+ categories will be considered. Default: -1
+ score_threshold (float): Threshold to filter out bounding boxes with
+ low confidence score. If not provided,
+ consider all boxes.
+ nms_top_k (int): Maximum number of detections to be kept according to
+ the confidences after the filtering detections based
+ on score_threshold.
+ nms_threshold (float): The threshold to be used in NMS. Default: 0.3
+ nms_eta (float): The coefficient for adaptive NMS; when nms_eta < 1.0, nms_threshold is shrunk by this factor as boxes are selected. Default: 1.0
+ keep_top_k (int): Number of total bboxes to be kept per image after NMS
+ step. -1 means keeping all bboxes after NMS step.
+ normalized (bool): Whether detections are normalized. Default: True
+ return_index(bool): Whether return selected index. Default: False
+ rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image.
+ The shape is [B] and data type is int32. B is the number of images.
+ If it is not None then return a list of 1-D Tensor. Each element
+ is the output RoIs' number of each image on the corresponding level
+ and the shape is [B]. None by default.
+ name(str): Name of the multiclass nms op. Default: None.
+ Returns:
+ A tuple with two Variables: (Out, Index) if return_index is True,
+ otherwise, a tuple with one Variable(Out) is returned.
+ Out: A 2-D LoDTensor with shape [No, 6] represents the detections.
+ Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]
+ or A 2-D LoDTensor with shape [No, 10] represents the detections.
+ Each row has 10 values: [label, confidence, x1, y1, x2, y2, x3, y3,
+ x4, y4]. No is the total number of detections.
+ If all images have not detected results, all elements in LoD will be
+ 0, and output tensor is empty (None).
+ Index: Only return when return_index is True. A 2-D LoDTensor with
+ shape [No, 1] represents the selected index which type is Integer.
+ The index is the absolute value cross batches. No is the same number
+ as Out. If the index is used to gather other attribute such as age,
+ one needs to reshape the input(N, M, 1) to (N * M, 1) as first, where
+ N is the batch size and M is the number of boxes.
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ from ppdet.modeling import ops
+ boxes = paddle.static.data(name='bboxes', shape=[81, 4],
+ dtype='float32', lod_level=1)
+ scores = paddle.static.data(name='scores', shape=[81],
+ dtype='float32', lod_level=1)
+ out, index = ops.multiclass_nms(bboxes=boxes,
+ scores=scores,
+ background_label=0,
+ score_threshold=0.5,
+ nms_top_k=400,
+ nms_threshold=0.3,
+ keep_top_k=200,
+ normalized=False,
+ return_index=True)
+ """
+ helper = LayerHelper('multiclass_nms3', **locals())
+
+ if in_dygraph_mode():
+ attrs = ('background_label', background_label, 'score_threshold',
+ score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold',
+ nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta,
+ 'normalized', normalized)
+ output, index, nms_rois_num = core.ops.multiclass_nms3(bboxes, scores,
+ rois_num, *attrs)
+ if not return_index:
+ index = None
+ return output, nms_rois_num, index
+
+ else:
+ output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
+ index = helper.create_variable_for_type_inference(dtype='int')
+
+ inputs = {'BBoxes': bboxes, 'Scores': scores}
+ outputs = {'Out': output, 'Index': index}
+
+ if rois_num is not None:
+ inputs['RoisNum'] = rois_num
+
+ if return_rois_num:
+ nms_rois_num = helper.create_variable_for_type_inference(
+ dtype='int32')
+ outputs['NmsRoisNum'] = nms_rois_num
+
+ helper.append_op(
+ type="multiclass_nms3",
+ inputs=inputs,
+ attrs={
+ 'background_label': background_label,
+ 'score_threshold': score_threshold,
+ 'nms_top_k': nms_top_k,
+ 'nms_threshold': nms_threshold,
+ 'keep_top_k': keep_top_k,
+ 'nms_eta': nms_eta,
+ 'normalized': normalized
+ },
+ outputs=outputs)
+ output.stop_gradient = True
+ index.stop_gradient = True
+ if not return_index:
+ index = None
+ if not return_rois_num:
+ nms_rois_num = None
+
+ return output, nms_rois_num, index
+
+
+@paddle.jit.not_to_static
+def matrix_nms(bboxes,
+ scores,
+ score_threshold,
+ post_threshold,
+ nms_top_k,
+ keep_top_k,
+ use_gaussian=False,
+ gaussian_sigma=2.,
+ background_label=0,
+ normalized=True,
+ return_index=False,
+ return_rois_num=True,
+ name=None):
+ """
+ **Matrix NMS**
+ This operator does matrix non maximum suppression (NMS).
+ First selects a subset of candidate bounding boxes that have higher scores
+ than score_threshold (if provided), then the top k candidate is selected if
+ nms_top_k is larger than -1. Score of the remaining candidate are then
+ decayed according to the Matrix NMS scheme.
+ After the NMS step, at most keep_top_k number of total bboxes are to be kept
+ per image if keep_top_k is larger than -1.
+ Args:
+ bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the
+ predicted locations of M bounding bboxes,
+ N is the batch size. Each bounding box has four
+ coordinate values and the layout is
+ [xmin, ymin, xmax, ymax], when box size equals to 4.
+ The data type is float32 or float64.
+ scores (Tensor): A 3-D Tensor with shape [N, C, M]
+ represents the predicted confidence predictions.
+ N is the batch size, C is the class number, M is
+ number of bounding boxes. For each category there
+ are total M scores which corresponding M bounding
+ boxes. Please note, M is equal to the 2nd dimension
+ of BBoxes. The data type is float32 or float64.
+ score_threshold (float): Threshold to filter out bounding boxes with
+ low confidence score.
+ post_threshold (float): Threshold to filter out bounding boxes with
+ low confidence score AFTER decaying.
+ nms_top_k (int): Maximum number of detections to be kept according to
+ the confidences after the filtering detections based
+ on score_threshold.
+ keep_top_k (int): Number of total bboxes to be kept per image after NMS
+ step. -1 means keeping all bboxes after NMS step.
+ use_gaussian (bool): Use Gaussian as the decay function. Default: False
+ gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0
+ background_label (int): The index of background label, the background
+ label will be ignored. If set to -1, then all
+ categories will be considered. Default: 0
+ normalized (bool): Whether detections are normalized. Default: True
+ return_index(bool): Whether return selected index. Default: False
+ return_rois_num(bool): whether return rois_num. Default: True
+ name(str): Name of the matrix nms op. Default: None.
+ Returns:
+ A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True,
+ otherwise, a tuple with two Tensor (Out, RoisNum) is returned.
+ Out (Tensor): A 2-D Tensor with shape [No, 6] containing the
+ detection results.
+ Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]
+ (After version 1.3, when no boxes detected, the lod is changed
+ from {0} to {1})
+ Index (Tensor): A 2-D Tensor with shape [No, 1] containing the
+ selected indices, which are absolute values cross batches.
+ rois_num (Tensor): A 1-D Tensor with shape [N] containing
+ the number of detected boxes in each image.
+ Examples:
+ .. code-block:: python
+ import paddle
+ from ppdet.modeling import ops
+ boxes = paddle.static.data(name='bboxes', shape=[None,81, 4],
+ dtype='float32', lod_level=1)
+ scores = paddle.static.data(name='scores', shape=[None,81],
+ dtype='float32', lod_level=1)
+ out = ops.matrix_nms(bboxes=boxes, scores=scores, background_label=0,
+ score_threshold=0.5, post_threshold=0.1,
+ nms_top_k=400, keep_top_k=200, normalized=False)
+ """
+ check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'],
+ 'matrix_nms')
+ check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'],
+ 'matrix_nms')
+ check_type(score_threshold, 'score_threshold', float, 'matrix_nms')
+ check_type(post_threshold, 'post_threshold', float, 'matrix_nms')
+ check_type(nms_top_k, 'nms_top_k', int, 'matrix_nms')
+ check_type(keep_top_k, 'keep_top_k', int, 'matrix_nms')
+ check_type(normalized, 'normalized', bool, 'matrix_nms')
+ check_type(use_gaussian, 'use_gaussian', bool, 'matrix_nms')
+ check_type(gaussian_sigma, 'gaussian_sigma', float, 'matrix_nms')
+ check_type(background_label, 'background_label', int, 'matrix_nms')
+
+ if in_dygraph_mode():
+ attrs = ('background_label', background_label, 'score_threshold',
+ score_threshold, 'post_threshold', post_threshold, 'nms_top_k',
+ nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian',
+ use_gaussian, 'keep_top_k', keep_top_k, 'normalized',
+ normalized)
+ out, index, rois_num = core.ops.matrix_nms(bboxes, scores, *attrs)
+ if not return_index:
+ index = None
+ if not return_rois_num:
+ rois_num = None
+ return out, rois_num, index
+ else:
+ helper = LayerHelper('matrix_nms', **locals())
+ output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
+ index = helper.create_variable_for_type_inference(dtype='int')
+ outputs = {'Out': output, 'Index': index}
+ if return_rois_num:
+ rois_num = helper.create_variable_for_type_inference(dtype='int')
+ outputs['RoisNum'] = rois_num
+
+ helper.append_op(
+ type="matrix_nms",
+ inputs={'BBoxes': bboxes,
+ 'Scores': scores},
+ attrs={
+ 'background_label': background_label,
+ 'score_threshold': score_threshold,
+ 'post_threshold': post_threshold,
+ 'nms_top_k': nms_top_k,
+ 'gaussian_sigma': gaussian_sigma,
+ 'use_gaussian': use_gaussian,
+ 'keep_top_k': keep_top_k,
+ 'normalized': normalized
+ },
+ outputs=outputs)
+ output.stop_gradient = True
+
+ if not return_index:
+ index = None
+ if not return_rois_num:
+ rois_num = None
+ return output, rois_num, index
+
+
+def bipartite_match(dist_matrix,
+ match_type=None,
+ dist_threshold=None,
+ name=None):
+ """
+
+ This operator implements a greedy bipartite matching algorithm, which is
+ used to obtain the matching with the maximum distance based on the input
+ distance matrix. For an input 2-D matrix, the bipartite matching algorithm
+ can find the matched column for each row (matched means the largest
+ distance), and can also find the matched row for each column. This operator
+ only calculates matched indices from column to row. For each instance,
+ the number of matched indices is the column number of the input distance
+ matrix. **The OP only supports CPU**.
+
+ There are two outputs, matched indices and distance.
+ In short, this algorithm matches the best (maximum-distance) row entity
+ to each column entity, and the matched indices are not duplicated in each
+ row of ColToRowMatchIndices. If a column entity is not matched to any row
+ entity, -1 is set in ColToRowMatchIndices.
+
+ NOTE: the input DistMat can be LoDTensor (with LoD) or Tensor.
+ If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size.
+ If Tensor, the height of ColToRowMatchIndices is 1.
+
+ NOTE: This API is a very low level API. It is used by :code:`ssd_loss`
+ layer. Please consider to use :code:`ssd_loss` instead.
+
+ Args:
+ dist_matrix(Tensor): This input is a 2-D LoDTensor with shape
+ [K, M]. The data type is float32 or float64. It is pair-wise
+ distance matrix between the entities represented by each row and
+ each column. For example, assumed one entity is A with shape [K],
+ another entity is B with shape [M]. The dist_matrix[i][j] is the
+ distance between A[i] and B[j]. The bigger the distance is, the
+ better matching the pairs are. NOTE: This tensor can contain LoD
+ information to represent a batch of inputs. One instance of this
+ batch can contain different numbers of entities.
+ match_type(str, optional): The type of matching method, should be
+ 'bipartite' or 'per_prediction'. None ('bipartite') by default.
+ dist_threshold(float32, optional): If `match_type` is 'per_prediction',
+ this threshold is to determine the extra matching bboxes based
+ on the maximum distance, 0.5 by default.
+ name(str, optional): For detailed information, please refer
+ to :ref:`api_guide_Name`. Usually name is no need to set and
+ None by default.
+
+ Returns:
+ Tuple:
+
+ matched_indices(Tensor): A 2-D Tensor with shape [N, M]. The data
+ type is int32. N is the batch size. If match_indices[i][j] is -1, it
+ means B[j] does not match any entity in i-th instance.
+ Otherwise, it means B[j] is matched to row
+ match_indices[i][j] in i-th instance. The row number of
+ i-th instance is saved in match_indices[i][j].
+
+ matched_distance(Tensor): A 2-D Tensor with shape [N, M]. The data
+ type is float32. N is batch size. If match_indices[i][j] is -1,
+ match_distance[i][j] is also -1.0. Otherwise, assumed
+ match_distance[i][j] = d, and the row offsets of each instance
+ are called LoD. Then match_distance[i][j] =
+ dist_matrix[d+LoD[i]][j].
+
+ Examples:
+
+ .. code-block:: python
+ import paddle
+ from ppdet.modeling import ops
+ from ppdet.modeling.utils import iou_similarity
+
+ paddle.enable_static()
+
+ x = paddle.static.data(name='x', shape=[None, 4], dtype='float32')
+ y = paddle.static.data(name='y', shape=[None, 4], dtype='float32')
+ iou = iou_similarity(x=x, y=y)
+ matched_indices, matched_dist = ops.bipartite_match(iou)
+ """
+ check_variable_and_dtype(dist_matrix, 'dist_matrix',
+ ['float32', 'float64'], 'bipartite_match')
+
+ if in_dygraph_mode():
+ match_indices, match_distance = core.ops.bipartite_match(
+ dist_matrix, "match_type", match_type, "dist_threshold",
+ dist_threshold)
+ return match_indices, match_distance
+
+ helper = LayerHelper('bipartite_match', **locals())
+ match_indices = helper.create_variable_for_type_inference(dtype='int32')
+ match_distance = helper.create_variable_for_type_inference(
+ dtype=dist_matrix.dtype)
+ helper.append_op(
+ type='bipartite_match',
+ inputs={'DistMat': dist_matrix},
+ attrs={
+ 'match_type': match_type,
+ 'dist_threshold': dist_threshold,
+ },
+ outputs={
+ 'ColToRowMatchIndices': match_indices,
+ 'ColToRowMatchDist': match_distance
+ })
+ return match_indices, match_distance
+
+
+@paddle.jit.not_to_static
+def box_coder(prior_box,
+ prior_box_var,
+ target_box,
+ code_type="encode_center_size",
+ box_normalized=True,
+ axis=0,
+ name=None):
+ """
+ **Box Coder Layer**
+ Encode/Decode the target bounding box with the priorbox information.
+
+ The Encoding schema described below:
+ .. math::
+ ox = (tx - px) / pw / pxv
+ oy = (ty - py) / ph / pyv
+ ow = \log(|tw / pw|) / pwv
+ oh = \log(|th / ph|) / phv
+ The Decoding schema described below:
+
+ .. math::
+
+ ox = (pw * pxv * tx + px) - tw / 2
+ oy = (ph * pyv * ty + py) - th / 2
+ ow = \exp(pwv * tw) * pw + tw / 2
+ oh = \exp(phv * th) * ph + th / 2
+ where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates,
+ width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote
+ the priorbox's (anchor) center coordinates, width and height. `pxv`,
+ `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`,
+ `ow`, `oh` denote the encoded/decoded coordinates, width and height.
+ During Box Decoding, two modes for broadcast are supported. Say target
+ box has shape [N, M, 4], and the shape of prior box can be [N, 4] or
+ [M, 4]. Then prior box will broadcast to target box along the
+ assigned axis.
+
+ Args:
+ prior_box(Tensor): Box list prior_box is a 2-D Tensor with shape
+ [M, 4] holds M boxes and data type is float32 or float64. Each box
+ is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the
+ left top coordinate of the anchor box, if the input is image feature
+ map, they are close to the origin of the coordinate system.
+ [xmax, ymax] is the right bottom coordinate of the anchor box.
+ prior_box_var(List|Tensor|None): prior_box_var supports three types
+ of input. One is a Tensor with shape [M, 4] which holds M groups of
+ variances, with data type float32 or float64. The second is a list of
+ 4 elements shared by all boxes, with data type float32 or float64.
+ The third is None, in which case the variance is not involved in the calculation.
+ target_box(Tensor): This input can be a 2-D LoDTensor with shape
+ [N, 4] when code_type is 'encode_center_size'. This input also can
+ be a 3-D Tensor with shape [N, M, 4] when code_type is
+ 'decode_center_size'. Each box is represented as
+ [xmin, ymin, xmax, ymax]. The data type is float32 or float64.
+ code_type(str): The code type used with the target box. It can be
+ `encode_center_size` or `decode_center_size`. `encode_center_size`
+ by default.
+ box_normalized(bool): Whether treat the priorbox as a normalized box.
+ Set true by default.
+ axis(int): Which axis in PriorBox to broadcast for box decode,
+ for example, if axis is 0 and TargetBox has shape [N, M, 4] and
+ PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4]
+ for decoding. It is only valid when code type is
+ `decode_center_size`. Set 0 by default.
+ name(str, optional): For detailed information, please refer
+ to :ref:`api_guide_Name`. Usually name is no need to set and
+ None by default.
+
+ Returns:
+ Tensor:
+ output_box(Tensor): When code_type is 'encode_center_size', the
+ output tensor of box_coder_op with shape [N, M, 4] representing the
+ result of N target boxes encoded with M Prior boxes and variances.
+ When code_type is 'decode_center_size', N represents the batch size
+ and M represents the number of decoded boxes.
+
+ Examples:
+
+ .. code-block:: python
+
+ import paddle
+ from ppdet.modeling import ops
+ paddle.enable_static()
+ # For encode
+ prior_box_encode = paddle.static.data(name='prior_box_encode',
+ shape=[512, 4],
+ dtype='float32')
+ target_box_encode = paddle.static.data(name='target_box_encode',
+ shape=[81, 4],
+ dtype='float32')
+ output_encode = ops.box_coder(prior_box=prior_box_encode,
+ prior_box_var=[0.1,0.1,0.2,0.2],
+ target_box=target_box_encode,
+ code_type="encode_center_size")
+ # For decode
+ prior_box_decode = paddle.static.data(name='prior_box_decode',
+ shape=[512, 4],
+ dtype='float32')
+ target_box_decode = paddle.static.data(name='target_box_decode',
+ shape=[512, 81, 4],
+ dtype='float32')
+ output_decode = ops.box_coder(prior_box=prior_box_decode,
+ prior_box_var=[0.1,0.1,0.2,0.2],
+ target_box=target_box_decode,
+ code_type="decode_center_size",
+ box_normalized=False,
+ axis=1)
+ """
+ check_variable_and_dtype(prior_box, 'prior_box', ['float32', 'float64'],
+ 'box_coder')
+ check_variable_and_dtype(target_box, 'target_box', ['float32', 'float64'],
+ 'box_coder')
+
+ if in_dygraph_mode():
+ if isinstance(prior_box_var, Variable):
+ output_box = core.ops.box_coder(
+ prior_box, prior_box_var, target_box, "code_type", code_type,
+ "box_normalized", box_normalized, "axis", axis)
+
+ elif isinstance(prior_box_var, list):
+ output_box = core.ops.box_coder(
+ prior_box, None, target_box, "code_type", code_type,
+ "box_normalized", box_normalized, "axis", axis, "variance",
+ prior_box_var)
+ else:
+ raise TypeError(
+ "Input variance of box_coder must be Variable or list")
+ return output_box
+ else:
+ helper = LayerHelper("box_coder", **locals())
+
+ output_box = helper.create_variable_for_type_inference(
+ dtype=prior_box.dtype)
+
+ inputs = {"PriorBox": prior_box, "TargetBox": target_box}
+ attrs = {
+ "code_type": code_type,
+ "box_normalized": box_normalized,
+ "axis": axis
+ }
+ if isinstance(prior_box_var, Variable):
+ inputs['PriorBoxVar'] = prior_box_var
+ elif isinstance(prior_box_var, list):
+ attrs['variance'] = prior_box_var
+ else:
+ raise TypeError(
+ "Input variance of box_coder must be Variable or list")
+ helper.append_op(
+ type="box_coder",
+ inputs=inputs,
+ attrs=attrs,
+ outputs={"OutputBox": output_box})
+ return output_box
+
+
+@paddle.jit.not_to_static
+def generate_proposals(scores,
+ bbox_deltas,
+ im_shape,
+ anchors,
+ variances,
+ pre_nms_top_n=6000,
+ post_nms_top_n=1000,
+ nms_thresh=0.5,
+ min_size=0.1,
+ eta=1.0,
+ pixel_offset=False,
+ return_rois_num=False,
+ name=None):
+ """
+ **Generate proposal Faster-RCNN**
+ This operation proposes RoIs according to each box's probability of being
+ a foreground object; the boxes are computed from the anchors.
+ bbox_deltas and scores are the outputs of the RPN. The final proposals
+ can be used to train the detection net.
+ To generate proposals, this operation performs the following steps:
+ 1. Transposes and resizes scores and bbox_deltas in size of
+ (H*W*A, 1) and (H*W*A, 4)
+ 2. Calculate box locations as proposals candidates.
+ 3. Clip boxes to image
+ 4. Remove predicted boxes with small area.
+ 5. Apply NMS to get final proposals as output.
+ Args:
+ scores(Tensor): A 4-D Tensor with shape [N, A, H, W] represents
+ the probability for each box to be an object.
+ N is batch size, A is number of anchors, H and W are height and
+ width of the feature map. The data type must be float32.
+ bbox_deltas(Tensor): A 4-D Tensor with shape [N, 4*A, H, W]
+ represents the difference between predicted box location and
+ anchor location. The data type must be float32.
+ im_shape(Tensor): A 2-D Tensor with shape [N, 2] represents H, W, the
+ origin image size or input size. The data type can be float32 or
+ float64.
+ anchors(Tensor): A 4-D Tensor represents the anchors with a layout
+ of [H, W, A, 4]. H and W are height and width of the feature map,
+ A is the number of anchors at each position. Each anchor is
+ in (xmin, ymin, xmax, ymax) format and unnormalized. The data type must be float32.
+ variances(Tensor): A 4-D Tensor. The expanded variances of anchors with a layout of
+ [H, W, num_priors, 4]. Each variance is in
+ (xcenter, ycenter, w, h) format. The data type must be float32.
+ pre_nms_top_n(float): Number of total bboxes to be kept per
+ image before NMS. The data type must be float32. `6000` by default.
+ post_nms_top_n(float): Number of total bboxes to be kept per
+ image after NMS. The data type must be float32. `1000` by default.
+ nms_thresh(float): Threshold in NMS. The data type must be float32. `0.5` by default.
+ min_size(float): Remove predicted boxes with either height or
+ width < min_size. The data type must be float32. `0.1` by default.
+ eta(float): Apply in adaptive NMS, if adaptive `threshold > 0.5`,
+ `adaptive_threshold = adaptive_threshold * eta` in each iteration.
+ return_rois_num(bool): Whether to return a 1-D Tensor with shape [N] that records the
+ number of RoIs of each image in the batch, where N is the number of images. For example,
+ the value [4, 5] means the first image has 4 RoIs and the second image has 5 RoIs.
+ It is only used in RCNN models. `False` by default.
+ name(str, optional): For detailed information, please refer
+ to :ref:`api_guide_Name`. Usually name is no need to set and
+ None by default.
+
+ Returns:
+ tuple:
+ A tuple with format ``(rpn_rois, rpn_roi_probs, rpn_rois_num)``.
+ - **rpn_rois**: The generated RoIs. 2-D Tensor with shape ``[N, 4]`` where ``N`` is the number of RoIs. The data type is the same as ``scores``.
+ - **rpn_roi_probs**: The scores of the generated RoIs. 2-D Tensor with shape ``[N, 1]`` where ``N`` is the number of RoIs. The data type is the same as ``scores``.
+ - **rpn_rois_num**: The number of RoIs in each image. 1-D Tensor with shape ``[B]`` where ``B`` is the batch size; ``None`` when ``return_rois_num`` is False.
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ from ppdet.modeling import ops
+ paddle.enable_static()
+ scores = paddle.static.data(name='scores', shape=[None, 4, 5, 5], dtype='float32')
+ bbox_deltas = paddle.static.data(name='bbox_deltas', shape=[None, 16, 5, 5], dtype='float32')
+ im_shape = paddle.static.data(name='im_shape', shape=[None, 2], dtype='float32')
+ anchors = paddle.static.data(name='anchors', shape=[None, 5, 4, 4], dtype='float32')
+ variances = paddle.static.data(name='variances', shape=[None, 5, 10, 4], dtype='float32')
+ rois, roi_probs, _ = ops.generate_proposals(scores, bbox_deltas,
+ im_shape, anchors, variances)
+ """
+ if in_dygraph_mode():
+ assert return_rois_num, "return_rois_num should be True in dygraph mode."
+ attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n,
+ 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta,
+ 'pixel_offset', pixel_offset)
+ rpn_rois, rpn_roi_probs, rpn_rois_num = core.ops.generate_proposals_v2(
+ scores, bbox_deltas, im_shape, anchors, variances, *attrs)
+ return rpn_rois, rpn_roi_probs, rpn_rois_num
+
+ else:
+ helper = LayerHelper('generate_proposals_v2', **locals())
+
+ check_variable_and_dtype(scores, 'scores', ['float32'],
+ 'generate_proposals_v2')
+ check_variable_and_dtype(bbox_deltas, 'bbox_deltas', ['float32'],
+ 'generate_proposals_v2')
+ check_variable_and_dtype(im_shape, 'im_shape', ['float32', 'float64'],
+ 'generate_proposals_v2')
+ check_variable_and_dtype(anchors, 'anchors', ['float32'],
+ 'generate_proposals_v2')
+ check_variable_and_dtype(variances, 'variances', ['float32'],
+ 'generate_proposals_v2')
+
+ rpn_rois = helper.create_variable_for_type_inference(
+ dtype=bbox_deltas.dtype)
+ rpn_roi_probs = helper.create_variable_for_type_inference(
+ dtype=scores.dtype)
+ outputs = {
+ 'RpnRois': rpn_rois,
+ 'RpnRoiProbs': rpn_roi_probs,
+ }
+ if return_rois_num:
+ rpn_rois_num = helper.create_variable_for_type_inference(
+ dtype='int32')
+ rpn_rois_num.stop_gradient = True
+ outputs['RpnRoisNum'] = rpn_rois_num
+ else:
+ rpn_rois_num = None
+
+ helper.append_op(
+ type="generate_proposals_v2",
+ inputs={
+ 'Scores': scores,
+ 'BboxDeltas': bbox_deltas,
+ 'ImShape': im_shape,
+ 'Anchors': anchors,
+ 'Variances': variances
+ },
+ attrs={
+ 'pre_nms_topN': pre_nms_top_n,
+ 'post_nms_topN': post_nms_top_n,
+ 'nms_thresh': nms_thresh,
+ 'min_size': min_size,
+ 'eta': eta,
+ 'pixel_offset': pixel_offset
+ },
+ outputs=outputs)
+ rpn_rois.stop_gradient = True
+ rpn_roi_probs.stop_gradient = True
+
+ return rpn_rois, rpn_roi_probs, rpn_rois_num
+
+
+def sigmoid_cross_entropy_with_logits(input,
+ label,
+ ignore_index=-100,
+ normalize=False):
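+ """Sigmoid cross entropy with logits, with support for an ignore index.
+
+ Elements whose label equals ``ignore_index`` contribute zero loss. When
+ ``normalize`` is True, the loss is divided by the number of non-ignored
+ elements. Returns the element-wise (unreduced) loss tensor.
+ """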
+ output = F.binary_cross_entropy_with_logits(input, label, reduction='none')
+ mask_tensor = paddle.cast(label != ignore_index, 'float32')
+ output = paddle.multiply(output, mask_tensor)
+ if normalize:
+ sum_valid_mask = paddle.sum(mask_tensor)
+ output = output / sum_valid_mask
+ return output
+
+
+def smooth_l1(input, label, inside_weight=None, outside_weight=None,
+ sigma=None):
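+ """Smooth L1 loss weighted by inside/outside weights.
+
+ The smooth L1 delta is ``1 / sigma**2``; the element-wise loss is
+ multiplied by ``outside_weight``, rescaled back by the delta, and summed
+ over all but the batch dimension, giving one loss value per sample.
+ """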
+ input_new = paddle.multiply(input, inside_weight)
+ label_new = paddle.multiply(label, inside_weight)
+ delta = 1 / (sigma * sigma)
+ out = F.smooth_l1_loss(input_new, label_new, reduction='none', delta=delta)
+ out = paddle.multiply(out, outside_weight)
+ out = out / delta
+ out = paddle.reshape(out, shape=[out.shape[0], -1])
+ out = paddle.sum(out, axis=1)
+ return out
diff --git a/paddlevideo/modeling/heads/pptimesformer_head.py b/paddlevideo/modeling/heads/pptimesformer_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..113bde8b56835cb3fbd21b208c92c9b0127a1ccd
--- /dev/null
+++ b/paddlevideo/modeling/heads/pptimesformer_head.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.nn import Linear
+
+from ..registry import HEADS
+from ..weight_init import trunc_normal_, weight_init_
+from .base import BaseHead
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+
+
+@HEADS.register()
+class ppTimeSformerHead(BaseHead):
+ """TimeSformerHead Head.
+
+ Args:
+ num_classes (int): The number of classes to be classified.
+ in_channels (int): The number of channels in input feature.
+ loss_cfg (dict): Config for building loss. Default: dict(name='CrossEntropyLoss').
+ std(float): Std(Scale) value in normal initializer. Default: 0.02.
+ kwargs (dict, optional): Any keyword argument to initialize.
+
+ """
+ def __init__(self,
+ num_classes,
+ in_channels,
+ loss_cfg=dict(name='CrossEntropyLoss'),
+ std=0.02,
+ **kwargs):
+
+ super().__init__(num_classes, in_channels, loss_cfg, **kwargs)
+ self.std = std
+ self.fc = Linear(self.in_channels,
+ self.num_classes,
+ bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+
+ def init_weights(self):
+ """Initiate the FC layer parameters"""
+
+ weight_init_(self.fc,
+ 'TruncatedNormal',
+ 'fc_0.w_0',
+ 'fc_0.b_0',
+ mean=0.0,
+ std=self.std)
+ # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal
+ trunc_normal_(self.fc.weight, std=self.std)
+
+ def forward(self, x):
+ """Define how the head is going to run.
+ Args:
+ x (paddle.Tensor): The input data.
+ Returns:
+ score: (paddle.Tensor) The classification scores for input samples.
+ """
+ # XXX: check dropout location!
+ # x.shape = [N, embed_dim]
+
+ score = self.fc(x)
+ # [N, num_class]
+ # x = F.softmax(x) # NOTE remove
+ return score
diff --git a/paddlevideo/modeling/heads/pptsm_head.py b/paddlevideo/modeling/heads/pptsm_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..88ad2a8e50415badf4f2b4c0616aa6fc41c42785
--- /dev/null
+++ b/paddlevideo/modeling/heads/pptsm_head.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import ParamAttr
+from paddle.nn import Linear
+from paddle.regularizer import L2Decay
+from .tsn_head import TSNHead
+from ..registry import HEADS
+
+from ..weight_init import weight_init_
+
+
+@HEADS.register()
+class ppTSMHead(TSNHead):
+ """ ppTSM Head
+ Args:
+ num_classes (int): The number of classes to be classified.
+ in_channels (int): The number of channels in input feature.
+ loss_cfg (dict): Config for building loss. Default: dict(name='CrossEntropyLoss').
+ drop_ratio(float): drop ratio. Default: 0.8.
+ std(float): Std(Scale) value in normal initializer. Default: 0.01.
+ kwargs (dict, optional): Any keyword argument to initialize.
+ """
+ def __init__(self,
+ num_classes,
+ in_channels,
+ drop_ratio=0.8,
+ std=0.01,
+ data_format="NCHW",
+ **kwargs):
+
+ super().__init__(num_classes,
+ in_channels,
+ drop_ratio=drop_ratio,
+ std=std,
+ data_format=data_format,
+ **kwargs)
+
+ self.fc = Linear(self.in_channels,
+ self.num_classes,
+ weight_attr=ParamAttr(learning_rate=5.0,
+ regularizer=L2Decay(1e-4)),
+ bias_attr=ParamAttr(learning_rate=10.0,
+ regularizer=L2Decay(0.0)))
+ self.stdv = std
+
+ def init_weights(self):
+ """Initiate the FC layer parameters"""
+ weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv)
+
+ def forward(self, x, num_seg):
+ """Define how the head is going to run.
+ Args:
+ x (paddle.Tensor): The input data.
+ num_seg (int): Number of segments.
+ Returns:
+ score: (paddle.Tensor) The classification scores for input samples.
+ """
+
+ #XXX: check dropout location!
+ # [N * num_segs, in_channels, 7, 7]
+ x = self.avgpool2d(x)
+ # [N * num_segs, in_channels, 1, 1]
+ if self.dropout is not None:
+ x = self.dropout(x)
+ # [N * num_seg, in_channels, 1, 1]
+ x = paddle.reshape(x, [-1, num_seg, x.shape[1]])
+ # [N, num_seg, in_channels]
+ x = paddle.mean(x, axis=1)
+ # [N, in_channels]
+ x = paddle.reshape(x, shape=[-1, self.in_channels])
+ # [N, in_channels]
+ score = self.fc(x)
+ # [N, num_class]
+ #x = F.softmax(x) #NOTE remove
+ return score
diff --git a/paddlevideo/modeling/heads/pptsn_head.py b/paddlevideo/modeling/heads/pptsn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..44314ac7de0c5eb63e965994fd7a4f29f3eeab46
--- /dev/null
+++ b/paddlevideo/modeling/heads/pptsn_head.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import ParamAttr
+from paddle.nn import AdaptiveAvgPool2D, Linear, Dropout
+from paddle.regularizer import L2Decay
+from .base import BaseHead
+from ..registry import HEADS
+from ..weight_init import weight_init_
+
+
+@HEADS.register()
+class ppTSNHead(BaseHead):
+ """ppTSN Head.
+
+ Args:
+ num_classes (int): The number of classes to be classified.
+ in_channels (int): The number of channels in input feature.
+ loss_cfg (dict): Config for building loss. Default: dict(name='CrossEntropyLoss').
+ drop_ratio(float): drop ratio. Default: 0.4.
+ std(float): Std(Scale) value in normal initializer. Default: 0.01.
+ data_format(str): data format of input tensor in ['NCHW', 'NHWC']. Default: 'NCHW'.
+ fclr5(bool): Whether to increase the learning rate of the fully connected layer. Default: True
+ kwargs (dict, optional): Any keyword argument to initialize.
+
+ """
+ def __init__(self,
+ num_classes,
+ in_channels,
+ loss_cfg=dict(name='CrossEntropyLoss'),
+ drop_ratio=0.4,
+ std=0.01,
+ data_format="NCHW",
+ fclr5=True,
+ **kwargs):
+
+ super().__init__(num_classes, in_channels, loss_cfg, **kwargs)
+ self.drop_ratio = drop_ratio
+ self.std = std
+
+ # NOTE: global pool performance
+ self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format)
+
+ if self.drop_ratio != 0:
+ self.dropout = Dropout(p=self.drop_ratio)
+ else:
+ self.dropout = None
+ self.fc = Linear(
+ self.in_channels,
+ self.num_classes,
+ weight_attr=ParamAttr(learning_rate=5.0 if fclr5 else 1.0,
+ regularizer=L2Decay(1e-4)),
+ bias_attr=ParamAttr(learning_rate=10.0 if fclr5 else 1.0,
+ regularizer=L2Decay(0.0)))
+
+ def init_weights(self):
+ """Initiate the FC layer parameters"""
+ weight_init_(self.fc,
+ 'Normal',
+ 'fc_0.w_0',
+ 'fc_0.b_0',
+ mean=0.,
+ std=self.std)
+
+ def forward(self, x, num_seg):
+ """Define how the head is going to run.
+
+ Args:
+ x (paddle.Tensor): The input data.
+ num_seg (int): Number of segments.
+ Returns:
+ score: (paddle.Tensor) The classification scores for input samples.
+ """
+
+ # XXX: check dropout location!
+ # [N * num_segs, in_channels, 7, 7]
+ x = self.avgpool2d(x)
+ # [N * num_segs, in_channels, 1, 1]
+ x = paddle.reshape(x, [-1, num_seg, x.shape[1]])
+ # [N, num_seg, in_channels]
+ x = paddle.mean(x, axis=1)
+ # [N, in_channels]
+ if self.dropout is not None:
+ x = self.dropout(x)
+ # [N, in_channels]
+ x = paddle.reshape(x, shape=[-1, self.in_channels])
+ # [N, in_channels]
+ score = self.fc(x)
+ # [N, num_class]
+ # x = F.softmax(x) # NOTE remove
+ return score
diff --git a/paddlevideo/modeling/heads/roi_extractor.py b/paddlevideo/modeling/heads/roi_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a6b93beac0a7c6f635e673696c233c146dade80
--- /dev/null
+++ b/paddlevideo/modeling/heads/roi_extractor.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from . import ops
+
+
+#@register
+class RoIAlign(object):
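+ """Thin wrapper around ``ops.roi_align`` for extracting fixed-size RoI
+ features from a 2D feature map.
+
+ Args:
+ resolution (int): output spatial size of each RoI feature. Default: 14.
+ spatial_scale (float): scale mapping RoI coordinates from the input
+ image to the feature map, i.e. 1 / feature stride. Default: 0.0625.
+ sampling_ratio (int): sampling points per bin, 0 means adaptive. Default: 0.
+ aligned (bool): whether to use the half-pixel aligned variant. Default: False.
+
+ A minimal usage sketch with illustrative shapes (not taken from a real config):
+
+ .. code-block:: python
+
+ import paddle
+ from paddlevideo.modeling.heads.roi_extractor import RoIAlign
+
+ roi_align = RoIAlign(resolution=7, spatial_scale=1. / 16)
+ feat = paddle.rand([1, 256, 50, 68]) # [N, C, H, W]
+ rois = [paddle.to_tensor([[4., 4., 64., 64.]])] # one [num_rois, 4] tensor per image
+ rois_num = [1] # number of boxes per image
+ roi_feat = roi_align(feat, rois, rois_num) # [num_rois, C, 7, 7]
+ """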
+
+ def __init__(self,
+ resolution=14,
+ spatial_scale=0.0625,
+ sampling_ratio=0,
+ aligned=False):
+ super(RoIAlign, self).__init__()
+ self.resolution = resolution
+ self.spatial_scale = spatial_scale
+ self.sampling_ratio = sampling_ratio
+ self.aligned = aligned
+
+ def __call__(self, feats, roi, rois_num):
+ roi = paddle.concat(roi) if len(roi) > 1 else roi[0]
+ rois_num = paddle.to_tensor(rois_num, dtype='int32')
+ rois_num = paddle.cast(rois_num, dtype='int32')
+ if len(feats) == 1:
+ roi_feat = ops.roi_align(feats,
+ roi,
+ self.resolution,
+ self.spatial_scale,
+ sampling_ratio=self.sampling_ratio,
+ rois_num=rois_num,
+ aligned=self.aligned)
+ else:
+ roi_feat = ops.roi_align(feats,
+ roi,
+ self.resolution,
+ self.spatial_scale,
+ sampling_ratio=self.sampling_ratio,
+ rois_num=rois_num,
+ aligned=self.aligned)
+
+ return roi_feat
diff --git a/paddlevideo/modeling/heads/roi_head.py b/paddlevideo/modeling/heads/roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..be34a33efdb8c9fbbbeacc5f03d26e4cd72f0527
--- /dev/null
+++ b/paddlevideo/modeling/heads/roi_head.py
@@ -0,0 +1,177 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+from .. import builder
+from ..registry import HEADS
+
+
+def bbox2result(bboxes, labels, num_classes, img_shape, thr=0.01):
+ """Convert detection results to a list of numpy arrays. """
+ if len(bboxes) == 0:
+ return list(np.zeros((num_classes - 1, 0, 5), dtype=np.float32))
+ else:
+ bboxes = bboxes[0]
+ labels = labels
+ img_shape_np = img_shape
+ img_h, img_w = img_shape_np[0][0], img_shape_np[0][1]
+
+ img_w = paddle.cast(img_w, dtype='int32')
+ img_h = paddle.cast(img_h, dtype='int32')
+
+ bboxes[:, 0::2] /= img_w
+ bboxes[:, 1::2] /= img_h
+
+ # We only handle multilabel now
+ assert labels.shape[-1] > 1
+
+ scores = labels # rename
+ thr = (thr, ) * num_classes if isinstance(thr, float) else thr
+ assert scores.shape[1] == num_classes
+ assert len(thr) == num_classes
+
+ result = []
+ for i in range(num_classes - 1):
+ # step 1: for this class, check whether each bbox's score is above the threshold
+ where = scores[:, i + 1] > thr[i + 1]
+
+ where = paddle.nonzero(where) # index
+ bboxes_select = paddle.index_select(x=bboxes, index=where)
+ bboxes_select = bboxes_select[:, :4]
+
+ scores_select = paddle.index_select(x=scores, index=where)
+ scores_select = scores_select[:, i + 1:i + 2]
+
+ result.append(
+ # for the bboxes passing the threshold in step 1 (possibly empty), append the bbox and its score for this class to the result list
+ paddle.concat((bboxes_select, scores_select), axis=1).numpy())
+
+ return result
+
+
+@HEADS.register()
+class AVARoIHead(nn.Layer):
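+ """RoI head for AVA-style spatio-temporal action detection.
+
+ During training it assigns and samples proposals against the ground-truth
+ boxes, extracts RoI features with ``bbox_roi_extractor`` and computes the
+ action classification loss with ``bbox_head``; at test time it predicts
+ per-RoI action scores and converts them to per-class results.
+ """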
+
+ def __init__(self,
+ assigner,
+ sampler,
+ pos_weight=1.0,
+ action_thr=0.0,
+ bbox_roi_extractor=None,
+ bbox_head=None,
+ train_cfg=None,
+ test_cfg=None):
+ super().__init__()
+ self.assigner = assigner
+ self.sampler = sampler
+ self.pos_weight = pos_weight
+ self.action_thr = action_thr
+ self.init_assigner_sampler()
+ if bbox_head is not None:
+ self.init_bbox_head(bbox_roi_extractor, bbox_head)
+
+ def init_assigner_sampler(self):
+ """Initialize assigner and sampler."""
+ self.bbox_assigner = None
+ self.bbox_sampler = None
+ self.bbox_assigner = builder.build_assigner(self.assigner)
+ self.bbox_sampler = builder.build_sampler(self.sampler, context=self)
+
+ def init_bbox_head(self, bbox_roi_extractor, bbox_head):
+ """Initialize ``bbox_head``"""
+ self.bbox_roi_extractor = builder.build_roi_extractor(
+ bbox_roi_extractor)
+ self.bbox_head = builder.build_head(bbox_head)
+
+ def _bbox_forward(self, x, rois, rois_num):
+ bbox_feat = self.bbox_roi_extractor(x, rois, rois_num)
+ cls_score, bbox_pred = self.bbox_head(
+ bbox_feat, rois, rois_num
+ )  # handle the case where an RoI's width or height is 0, for which roi_align misbehaves
+ bbox_results = dict(cls_score=cls_score,
+ bbox_pred=bbox_pred,
+ bbox_feats=bbox_feat)
+ return bbox_results
+
+ def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels):
+ """Run forward function and calculate loss for box head in training."""
+ rois = [res.bboxes for res in sampling_results]
+ rois_num = [res.bboxes.shape[0] for res in sampling_results]
+ bbox_results = self._bbox_forward(x, rois, rois_num)
+ bbox_targets = self.bbox_head.get_targets(sampling_results, gt_bboxes,
+ gt_labels, self.pos_weight)
+ loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], bbox_targets)
+ bbox_results.update(loss_bbox=loss_bbox)
+ return bbox_results
+
+ def train_step(self, x, img_metas, proposal_list, gt_bboxes, gt_labels):
+ #1. assign gts and sample proposals
+ num_imgs = len(img_metas[0])
+ sampling_results = []
+ for i in range(num_imgs):
+ assign_result = self.bbox_assigner.assign(proposal_list[i],
+ gt_bboxes[i],
+ gt_labels[i])
+ sampling_result = self.bbox_sampler.sample(assign_result,
+ proposal_list[i],
+ gt_bboxes[i],
+ gt_labels[i])
+ sampling_results.append(sampling_result)
+
+ #2. forward and loss
+ bbox_results = self._bbox_forward_train(x, sampling_results, gt_bboxes,
+ gt_labels)
+ losses = dict()
+ losses.update(bbox_results['loss_bbox'])
+
+ return losses
+
+ def simple_test(self, x, proposal_list, img_shape, rescale=False):
+ x_shape = x[0].shape
+ #assert x_shape[0] == 1, 'only accept 1 sample at test mode'
+
+ det_bboxes, det_labels = self.simple_test_bboxes(x,
+ img_shape,
+ proposal_list,
+ self.action_thr,
+ rescale=rescale)
+
+ bbox_results = bbox2result(det_bboxes, det_labels,
+ self.bbox_head.num_classes, img_shape,
+ self.action_thr)
+ return [bbox_results]
+
+ def simple_test_bboxes(self,
+ x,
+ img_shape,
+ proposals,
+ action_thr,
+ rescale=False):
+ """Test only det bboxes without augmentation."""
+ rois = [proposals]
+ rois_num = [rois[0].shape[0]]
+ bbox_results = self._bbox_forward(x, rois, rois_num)
+ cls_score = bbox_results['cls_score']
+ crop_quadruple = np.array([0, 0, 1, 1])
+ flip = False
+ det_bboxes, det_labels = self.bbox_head.get_det_bboxes(
+ rois,
+ cls_score,
+ img_shape,
+ flip=flip,
+ crop_quadruple=crop_quadruple)
+
+ return det_bboxes, det_labels
diff --git a/paddlevideo/modeling/heads/single_straight3d.py b/paddlevideo/modeling/heads/single_straight3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf8569dd45e37b36d5a957509a53c680284aa042
--- /dev/null
+++ b/paddlevideo/modeling/heads/single_straight3d.py
@@ -0,0 +1,80 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+import paddle.nn as nn
+import numpy as np
+from ..registry import ROI_EXTRACTORS
+from .roi_extractor import RoIAlign
+
+
+@ROI_EXTRACTORS.register()
+class SingleRoIExtractor3D(nn.Layer):
+ """Extract RoI features from a single level feature map. """
+
+ def __init__(self,
+ roi_layer_type='RoIAlign',
+ featmap_stride=16,
+ output_size=16,
+ sampling_ratio=0,
+ pool_mode='avg',
+ aligned=True,
+ with_temporal_pool=True,
+ with_global=False):
+ super().__init__()
+ self.roi_layer_type = roi_layer_type
+ assert self.roi_layer_type in ['RoIPool', 'RoIAlign']
+ self.featmap_stride = featmap_stride
+ self.spatial_scale = 1. / self.featmap_stride
+ self.output_size = output_size
+ self.sampling_ratio = sampling_ratio
+ self.pool_mode = pool_mode
+ self.aligned = aligned
+ self.with_temporal_pool = with_temporal_pool
+ self.with_global = with_global
+
+ self.roi_layer = RoIAlign(resolution=self.output_size,
+ spatial_scale=self.spatial_scale,
+ sampling_ratio=self.sampling_ratio,
+ aligned=self.aligned)
+
+ def init_weights(self):
+ pass
+
+ # The shape of feat is N, C, T, H, W
+ def forward(self, feat, rois, rois_num):
+ if len(feat) >= 2:
+ assert self.with_temporal_pool
+ if self.with_temporal_pool:
+ # average each pathway over the temporal dimension before concatenation
+ feat = [paddle.mean(x, 2, keepdim=True) for x in feat]
+ feat = paddle.concat(feat, axis=1) # merge slow and fast
+ roi_feats = []
+ for t in range(feat.shape[2]):
+ if type(t) == paddle.fluid.framework.Variable:
+ index = paddle.to_tensor(t)
+ else:
+ data_index = np.array([t]).astype('int32')
+ index = paddle.to_tensor(data_index)
+
+ frame_feat = paddle.index_select(feat, index, axis=2)
+ frame_feat = paddle.squeeze(frame_feat,
+ axis=2)  # specify axis=2 so the first (batch) dim is not squeezed away when N=1
+ roi_feat = self.roi_layer(frame_feat, rois, rois_num)
+ roi_feats.append(roi_feat)
+
+ ret = paddle.stack(roi_feats, axis=2)
+ return ret
diff --git a/paddlevideo/modeling/heads/slowfast_head.py b/paddlevideo/modeling/heads/slowfast_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd18bafda10d00ef71d860e0f14bd32c1bbeb038
--- /dev/null
+++ b/paddlevideo/modeling/heads/slowfast_head.py
@@ -0,0 +1,137 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..registry import HEADS
+from .base import BaseHead
+
+import paddle
+import paddle.nn.functional as F
+
+from ..weight_init import weight_init_
+
+
+@HEADS.register()
+class SlowFastHead(BaseHead):
+ """
+ ResNe(X)t 3D head.
+ This layer performs a fully-connected projection during training, when the
+ input size is 1x1x1. It performs a convolutional projection during testing
+ when the input size is larger than 1x1x1. If the inputs are from multiple
+ different pathways, the inputs will be concatenated after pooling.
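+
+ A minimal forward sketch with an illustrative SlowFast-style configuration
+ (the values below are examples, not taken from a provided config):
+
+ .. code-block:: python
+
+ import paddle
+ from paddlevideo.modeling.heads.slowfast_head import SlowFastHead
+
+ head = SlowFastHead(width_per_group=64, alpha=8, beta=8,
+ num_classes=400, num_frames=32, crop_size=224,
+ dropout_rate=0.5)
+ slow = paddle.rand([2, 2048, 4, 7, 7]) # [N, C, T // alpha, H / 32, W / 32]
+ fast = paddle.rand([2, 256, 32, 7, 7]) # [N, C // beta, T, H / 32, W / 32]
+ scores = head([slow, fast]) # [2, 400]
+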
+ """
+ def __init__(self,
+ width_per_group,
+ alpha,
+ beta,
+ num_classes,
+ num_frames,
+ crop_size,
+ dropout_rate,
+ pool_size_ratio=[[1, 1, 1], [1, 1, 1]],
+ loss_cfg=dict(name='CrossEntropyLoss'),
+ multigrid_short=False,
+ **kwargs):
+ """
+ ResNetBasicHead takes p pathways as input where p in [1, infty].
+
+ Args:
+ dim_in (list): the list of channel dimensions of the p inputs to the
+ ResNetHead.
+ num_classes (int): the channel dimensions of the p outputs to the
+ ResNetHead.
+ pool_size (list): the list of kernel sizes of p spatial temporal
+ poolings, temporal pool kernel size, spatial pool kernel size,
+ spatial pool kernel size in order.
+ dropout_rate (float): dropout rate. If equal to 0.0, perform no
+ dropout.
+ """
+ super().__init__(num_classes, loss_cfg, **kwargs)
+ self.multigrid_short = multigrid_short
+ self.width_per_group = width_per_group
+ self.alpha = alpha
+ self.beta = beta
+ self.num_classes = num_classes
+ self.num_frames = num_frames
+ self.crop_size = crop_size
+ self.dropout_rate = dropout_rate
+ self.pool_size_ratio = pool_size_ratio
+
+ self.dim_in = [
+ self.width_per_group * 32,
+ self.width_per_group * 32 // self.beta,
+ ]
+ self.pool_size = [None, None] if self.multigrid_short else [
+ [
+ self.num_frames // self.alpha // self.pool_size_ratio[0][0],
+ self.crop_size // 32 // self.pool_size_ratio[0][1],
+ self.crop_size // 32 // self.pool_size_ratio[0][2],
+ ],
+ [
+ self.num_frames // self.pool_size_ratio[1][0],
+ self.crop_size // 32 // self.pool_size_ratio[1][1],
+ self.crop_size // 32 // self.pool_size_ratio[1][2],
+ ],
+ ]
+
+ assert (len({len(self.pool_size), len(self.dim_in)
+ }) == 1), "pathway dimensions are not consistent."
+ self.num_pathways = len(self.pool_size)
+
+ self.dropout = paddle.nn.Dropout(p=self.dropout_rate)
+
+ self.projection = paddle.nn.Linear(
+ in_features=sum(self.dim_in),
+ out_features=self.num_classes,
+ )
+
+ def init_weights(self):
+ weight_init_(self.projection,
+ "Normal",
+ bias_value=0.0,
+ mean=0.0,
+ std=0.01)
+
+ def forward(self, inputs):
+ assert (len(inputs) == self.num_pathways
+ ), "Input tensor does not contain {} pathway".format(
+ self.num_pathways)
+ pool_out = []
+ for pathway in range(self.num_pathways):
+ if self.pool_size[pathway] is None:
+ tmp_out = F.adaptive_avg_pool3d(x=inputs[pathway],
+ output_size=(1, 1, 1),
+ data_format="NCDHW")
+ else:
+ tmp_out = F.avg_pool3d(x=inputs[pathway],
+ kernel_size=self.pool_size[pathway],
+ stride=1,
+ data_format="NCDHW")
+ pool_out.append(tmp_out)
+
+ x = paddle.concat(x=pool_out, axis=1)
+ x = paddle.transpose(x=x, perm=(0, 2, 3, 4, 1))
+
+ # Perform dropout.
+ if self.dropout_rate > 0.0:
+ x = self.dropout(x)
+
+ x = self.projection(x)
+
+ # Perform fully convolutional inference.
+ if not self.training: # attr of base class
+ x = F.softmax(x, axis=4)
+ x = paddle.mean(x, axis=[1, 2, 3])
+
+ x = paddle.reshape(x, shape=(x.shape[0], -1))
+ return x
diff --git a/paddlevideo/modeling/heads/stgcn_head.py b/paddlevideo/modeling/heads/stgcn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc80d6633378e4a820509d9eacf73a54f5823d2b
--- /dev/null
+++ b/paddlevideo/modeling/heads/stgcn_head.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+from .base import BaseHead
+from ..registry import HEADS
+from ..weight_init import weight_init_
+
+
+@HEADS.register()
+class STGCNHead(BaseHead):
+ """
+ Head for ST-GCN model.
+ Args:
+ in_channels: int, input feature channels. Default: 256.
+ num_classes: int, number of classes. Default: 10.
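+
+ A minimal forward sketch with illustrative shapes:
+
+ .. code-block:: python
+
+ import paddle
+ from paddlevideo.modeling.heads.stgcn_head import STGCNHead
+
+ head = STGCNHead(in_channels=256, num_classes=10)
+ x = paddle.rand([4, 256, 1, 1]) # pooled ST-GCN feature, [N, C, 1, 1]
+ scores = head(x) # [4, 10]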
+ """
+ def __init__(self, in_channels=256, num_classes=10, **kwargs):
+ super().__init__(num_classes, in_channels, **kwargs)
+ self.fcn = nn.Conv2D(in_channels=in_channels,
+ out_channels=num_classes,
+ kernel_size=1)
+
+ def init_weights(self):
+ """Initiate the parameters.
+ """
+ for layer in self.sublayers():
+ if isinstance(layer, nn.Conv2D):
+ weight_init_(layer, 'Normal', std=0.02)
+
+ def forward(self, x):
+ """Define how the head is going to run.
+ """
+ x = self.fcn(x)
+ x = paddle.reshape_(x, (x.shape[0], -1)) # N,C,1,1 --> N,C
+
+ return x
diff --git a/paddlevideo/modeling/heads/timesformer_head.py b/paddlevideo/modeling/heads/timesformer_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..d02a3cca8fc4f3e1de06832f8185f7718e42a179
--- /dev/null
+++ b/paddlevideo/modeling/heads/timesformer_head.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.nn import Linear
+
+from ..registry import HEADS
+from ..weight_init import trunc_normal_, weight_init_
+from .base import BaseHead
+
+
+@HEADS.register()
+class TimeSformerHead(BaseHead):
+ """TimeSformerHead Head.
+
+ Args:
+ num_classes (int): The number of classes to be classified.
+ in_channels (int): The number of channels in input feature.
+ loss_cfg (dict): Config for building loss. Default: dict(name='CrossEntropyLoss').
+ std(float): Std(Scale) value in normal initializer. Default: 0.02.
+ kwargs (dict, optional): Any keyword argument to initialize.
+
+ """
+ def __init__(self,
+ num_classes,
+ in_channels,
+ loss_cfg=dict(name='CrossEntropyLoss'),
+ std=0.02,
+ **kwargs):
+
+ super().__init__(num_classes, in_channels, loss_cfg, **kwargs)
+ self.std = std
+ self.fc = Linear(self.in_channels, self.num_classes)
+
+ def init_weights(self):
+ """Initiate the FC layer parameters"""
+
+ weight_init_(self.fc,
+ 'TruncatedNormal',
+ 'fc_0.w_0',
+ 'fc_0.b_0',
+ mean=0.0,
+ std=self.std)
+ # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal
+ trunc_normal_(self.fc.weight, std=self.std)
+
+ def forward(self, x):
+ """Define how the head is going to run.
+ Args:
+ x (paddle.Tensor): The input data.
+ Returns:
+ score: (paddle.Tensor) The classification scores for input samples.
+ """
+ # XXX: check dropout location!
+ # x.shape = [N, embed_dim]
+
+ score = self.fc(x)
+ # [N, num_class]
+ # x = F.softmax(x) # NOTE remove
+ return score
diff --git a/paddlevideo/modeling/heads/transnetv2_head.py b/paddlevideo/modeling/heads/transnetv2_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ea67d4d325cc7fa541216fcb6266e3a7f292f1f
--- /dev/null
+++ b/paddlevideo/modeling/heads/transnetv2_head.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import BaseHead
+from ..registry import HEADS
+from ..losses import TransNetV2Loss
+from ...metrics.transnetv2_metric import create_scene_based_summaries
+
+@HEADS.register()
+class TransNetV2Head(BaseHead):
+ """TransNetV2 Head.
+ """
+ def __init__(self,
+ num_classes,
+ in_channels,
+ loss_cfg=dict(name="TransNetV2Loss")
+ ):
+ super().__init__(num_classes,
+ in_channels,
+ loss_cfg)
+
+ def loss(self, one_hot_pred, one_hot_gt,
+ many_hot_pred=None, many_hot_gt=None, reg_losses=None):
+ losses = dict()
+ # the configured loss (TransNetV2Loss by default) scores the one-hot
+ # transition predictions against the ground truth; the many-hot terms
+ # can be forwarded as well if the configured loss supports them
+ loss = self.loss_func(one_hot_pred, one_hot_gt)
+
+ f1 = self.get_score(one_hot_pred, one_hot_gt)
+ losses['f1'] = f1
+ losses['loss'] = loss
+ return losses
+
+ def get_score(self, one_hot_pred, one_hot_gt):
+ f1 = create_scene_based_summaries(one_hot_pred, one_hot_gt)
+ return f1
diff --git a/paddlevideo/modeling/heads/tsm_head.py b/paddlevideo/modeling/heads/tsm_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..955930168208d609f1dde9511c807048594d1fb6
--- /dev/null
+++ b/paddlevideo/modeling/heads/tsm_head.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import paddle
+from paddle import ParamAttr
+from paddle.nn import Linear
+import paddle.nn.functional as F
+from paddle.regularizer import L2Decay
+from .tsn_head import TSNHead
+from ..registry import HEADS
+
+from ..weight_init import weight_init_
+
+
+@HEADS.register()
+class TSMHead(TSNHead):
+ """ TSM Head
+
+ Args:
+ num_classes (int): The number of classes to be classified.
+ in_channels (int): The number of channels in input feature.
+ loss_cfg (dict): Config for building loss. Default: dict(name='CrossEntropyLoss').
+ drop_ratio(float): drop ratio. Default: 0.5.
+ std(float): Std(Scale) value in normal initializer. Default: 0.001.
+ kwargs (dict, optional): Any keyword argument to initialize.
+ """
+ def __init__(self,
+ num_classes,
+ in_channels,
+ drop_ratio=0.5,
+ std=0.001,
+ data_format="NCHW",
+ **kwargs):
+ super().__init__(num_classes,
+ in_channels,
+ drop_ratio=drop_ratio,
+ std=std,
+ data_format=data_format,
+ **kwargs)
+
+ self.fc = Linear(self.in_channels,
+ self.num_classes,
+ weight_attr=ParamAttr(learning_rate=5.0,
+ regularizer=L2Decay(1e-4)),
+ bias_attr=ParamAttr(learning_rate=10.0,
+ regularizer=L2Decay(0.0)))
+
+ assert (data_format in [
+ 'NCHW', 'NHWC'
+ ]), f"data_format must be 'NCHW' or 'NHWC', but got {data_format}"
+
+ self.data_format = data_format
+
+ self.stdv = std
+
+ def init_weights(self):
+ """Initiate the FC layer parameters"""
+ weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv)
+
+ def forward(self, x, num_seg):
+ """Define how the tsm-head is going to run.
+
+ Args:
+ x (paddle.Tensor): The input data.
+ num_seg (int): Number of segments.
+ Returns:
+ score: (paddle.Tensor) The classification scores for input samples.
+ """
+ # x.shape = [N * num_segs, in_channels, 7, 7]
+
+ x = self.avgpool2d(x) # [N * num_segs, in_channels, 1, 1]
+
+ if self.dropout is not None:
+ x = self.dropout(x) # [N * num_seg, in_channels, 1, 1]
+
+ if self.data_format == 'NCHW':
+ x = paddle.reshape(x, x.shape[:2])
+ else:
+ x = paddle.reshape(x, x.shape[::3])
+ score = self.fc(x) # [N * num_seg, num_class]
+ score = paddle.reshape(
+ score, [-1, num_seg, score.shape[1]]) # [N, num_seg, num_class]
+ score = paddle.mean(score, axis=1) # [N, num_class]
+ score = paddle.reshape(score,
+ shape=[-1, self.num_classes]) # [N, num_class]
+ # score = F.softmax(score) #NOTE remove
+ return score
diff --git a/paddlevideo/modeling/heads/tsn_head.py b/paddlevideo/modeling/heads/tsn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2f906bce9240b6fcd79b9db90e271366a64e270
--- /dev/null
+++ b/paddlevideo/modeling/heads/tsn_head.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle.nn import AdaptiveAvgPool2D, Linear, Dropout
+
+from .base import BaseHead
+from ..registry import HEADS
+from ..weight_init import weight_init_
+
+
+@HEADS.register()
+class TSNHead(BaseHead):
+ """TSN Head.
+
+ Args:
+ num_classes (int): The number of classes to be classified.
+ in_channels (int): The number of channels in input feature.
+ loss_cfg (dict): Config for building loss. Default: dict(name='CrossEntropyLoss').
+ drop_ratio(float): drop ratio. Default: 0.4.
+ std(float): Std(Scale) value in normal initializer. Default: 0.01.
+ kwargs (dict, optional): Any keyword argument to initialize.
+
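+ A minimal forward sketch with illustrative shapes:
+
+ .. code-block:: python
+
+ import paddle
+ from paddlevideo.modeling.heads.tsn_head import TSNHead
+
+ head = TSNHead(num_classes=101, in_channels=2048)
+ feat = paddle.rand([2 * 8, 2048, 7, 7]) # [N * num_seg, in_channels, 7, 7]
+ scores = head(feat, num_seg=8) # [2, 101]
+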
+ """
+ def __init__(self,
+ num_classes,
+ in_channels,
+ loss_cfg=dict(name='CrossEntropyLoss'),
+ drop_ratio=0.4,
+ std=0.01,
+ data_format="NCHW",
+ **kwargs):
+
+ super().__init__(num_classes, in_channels, loss_cfg, **kwargs)
+ self.drop_ratio = drop_ratio
+ self.std = std
+
+ #NOTE: global pool performance
+ self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format)
+
+ if self.drop_ratio != 0:
+ self.dropout = Dropout(p=self.drop_ratio)
+ else:
+ self.dropout = None
+
+ self.fc = Linear(self.in_channels, self.num_classes)
+
+ def init_weights(self):
+ """Initiate the FC layer parameters"""
+
+ weight_init_(self.fc,
+ 'Normal',
+ 'fc_0.w_0',
+ 'fc_0.b_0',
+ mean=0.,
+ std=self.std)
+
+ def forward(self, x, num_seg):
+ """Define how the head is going to run.
+ Args:
+ x (paddle.Tensor): The input data.
+ num_seg (int): Number of segments.
+ Returns:
+ score: (paddle.Tensor) The classification scores for input samples.
+ """
+
+ #XXX: check dropout location!
+ # [N * num_segs, in_channels, 7, 7]
+
+ x = self.avgpool2d(x)
+ # [N * num_segs, in_channels, 1, 1]
+ x = paddle.reshape(x, [-1, num_seg, x.shape[1]])
+ # [N, num_seg, in_channels]
+ x = paddle.mean(x, axis=1)
+ # [N, in_channels]
+ if self.dropout is not None:
+ x = self.dropout(x)
+ # [N, in_channels]
+ score = self.fc(x)
+ # [N, num_class]
+ #x = F.softmax(x) #NOTE remove
+ return score
diff --git a/paddlevideo/modeling/losses/__init__.py b/paddlevideo/modeling/losses/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1410b95b6df724a083b2ba09c02c137fe916c0d9
--- /dev/null
+++ b/paddlevideo/modeling/losses/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import BaseWeightedLoss
+from .bmn_loss import BMNLoss
+from .cross_entropy_loss import CrossEntropyLoss
+from .depth_loss import ADDSLoss
+from .transnetv2_loss import TransNetV2Loss
+from .actbert_loss import ActBertLoss
+from .asrf_loss import ASRFLoss
+
+__all__ = [
+ 'CrossEntropyLoss', 'BMNLoss', 'TransNetV2Loss', 'ActBertLoss', 'ADDSLoss',
+ 'BaseWeightedLoss', 'ASRFLoss'
+]
diff --git a/paddlevideo/modeling/losses/actbert_loss.py b/paddlevideo/modeling/losses/actbert_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..10ffea6e650097420cba320c4e53e75a1cbfa21f
--- /dev/null
+++ b/paddlevideo/modeling/losses/actbert_loss.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+ # Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ..registry import LOSSES
+from .base import BaseWeightedLoss
+
+
+@LOSSES.register()
+class ActBertLoss(BaseWeightedLoss):
+ """Loss for ActBert model
+ """
+ def __init__(self, vocab_size=30522, a_target_size=700):
+ super().__init__()
+ self.vocab_size = vocab_size
+ self.a_target_size = a_target_size
+ self.loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
+ self.vis_criterion = nn.KLDivLoss(reduction="none")
+
+ def forward(self, prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score, \
+ text_labels, image_label, image_target, action_label, next_sentence_label):
+ """
+ Args:
+ text_labels: text labels (with mask). Shape: [batch_size, sequence_length]
+ image_label: image labels (with mask). Shape: [batch_size, region_length]
+ image_target: label of the image feature distribution.
+ Shape: [batch_size, region_length-1, num_image_class] (minus 1 because the first region is dropped below).
+ action_label: action labels (with mask). Shape: [batch_size, action_length]
+ next_sentence_label: whether the text is the next sentence or not. Shape: [batch_size]
+ """
+ prediction_scores_v = prediction_scores_v[:,
+ 1:] #8,37,1601 --> 8,36,1601
+
+ img_loss = self.vis_criterion(
+ F.log_softmax(prediction_scores_v, axis=2),
+ image_target #8,36,1601
+ )
+ masked_img_loss = paddle.sum(
+ img_loss * (image_label == 1).unsqueeze(2).astype('float32')) / max(
+ paddle.sum((image_label == 1).astype('float32')), 1e-6)
+
+ masked_text_loss = self.loss_fct(
+ prediction_scores_t.reshape([-1, self.vocab_size]), #8,36,30522
+ text_labels.reshape([-1]), #8,36 # label -1 will be ignored
+ )
+
+ masked_action_loss = self.loss_fct(
+ prediction_scores_a.reshape([-1, self.a_target_size]), #8,5,700
+ action_label.reshape([-1]), #8,5
+ )
+
+ next_sentence_loss = self.loss_fct(
+ seq_relationship_score.reshape([-1, 2]),
+ next_sentence_label.reshape([-1]) #8,2
+ )
+
+ total_loss = masked_text_loss.unsqueeze(0) + masked_img_loss.unsqueeze(
+ 0) + masked_action_loss.unsqueeze(0) + next_sentence_loss.unsqueeze(
+ 0)
+ return total_loss
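
A shape-only smoke test for the loss above, with random tensors sized as in the inline comments (batch 2 instead of 8); all sizes are illustrative, not mandated by the model:

    import paddle
    import paddle.nn.functional as F

    B, T, R, A = 2, 36, 37, 5                     # batch, text length, regions, action length
    loss_fn = ActBertLoss(vocab_size=30522, a_target_size=700)

    pred_t = paddle.randn([B, T, 30522])
    pred_v = paddle.randn([B, R, 1601])           # the first region is dropped inside the loss
    pred_a = paddle.randn([B, A, 700])
    seq_rel = paddle.randn([B, 2])

    text_labels = paddle.randint(-1, 30522, [B, T])           # -1 entries are ignored
    image_label = paddle.randint(0, 2, [B, R - 1])
    image_target = F.softmax(paddle.randn([B, R - 1, 1601]), axis=-1)
    action_label = paddle.randint(0, 700, [B, A])
    next_sentence_label = paddle.randint(0, 2, [B])

    total = loss_fn(pred_t, pred_v, pred_a, seq_rel, text_labels, image_label,
                    image_target, action_label, next_sentence_label)
    print(total)                                  # shape [1], sum of the four terms
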
diff --git a/paddlevideo/modeling/losses/asrf_loss.py b/paddlevideo/modeling/losses/asrf_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce5d6b1adc25fb38ea5f973aa1f33cd11e32752a
--- /dev/null
+++ b/paddlevideo/modeling/losses/asrf_loss.py
@@ -0,0 +1,401 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+ # Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# https://github.com/yiskw713/asrf/libs/loss_fn/__init__.py
+
+import numpy as np
+import pandas as pd
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import sys
+import os
+
+from ..registry import LOSSES
+
+
+class TMSE(nn.Layer):
+ """
+ Temporal MSE Loss Function
+ Proposed in Y. A. Farha et al. MS-TCN: Multi-Stage Temporal Convolutional Network for Action Segmentation, in CVPR 2019
+ arXiv: https://arxiv.org/pdf/1903.01945.pdf
+ """
+
+ def __init__(self, threshold=4, ignore_index=255):
+ super().__init__()
+ self.threshold = threshold
+ self.ignore_index = ignore_index
+ self.mse = nn.MSELoss(reduction="none")
+
+ def forward(self, preds, gts):
+
+ total_loss = 0.0
+ batch_size = preds.shape[0]
+ for pred, gt in zip(preds, gts):
+ pred = paddle.gather(pred,
+ paddle.nonzero(gt != self.ignore_index)[:, 0],
+ axis=1)
+
+ loss = self.mse(F.log_softmax(pred[:, 1:], axis=1),
+ F.log_softmax(pred[:, :-1], axis=1))
+
+ loss = paddle.clip(loss, min=0, max=self.threshold**2)
+ total_loss += paddle.mean(loss)
+
+ return total_loss / batch_size
+
+
+class GaussianSimilarityTMSE(nn.Layer):
+ """
+ Temporal MSE Loss Function with Gaussian Similarity Weighting
+ """
+
+ def __init__(self, threshold=4, sigma=1.0, ignore_index=255):
+ super().__init__()
+ self.threshold = threshold
+ self.ignore_index = ignore_index
+ self.mse = nn.MSELoss(reduction="none")
+ self.sigma = sigma
+
+ def forward(self, preds, gts, sim_index):
+ """
+ Args:
+ preds: the output of model before softmax. (N, C, T)
+ gts: Ground Truth. (N, T)
+ sim_index: similarity index. (N, C, T)
+ Return:
+ the value of Temporal MSE weighted by Gaussian Similarity.
+ """
+ total_loss = 0.0
+ batch_size = preds.shape[0]
+ for pred, gt, sim in zip(preds, gts, sim_index):
+ pred = paddle.gather(pred,
+ paddle.nonzero(gt != self.ignore_index)[:, 0],
+ axis=1)
+ sim = paddle.gather(sim,
+ paddle.nonzero(gt != self.ignore_index)[:, 0],
+ axis=1)
+
+ # calculate gaussian similarity
+ diff = sim[:, 1:] - sim[:, :-1]
+ similarity = paddle.exp(
+ (-1 * paddle.norm(diff, axis=0)) / (2 * self.sigma**2))
+
+ # calculate temporal mse
+ loss = self.mse(F.log_softmax(pred[:, 1:], axis=1),
+ F.log_softmax(pred[:, :-1], axis=1))
+ loss = paddle.clip(loss, min=0, max=self.threshold**2)
+
+ # gaussian similarity weighting
+ loss = similarity * loss
+
+ total_loss += paddle.mean(loss)
+
+ return total_loss / batch_size
+
+
+class FocalLoss(nn.Layer):
+
+ def __init__(self,
+ weight=None,
+ size_average=True,
+ batch_average=True,
+ ignore_index=255,
+ gamma=2.0,
+ alpha=0.25):
+ super().__init__()
+
+ self.gamma = gamma
+ self.alpha = alpha
+ self.batch_average = batch_average
+ self.criterion = nn.CrossEntropyLoss(
+ weight=weight,
+ ignore_index=ignore_index,
+ reduction='mean' if size_average else 'sum')
+
+ def forward(self, logit, target):
+ n, _, _ = logit.shape
+
+ logpt = -self.criterion(logit, target.astype('int64'))
+ pt = paddle.exp(logpt)
+
+ if self.alpha is not None:
+ logpt *= self.alpha
+
+ loss = -((1 - pt)**self.gamma) * logpt
+
+ if self.batch_average:
+ loss /= n
+
+ return loss
+
+
+class ActionSegmentationLoss(nn.Layer):
+ """
+ Loss Function for Action Segmentation
+ You can choose the below loss functions and combine them.
+ - Cross Entropy Loss (CE)
+ - Focal Loss
+ - Temporal MSE (TMSE)
+ - Gaussian Similarity TMSE (GSTMSE)
+ """
+
+ def __init__(self,
+ num_classes,
+ file_path,
+ label_path,
+ ce=True,
+ focal=True,
+ tmse=False,
+ gstmse=False,
+ weight=None,
+ threshold=4.,
+ ignore_index=255,
+ ce_weight=1.0,
+ focal_weight=1.0,
+ tmse_weight=0.15,
+ gstmse_weight=0.15):
+ super().__init__()
+ self.criterions = []
+ self.weights = []
+
+ self.num_classes = num_classes
+ self.file_path = file_path
+ self.label_path = label_path
+ if weight:
+ class_weight = self.get_class_weight()
+ else:
+ class_weight = None
+
+ if ce:
+ self.criterions.append(
+ nn.CrossEntropyLoss(weight=class_weight,
+ ignore_index=ignore_index))
+ self.weights.append(ce_weight)
+
+ if focal:
+ self.criterions.append(FocalLoss(ignore_index=ignore_index))
+ self.weights.append(focal_weight)
+
+ if tmse:
+ self.criterions.append(
+ TMSE(threshold=threshold, ignore_index=ignore_index))
+ self.weights.append(tmse_weight)
+
+ if gstmse:
+ self.criterions.append(
+ GaussianSimilarityTMSE(threshold=threshold,
+ ignore_index=ignore_index))
+ self.weights.append(gstmse_weight)
+
+ if len(self.criterions) == 0:
+ print("You have to choose at least one loss function.")
+ sys.exit(1)
+
+ def get_class_weight(self):
+ """
+ Class weight for CrossEntropy
+ Class weight is calculated in the way described in:
+ D. Eigen and R. Fergus, “Predicting depth, surface normals and semantic labels with a common multi-scale convolutional architecture,” in ICCV,
+ openaccess: https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Eigen_Predicting_Depth_Surface_ICCV_2015_paper.pdf
+ """
+ # load file list
+ file_ptr = open(self.file_path, 'r')
+ info = file_ptr.read().split('\n')[:-1]
+ file_ptr.close()
+
+ nums = [0 for i in range(self.num_classes)]
+ for i in range(len(info)):
+ video_name = info[i]
+ file_name = video_name.split('.')[0] + ".npy"
+ label_file_path = os.path.join(self.label_path, file_name)
+ label = np.load(label_file_path).astype(np.int64)
+ num, cnt = np.unique(label, return_counts=True)
+ for n, c in zip(num, cnt):
+ nums[n] += c
+
+ class_num = paddle.to_tensor(nums, dtype="float32")
+ total = class_num.sum().item()
+ frequency = class_num / total
+ median = paddle.median(frequency)
+ class_weight = median / frequency
+ return class_weight
+
+ def forward(self, preds, gts, sim_index):
+ """
+ Args:
+ preds: paddle.float (N, C, T).
+ gts: paddle.int64 (N, T).
+ sim_index: paddle.float (N, C', T).
+ """
+ loss = 0.0
+ for criterion, weight in zip(self.criterions, self.weights):
+ if isinstance(criterion, GaussianSimilarityTMSE):
+ loss += weight * criterion(preds, gts, sim_index)
+ elif isinstance(criterion, nn.CrossEntropyLoss):
+ preds_t = paddle.transpose(preds, perm=[0, 2, 1])
+ loss += weight * criterion(preds_t, gts)
+ else:
+ loss += weight * criterion(preds, gts)
+
+ return loss
+
+
+class BoundaryRegressionLoss(nn.Layer):
+ """
+ Boundary Regression Loss
+ bce: Binary Cross Entropy Loss for Boundary Prediction
+ mse: Mean Squared Error
+ """
+
+ def __init__(self,
+ file_path,
+ label_path,
+ bce=True,
+ focal=False,
+ mse=False,
+ weight=None,
+ pos_weight=None):
+ super().__init__()
+
+ self.criterions = []
+ self.file_path = file_path
+ self.label_path = label_path
+
+ pos_weight = self.get_pos_weight()
+
+ if bce:
+ self.criterions.append(
+ nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight))
+
+ if focal:
+ self.criterions.append(FocalLoss())
+
+ if mse:
+ self.criterions.append(nn.MSELoss())
+
+ if len(self.criterions) == 0:
+ print("You have to choose at least one loss function.")
+ sys.exit(1)
+
+ def get_pos_weight(self, norm=None):
+ """
+ pos_weight for binary cross entropy with logits loss
+ pos_weight is defined as reciprocal of ratio of positive samples in the dataset
+ """
+ # load file list
+ file_ptr = open(self.file_path, 'r')
+ info = file_ptr.read().split('\n')[:-1]
+ file_ptr.close()
+
+ n_classes = 2 # boundary or not
+ nums = [0 for i in range(n_classes)]
+ for i in range(len(info)):
+ video_name = info[i]
+ file_name = video_name.split('.')[0] + ".npy"
+ label_file_path = os.path.join(self.label_path, file_name)
+ label = np.load(label_file_path).astype(np.int64)
+ num, cnt = np.unique(label, return_counts=True)
+ for n, c in zip(num, cnt):
+ nums[n] += c
+
+ pos_ratio = nums[1] / sum(nums)
+ pos_weight = 1 / pos_ratio
+
+ if norm is not None:
+ pos_weight /= norm
+
+ return paddle.to_tensor(pos_weight, dtype="float32")
+
+ def forward(self, preds, gts):
+ """
+ Args:
+ preds: paddle.float (N, 1, T).
+ gts: paddle.float (N, 1, T).
+ """
+ loss = 0.0
+ batch_size = float(preds.shape[0])
+
+ for criterion in self.criterions:
+ for pred, gt in zip(preds, gts):
+ loss += criterion(pred, gt)
+
+ return loss / batch_size
+
+
+@LOSSES.register()
+class ASRFLoss(nn.Layer):
+
+ def __init__(self,
+ lambda_bound_loss,
+ num_classes,
+ file_path,
+ label_path,
+ boundary_path,
+ ce=True,
+ asl_focal=True,
+ tmse=False,
+ gstmse=False,
+ asl_weight=None,
+ threshold=4.,
+ ignore_index=255,
+ ce_weight=1.0,
+ focal_weight=1.0,
+ tmse_weight=0.15,
+ gstmse_weight=0.15,
+ bce=True,
+ brl_focal=False,
+ mse=False,
+ brl_weight=None):
+ super().__init__()
+ self.criterion_cls = ActionSegmentationLoss(ce=ce,
+ focal=asl_focal,
+ tmse=tmse,
+ gstmse=gstmse,
+ weight=asl_weight,
+ threshold=threshold,
+ ignore_index=ignore_index,
+ ce_weight=ce_weight,
+ focal_weight=focal_weight,
+ tmse_weight=tmse_weight,
+ gstmse_weight=gstmse_weight,
+ file_path=file_path,
+ label_path=label_path,
+ num_classes=num_classes)
+ self.criterion_boundary = BoundaryRegressionLoss(
+ bce=bce,
+ focal=brl_focal,
+ mse=mse,
+ weight=brl_weight,
+ file_path=file_path,
+ label_path=boundary_path)
+ self.lambda_bound_loss = lambda_bound_loss
+
+ def forward(self, x, output_cls, label, outputs_boundary, boundary):
+ loss = 0.0
+ if isinstance(output_cls, list):
+ n = len(output_cls)
+ for out in output_cls:
+ loss += self.criterion_cls(out, label, x) / n
+ else:
+ loss += self.criterion_cls(output_cls, label, x)
+
+ if isinstance(outputs_boundary, list):
+ n = len(outputs_boundary)
+ for out in outputs_boundary:
+ loss += self.lambda_bound_loss * self.criterion_boundary(
+ out, boundary) / n
+ else:
+ loss += self.lambda_bound_loss * self.criterion_boundary(
+ outputs_boundary, boundary)
+
+ return loss
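
A self-contained sketch of the Gaussian-similarity-weighted temporal MSE term used above, with made-up tensor sizes (the full ASRFLoss additionally needs the file and label lists on disk for its class and boundary weights):

    import paddle

    N, C, T = 2, 11, 64                        # batch, classes, frames
    preds = paddle.randn([N, C, T])            # frame-wise class logits
    gts = paddle.randint(0, C, [N, T])         # frame-wise labels (255 would mark ignored frames)
    sim_index = paddle.randn([N, 15, T])       # per-frame features used for the similarity weight

    criterion = GaussianSimilarityTMSE(threshold=4, sigma=1.0)
    print(criterion(preds, gts, sim_index))    # scalar smoothing penalty
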
diff --git a/paddlevideo/modeling/losses/base.py b/paddlevideo/modeling/losses/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..7284252e6c9d6de4b4cdb29b8eb50058b859751b
--- /dev/null
+++ b/paddlevideo/modeling/losses/base.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+ # Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import abstractmethod
+import paddle
+import paddle.nn as nn
+
+#XXX use _forward?? or forward??
+class BaseWeightedLoss(nn.Layer):
+ """Base class for loss.
+
+ All subclass should overwrite the ``_forward()`` method which returns the
+ normal loss without loss weights.
+
+ Args:
+ loss_weight (float): Factor scalar multiplied on the loss.
+ Default: 1.0.
+ """
+
+ def __init__(self, loss_weight=1.0):
+ super().__init__()
+ self.loss_weight = loss_weight
+
+ @abstractmethod
+ def _forward(self, *args, **kwargs):
+ pass
+
+ def forward(self, *args, **kwargs):
+ """Defines the computation performed at every call.
+ Args:
+ *args: The positional arguments for the corresponding
+ loss.
+ **kwargs: The keyword arguments for the corresponding
+ loss.
+ Returns:
+ paddle.Tensor: The calculated loss.
+ """
+ return self._forward(*args, **kwargs) * self.loss_weight
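
A toy, hypothetical subclass illustrating the contract: only _forward is implemented, and the inherited forward applies loss_weight:

    import paddle
    import paddle.nn.functional as F

    class L1Loss(BaseWeightedLoss):
        def _forward(self, pred, target):
            return F.l1_loss(pred, target)

    loss_fn = L1Loss(loss_weight=0.5)
    pred, target = paddle.randn([4, 10]), paddle.randn([4, 10])
    print(loss_fn(pred, target))      # 0.5 * mean absolute error
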
diff --git a/paddlevideo/modeling/losses/bmn_loss.py b/paddlevideo/modeling/losses/bmn_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4348501349967c94e0f7f2923bcdf0f73b75e02
--- /dev/null
+++ b/paddlevideo/modeling/losses/bmn_loss.py
@@ -0,0 +1,155 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+ # Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+
+from ..registry import LOSSES
+from .base import BaseWeightedLoss
+
+
+@LOSSES.register()
+class BMNLoss(BaseWeightedLoss):
+ """Loss for BMN model
+ Args:
+ tscale (int): sequence length, default 100.
+ dscale (int): max duration length, default 100.
+ """
+ def __init__(self, dscale, tscale):
+ super().__init__()
+ self.dscale = dscale
+ self.tscale = tscale
+
+ def _get_mask(self, dscale, tscale):
+ bm_mask = []
+ for idx in range(dscale):
+ mask_vector = [1 for i in range(tscale - idx)
+ ] + [0 for i in range(idx)]
+ bm_mask.append(mask_vector)
+ bm_mask = np.array(bm_mask, dtype='float32')
+ bm_mask = paddle.to_tensor(bm_mask)
+ bm_mask.stop_gradient = True
+ return bm_mask
+
+ def tem_loss_func(self, pred_start, pred_end, gt_start, gt_end):
+ def bi_loss(pred_score, gt_label, datatype):
+ pred_score = paddle.reshape(x=pred_score, shape=[-1])
+ gt_label = paddle.reshape(x=gt_label, shape=[-1])
+ gt_label.stop_gradient = True
+ pmask = paddle.cast(x=(gt_label > 0.5), dtype=datatype)
+ num_entries = paddle.cast(paddle.shape(pmask), dtype=datatype)
+ num_positive = paddle.cast(paddle.sum(pmask), dtype=datatype)
+ ratio = num_entries / num_positive
+ coef_0 = 0.5 * ratio / (ratio - 1)
+ coef_1 = 0.5 * ratio
+ epsilon = 0.000001
+ loss_pos = paddle.multiply(paddle.log(pred_score + epsilon), pmask)
+ loss_pos = coef_1 * paddle.mean(loss_pos)
+ loss_neg = paddle.multiply(paddle.log(1.0 - pred_score + epsilon),
+ (1.0 - pmask))
+ loss_neg = coef_0 * paddle.mean(loss_neg)
+ loss = -1 * (loss_pos + loss_neg)
+ return loss
+
+ loss_start = bi_loss(pred_start, gt_start, pred_start.dtype)
+ loss_end = bi_loss(pred_end, gt_end, pred_start.dtype)
+ loss = loss_start + loss_end
+ return loss
+
+ def pem_reg_loss_func(self, pred_score, gt_iou_map, mask):
+ gt_iou_map = paddle.multiply(gt_iou_map, mask)
+
+ u_hmask = paddle.cast(x=gt_iou_map > 0.7, dtype=pred_score.dtype)
+ u_mmask = paddle.logical_and(gt_iou_map <= 0.7, gt_iou_map > 0.3)
+ u_mmask = paddle.cast(x=u_mmask, dtype=pred_score.dtype)
+ u_lmask = paddle.logical_and(gt_iou_map <= 0.3, gt_iou_map >= 0.)
+ u_lmask = paddle.cast(x=u_lmask, dtype=pred_score.dtype)
+ u_lmask = paddle.multiply(u_lmask, mask)
+
+ num_h = paddle.cast(paddle.sum(u_hmask), dtype=pred_score.dtype)
+ num_m = paddle.cast(paddle.sum(u_mmask), dtype=pred_score.dtype)
+ num_l = paddle.cast(paddle.sum(u_lmask), dtype=pred_score.dtype)
+
+ r_m = num_h / num_m
+ u_smmask = paddle.uniform(shape=[
+ gt_iou_map.shape[1], gt_iou_map.shape[2]
+ ],
+ min=0.0,
+ max=1.0).astype(pred_score.dtype)
+ u_smmask = paddle.multiply(u_mmask, u_smmask)
+ u_smmask = paddle.cast(x=(u_smmask > (1. - r_m)),
+ dtype=pred_score.dtype)
+
+ r_l = num_h / num_l
+ u_slmask = paddle.uniform(shape=[
+ gt_iou_map.shape[1], gt_iou_map.shape[2]
+ ],
+ min=0.0,
+ max=1.0).astype(pred_score.dtype)
+ u_slmask = paddle.multiply(u_lmask, u_slmask)
+ u_slmask = paddle.cast(x=(u_slmask > (1. - r_l)),
+ dtype=pred_score.dtype)
+
+ weights = u_hmask + u_smmask + u_slmask
+ weights.stop_gradient = True
+ loss = F.square_error_cost(pred_score, gt_iou_map)
+ loss = paddle.multiply(loss, weights)
+ loss = 0.5 * paddle.sum(loss) / paddle.sum(weights)
+
+ return loss
+
+ def pem_cls_loss_func(self, pred_score, gt_iou_map, mask):
+ gt_iou_map = paddle.multiply(gt_iou_map, mask)
+ gt_iou_map.stop_gradient = True
+ pmask = paddle.cast(x=(gt_iou_map > 0.9), dtype=pred_score.dtype)
+ nmask = paddle.cast(x=(gt_iou_map <= 0.9), dtype=pred_score.dtype)
+ nmask = paddle.multiply(nmask, mask)
+
+ num_positive = paddle.sum(pmask)
+ num_entries = num_positive + paddle.sum(nmask)
+ ratio = num_entries / num_positive
+ coef_0 = 0.5 * ratio / (ratio - 1)
+ coef_1 = 0.5 * ratio
+ epsilon = 0.000001
+ loss_pos = paddle.multiply(paddle.log(pred_score + epsilon), pmask)
+ loss_pos = coef_1 * paddle.sum(loss_pos)
+ loss_neg = paddle.multiply(paddle.log(1.0 - pred_score + epsilon),
+ nmask)
+ loss_neg = coef_0 * paddle.sum(loss_neg)
+ loss = -1 * (loss_pos + loss_neg) / num_entries
+ return loss
+
+ def forward(self, pred_bm, pred_start, pred_end, gt_iou_map, gt_start,
+ gt_end):
+ pred_bm_reg = paddle.squeeze(paddle.slice(pred_bm,
+ axes=[1],
+ starts=[0],
+ ends=[1]),
+ axis=[1])
+ pred_bm_cls = paddle.squeeze(paddle.slice(pred_bm,
+ axes=[1],
+ starts=[1],
+ ends=[2]),
+ axis=[1])
+
+ bm_mask = self._get_mask(self.dscale, self.tscale)
+
+ pem_reg_loss = self.pem_reg_loss_func(pred_bm_reg, gt_iou_map, bm_mask)
+ pem_cls_loss = self.pem_cls_loss_func(pred_bm_cls, gt_iou_map, bm_mask)
+
+ tem_loss = self.tem_loss_func(pred_start, pred_end, gt_start, gt_end)
+
+ loss = tem_loss + 10 * pem_reg_loss + pem_cls_loss
+ return loss
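
For intuition, the validity mask built by _get_mask keeps only proposals whose end stays inside the sequence; a toy run with dscale = tscale = 4 shows the pattern:

    mask = BMNLoss(dscale=4, tscale=4)._get_mask(4, 4)
    print(mask.numpy().astype('int32'))
    # [[1 1 1 1]
    #  [1 1 1 0]
    #  [1 1 0 0]
    #  [1 0 0 0]]
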
diff --git a/paddlevideo/modeling/losses/cross_entropy_loss.py b/paddlevideo/modeling/losses/cross_entropy_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..953f77c0791ed12b1626f0e1089fa9fdab81c386
--- /dev/null
+++ b/paddlevideo/modeling/losses/cross_entropy_loss.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+ # Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+
+from ..registry import LOSSES
+from .base import BaseWeightedLoss
+
+
+@LOSSES.register()
+class CrossEntropyLoss(BaseWeightedLoss):
+ """Cross Entropy Loss."""
+ def _forward(self, score, labels, **kwargs):
+ """Forward function.
+ Args:
+ score (paddle.Tensor): The class score.
+ labels (paddle.Tensor): The ground truth labels.
+ kwargs: Any keyword argument to be used to calculate
+ CrossEntropy loss.
+ Returns:
+ loss (paddle.Tensor): The returned CrossEntropy loss.
+ """
+ loss = F.cross_entropy(score, labels, **kwargs)
+ return loss
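
A typical call with illustrative sizes; forward comes from BaseWeightedLoss, so the result is already scaled by loss_weight:

    import paddle

    criterion = CrossEntropyLoss(loss_weight=1.0)
    score = paddle.randn([8, 400])               # [N, num_classes] logits
    labels = paddle.randint(0, 400, [8])         # [N]
    print(criterion(score, labels))              # scalar loss
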
diff --git a/paddlevideo/modeling/losses/depth_loss.py b/paddlevideo/modeling/losses/depth_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba9a2cb04efb697052f7412108520cb8707862b5
--- /dev/null
+++ b/paddlevideo/modeling/losses/depth_loss.py
@@ -0,0 +1,290 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+ # Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+from ..registry import LOSSES
+from .base import BaseWeightedLoss
+
+
+def get_smooth_loss(disp, img):
+ """Computes the smoothness loss for a disparity image
+ The color image is used for edge-aware smoothness
+ """
+ grad_disp_x = paddle.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:])
+ grad_disp_y = paddle.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :])
+
+ grad_img_x = paddle.mean(paddle.abs(img[:, :, :, :-1] - img[:, :, :, 1:]),
+ 1,
+ keepdim=True)
+ grad_img_y = paddle.mean(paddle.abs(img[:, :, :-1, :] - img[:, :, 1:, :]),
+ 1,
+ keepdim=True)
+
+ grad_disp_x *= paddle.exp(-grad_img_x)
+ grad_disp_y *= paddle.exp(-grad_img_y)
+
+ return grad_disp_x.mean() + grad_disp_y.mean()
+
+
+class DiffLoss(nn.Layer):
+ def __init__(self):
+ super(DiffLoss, self).__init__()
+
+ def forward(self, input1, input2):
+ batch_size = input1.shape[0]
+ input1 = input1.reshape([batch_size, -1])
+ input2 = input2.reshape([batch_size, -1])
+
+ input1_l2 = input1
+ input2_l2 = input2
+
+ diff_loss = 0
+ dim = input1.shape[1]
+ for i in range(input1.shape[0]):
+ diff_loss = diff_loss + paddle.mean(
+ ((input1_l2[i:i + 1, :].mm(input2_l2[i:i + 1, :].T)).pow(2)) /
+ dim)
+
+ diff_loss = diff_loss / input1.shape[0]
+
+ return diff_loss
+
+
+class MSE(nn.Layer):
+ def __init__(self):
+ super(MSE, self).__init__()
+
+ def forward(self, pred, real):
+ diffs = paddle.add(real, -pred)
+ n = paddle.numel(diffs)
+ mse = paddle.sum(diffs.pow(2)) / n
+
+ return mse
+
+
+class SIMSE(nn.Layer):
+ def __init__(self):
+ super(SIMSE, self).__init__()
+
+ def forward(self, pred, real):
+ diffs = paddle.add(real, -pred)
+ n = paddle.numel(diffs)
+ simse = paddle.sum(diffs).pow(2) / (n**2)
+
+ return simse
+
+
+class SSIM(nn.Layer):
+ """Layer to compute the SSIM loss between a pair of images
+ """
+ def __init__(self):
+ super(SSIM, self).__init__()
+ self.mu_x_pool = nn.AvgPool2D(3, 1, exclusive=False)
+ self.mu_y_pool = nn.AvgPool2D(3, 1, exclusive=False)
+ self.sig_x_pool = nn.AvgPool2D(3, 1, exclusive=False)
+ self.sig_y_pool = nn.AvgPool2D(3, 1, exclusive=False)
+ self.sig_xy_pool = nn.AvgPool2D(3, 1, exclusive=False)
+
+ self.refl = nn.Pad2D(1, mode='reflect')
+
+ self.C1 = 0.01**2
+ self.C2 = 0.03**2
+
+ def forward(self, x, y):
+ x = self.refl(x)
+ y = self.refl(y)
+
+ mu_x = self.mu_x_pool(x)
+ mu_y = self.mu_y_pool(y)
+
+ sigma_x = self.sig_x_pool(x**2) - mu_x**2
+ sigma_y = self.sig_y_pool(y**2) - mu_y**2
+ sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y
+
+ SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2)
+ SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2)
+
+ return paddle.clip((1 - SSIM_n / SSIM_d) / 2, 0, 1)
+
+
+@LOSSES.register()
+class ADDSLoss(BaseWeightedLoss):
+ def __init__(self, avg_reprojection, disparity_smoothness, no_ssim):
+ super(ADDSLoss, self).__init__()
+ self.avg_reprojection = avg_reprojection
+ self.disparity_smoothness = disparity_smoothness
+ self.no_ssim = no_ssim
+
+ self.loss_diff = DiffLoss()
+ self.loss_recon1 = MSE()
+ self.loss_recon2 = SIMSE()
+ self.loss_similarity = MSE()
+
+ def compute_reprojection_loss(self, pred, target):
+ """Computes reprojection loss between a batch of predicted and target images
+ """
+ abs_diff = paddle.abs(target - pred)
+ l1_loss = abs_diff.mean(1, True)
+
+ if not self.no_ssim:
+ self.ssim = SSIM()
+
+ if self.no_ssim:
+ reprojection_loss = l1_loss
+ else:
+ ssim_loss = self.ssim(pred, target).mean(1, True)
+ reprojection_loss = 0.85 * ssim_loss + 0.15 * l1_loss
+
+ return reprojection_loss
+
+ def compute_losses(self, inputs, outputs, is_night):
+ """Compute the reprojection and smoothness losses for a minibatch
+ """
+ losses = {}
+ total_loss = 0
+
+ for scale in outputs['scales']:
+ loss = 0
+ reprojection_losses = []
+
+ source_scale = 0
+
+ disp = outputs[("disp", scale)]
+ if is_night:
+ color = inputs[("color_n", 0, scale)]
+ target = inputs[("color_n", 0, source_scale)]
+ else:
+ color = inputs[("color", 0, scale)]
+ target = inputs[("color", 0, source_scale)]
+
+ for frame_id in outputs['frame_ids'][1:]:
+ pred = outputs[("color", frame_id, scale)]
+ reprojection_losses.append(
+ self.compute_reprojection_loss(pred, target))
+
+ reprojection_losses = paddle.concat(reprojection_losses, 1)
+
+ identity_reprojection_losses = []
+ for frame_id in outputs['frame_ids'][1:]:
+ if is_night:
+ pred = inputs[("color_n", frame_id, source_scale)]
+ else:
+ pred = inputs[("color", frame_id, source_scale)]
+ identity_reprojection_losses.append(
+ self.compute_reprojection_loss(pred, target))
+
+ identity_reprojection_losses = paddle.concat(
+ identity_reprojection_losses, 1)
+
+ if self.avg_reprojection:
+ identity_reprojection_loss = identity_reprojection_losses.mean(
+ 1, keepdim=True)
+ else:
+ # save both images, and do min all at once below
+ identity_reprojection_loss = identity_reprojection_losses
+
+ if self.avg_reprojection:
+ reprojection_loss = reprojection_losses.mean(1, keepdim=True)
+ else:
+ reprojection_loss = reprojection_losses
+
+ # add random numbers to break ties
+ identity_reprojection_loss = identity_reprojection_loss + paddle.randn(
+ identity_reprojection_loss.shape) * 0.00001
+
+ combined = paddle.concat(
+ (identity_reprojection_loss, reprojection_loss), axis=1)
+ if combined.shape[1] == 1:
+ to_optimise = combined
+ else:
+ to_optimise = paddle.min(combined, axis=1)
+
+ loss = loss + to_optimise.mean()
+
+ mean_disp = disp.mean(2, True).mean(3, True)
+ norm_disp = disp / (mean_disp + 1e-7)
+ smooth_loss = get_smooth_loss(norm_disp, color)
+
+ loss = loss + self.disparity_smoothness * smooth_loss / (2**scale)
+ total_loss = total_loss + loss
+ losses["loss/{}".format(scale)] = loss
+
+ total_loss /= len(outputs['scales'])
+ losses["loss"] = total_loss
+ return losses
+
+ def forward(self, inputs, outputs):
+
+ losses_day = self.compute_losses(inputs, outputs, 'day')
+ losses_night = self.compute_losses(inputs, outputs['outputs_night'],
+ 'night')
+
+ loss = 0
+ losses = []
+ # diff
+ target_diff1 = 0.5 * self.loss_diff(
+ outputs['result'][0], outputs['result'][2]) # 10 when batchsize=1
+ target_diff2 = 0.5 * self.loss_diff(outputs['result_night'][0],
+ outputs['result_night'][2])
+ losses.append(target_diff1)
+ losses.append(target_diff2)
+ loss = loss + target_diff1
+ loss = loss + target_diff2
+
+ target_diff3 = 1 * self.loss_diff(
+ outputs['result'][1], outputs['result'][3]) # 10 when batchsize=1
+ target_diff4 = 1 * self.loss_diff(outputs['result_night'][1],
+ outputs['result_night'][3])
+ losses.append(target_diff3)
+ losses.append(target_diff4)
+ loss = loss + target_diff3
+ loss = loss + target_diff4
+
+ # recon
+ target_mse = 1 * self.loss_recon1(outputs['result'][5],
+ inputs["color_aug", 0, 0])
+ loss = loss + target_mse
+
+ target_simse = 1 * self.loss_recon2(outputs['result'][5],
+ inputs["color_aug", 0, 0])
+ loss = loss + target_simse
+
+ losses.append(target_mse)
+ losses.append(target_simse)
+ target_mse_night = 1 * self.loss_recon1(outputs['result_night'][5],
+ inputs["color_n_aug", 0, 0])
+ loss = loss + target_mse_night
+
+ target_simse_night = 1 * self.loss_recon2(outputs['result_night'][5],
+ inputs["color_n_aug", 0, 0])
+ loss = loss + target_simse_night
+
+ losses.append(target_mse_night)
+ losses.append(target_simse_night)
+
+ # depth loss
+ pseudo_label = outputs[("disp", 0)].detach()
+ depth_loss = 1 * self.loss_similarity(
+ outputs['outputs_night'][("disp", 0)], pseudo_label)
+ loss = loss + depth_loss
+
+ losses.append(depth_loss)
+
+ outputs['loss'] = loss + losses_day['loss'] + losses_night['loss']
+ outputs['losses_day'] = losses_day['loss']
+ outputs['losses_night'] = losses_night['loss']
+
+ return outputs
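
A small sketch of the edge-aware smoothness term defined at the top of this file, with made-up resolutions:

    import paddle

    disp = paddle.rand([2, 1, 192, 640])    # disparity maps
    img = paddle.rand([2, 3, 192, 640])     # colour images used to down-weight edges
    print(get_smooth_loss(disp, img))       # scalar; disparity gradients at image edges are down-weighted
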
diff --git a/paddlevideo/modeling/losses/transnetv2_loss.py b/paddlevideo/modeling/losses/transnetv2_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..624c46852857e714009a033d52933f66cb1f95c7
--- /dev/null
+++ b/paddlevideo/modeling/losses/transnetv2_loss.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+ # Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+from ..registry import LOSSES
+from .base import BaseWeightedLoss
+
+
+@LOSSES.register()
+class TransNetV2Loss(BaseWeightedLoss):
+ """Loss for TransNetV2 model
+ """
+ def __init__(self, transition_weight=5.0, many_hot_loss_weight=0.1):
+ self.transition_weight = transition_weight
+ self.many_hot_loss_weight = many_hot_loss_weight
+ super().__init__()
+
+ def _forward(self, one_hot_pred, one_hot_gt,
+ many_hot_pred=None, many_hot_gt=None, reg_losses=None):
+ assert self.transition_weight != 1
+
+ one_hot_pred = one_hot_pred[:, :, 0]
+
+ one_hot_gt = one_hot_gt.astype('float32')
+ one_hot_loss = F.binary_cross_entropy_with_logits(logit=one_hot_pred, label=one_hot_gt, reduction='none')
+
+ one_hot_loss *= 1 + one_hot_gt * (self.transition_weight - 1)
+
+ one_hot_loss = paddle.mean(one_hot_loss)
+
+ many_hot_loss = 0.
+ if self.many_hot_loss_weight != 0. and many_hot_pred is not None:
+ many_hot_loss = self.many_hot_loss_weight * paddle.mean(
+ F.binary_cross_entropy_with_logits(logit=many_hot_pred[:, :, 0],
+ label=many_hot_gt.astype('float32'), reduction='none'))
+
+ total_loss = one_hot_loss + many_hot_loss
+
+ if reg_losses is not None:
+ for name, value in reg_losses.items():
+ if value is not None:
+ total_loss += value
+
+ return total_loss
\ No newline at end of file
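
A smoke test for the loss above with random logits and labels; the sizes are illustrative, TransNetV2 itself produces per-frame transition logits:

    import paddle

    criterion = TransNetV2Loss(transition_weight=5.0, many_hot_loss_weight=0.1)
    B, T = 2, 100
    one_hot_pred = paddle.randn([B, T, 1])       # single-frame transition logits
    many_hot_pred = paddle.randn([B, T, 1])      # "all frames of a transition" logits
    one_hot_gt = paddle.randint(0, 2, [B, T])
    many_hot_gt = paddle.randint(0, 2, [B, T])
    print(criterion(one_hot_pred, one_hot_gt, many_hot_pred, many_hot_gt))
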
diff --git a/paddlevideo/modeling/registry.py b/paddlevideo/modeling/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8140e1c2ede0cd3bf08c2f8108dcbe48bcf9f2d
--- /dev/null
+++ b/paddlevideo/modeling/registry.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+ # Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..utils import Registry
+
+BACKBONES = Registry('backbone')
+HEADS = Registry('head')
+RECOGNIZERS = Registry('recognizer')
+SEGMENTERS = Registry('Segmenters')
+LOCALIZERS = Registry('localizer')
+PARTITIONERS = Registry('partitioner')
+LOSSES = Registry('loss')
+ROI_EXTRACTORS = Registry('roi_extractor')
+DETECTORS = Registry('detectors')
+BBOX_ASSIGNERS = Registry('bbox_assigner')
+BBOX_SAMPLERS = Registry('bbox_sampler')
+BBOX_CODERS = Registry('bbox_coder')
+ESTIMATORS = Registry('estimator')
+MULTIMODAL = Registry('multimodal')
+SEGMENT = Registry('segment')
diff --git a/paddlevideo/modeling/samplers/__init__.py b/paddlevideo/modeling/samplers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cf7f15e5fe624698fdca9751b312187a4999a64
--- /dev/null
+++ b/paddlevideo/modeling/samplers/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+ # Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .random_sampler import RandomSampler
+
+__all__ = ['RandomSampler']
diff --git a/paddlevideo/modeling/samplers/random_sampler.py b/paddlevideo/modeling/samplers/random_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b9f417886e83cb0a81ef370141ed0f8ca64c429
--- /dev/null
+++ b/paddlevideo/modeling/samplers/random_sampler.py
@@ -0,0 +1,146 @@
+ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+import numpy as np
+from ..registry import BBOX_SAMPLERS
+
+class SamplingResult():
+ """Bbox sampling result. """
+
+ def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result,
+ gt_flags):
+ self.pos_inds = pos_inds
+ self.neg_inds = neg_inds
+ self.pos_bboxes = paddle.index_select(bboxes, pos_inds)
+
+ # neg_inds may be empty
+ if neg_inds.shape[0] != 0:
+ self.neg_bboxes = paddle.index_select(bboxes, neg_inds)
+ else:
+ self.neg_bboxes = None
+
+ self.pos_is_gt = paddle.index_select(gt_flags, pos_inds)
+ self.num_gts = gt_bboxes.shape[0]
+ self.pos_assigned_gt_inds = paddle.index_select(assign_result.gt_inds, pos_inds) - 1
+
+ if gt_bboxes.numel().numpy()[0] == 0:
+ assert self.pos_assigned_gt_inds.numel() == 0
+ self.pos_gt_bboxes = paddle.empty_like(gt_bboxes).reshape([-1, 4])
+ else:
+ if len(gt_bboxes.shape) < 2:
+ gt_bboxes = gt_bboxes.reshape([-1, 4])
+
+ self.pos_gt_bboxes = paddle.index_select(gt_bboxes, self.pos_assigned_gt_inds)
+
+ if assign_result.labels is not None:
+ self.pos_gt_labels = paddle.index_select(assign_result.labels, pos_inds)
+ else:
+ self.pos_gt_labels = None
+
+ @property
+ def bboxes(self):
+ if self.neg_bboxes is not None:
+ ret = paddle.concat([self.pos_bboxes, self.neg_bboxes])
+ else:
+ # neg bbox may be empty
+ ret = self.pos_bboxes
+ return ret
+
+
+
+@BBOX_SAMPLERS.register()
+class RandomSampler():
+ def __init__(self,
+ num,
+ pos_fraction,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True,
+ **kwargs):
+ self.num = num
+ self.pos_fraction = pos_fraction
+ self.neg_pos_ub = neg_pos_ub
+ self.add_gt_as_proposals = add_gt_as_proposals
+
+ def sample(self,
+ assign_result,
+ bboxes,
+ gt_bboxes,
+ gt_labels=None,
+ **kwargs):
+ """Sample positive and negative bboxes. """
+
+ if len(bboxes.shape) < 2:
+ bboxes = bboxes[None, :]
+
+ bboxes = bboxes[:, :4]
+
+ gt_flags = paddle.full([bboxes.shape[0], ], 0, dtype='int32')
+ if self.add_gt_as_proposals and len(gt_bboxes) > 0:
+ if gt_labels is None:
+ raise ValueError(
+ 'gt_labels must be given when add_gt_as_proposals is True')
+ bboxes = paddle.concat([gt_bboxes, bboxes])
+ assign_result.add_gt_(gt_labels)
+ gt_ones = paddle.full([gt_bboxes.shape[0], ], 1, dtype='int32')
+ gt_flags = paddle.concat([gt_ones, gt_flags])
+
+ # 1. get the number of positive samples and their indices
+ num_expected_pos = int(self.num * self.pos_fraction)
+ pos_inds = self._sample_pos( assign_result, num_expected_pos, bboxes=bboxes, **kwargs)
+ pos_inds = paddle.to_tensor(np.unique(pos_inds.numpy()))
+
+ # 2. get the number of negative samples and their indices
+ num_sampled_pos = pos_inds.numel()
+ num_expected_neg = self.num - num_sampled_pos
+ neg_inds = self._sample_neg(
+ assign_result, num_expected_neg, bboxes=bboxes, **kwargs)
+ neg_inds = paddle.to_tensor(np.unique(neg_inds.numpy()))
+
+ # 3. build the sampling result
+ sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
+ assign_result, gt_flags)
+ return sampling_result
+
+ def random_choice(self, gallery, num):
+ """Randomly select some elements from the gallery."""
+ assert len(gallery) >= num
+
+ perm = paddle.randperm(gallery.numel())[:num]
+ rand_inds = paddle.index_select(gallery, perm)
+ return rand_inds
+
+ def _sample_pos(self, assign_result, num_expected, **kwargs):
+ """Randomly sample some positive samples."""
+ # 1. find which of the given bboxes were assigned a label greater than 0 and get their indices
+ pos_inds = paddle.nonzero(assign_result.gt_inds, as_tuple=False)
+
+ # 2. as long as pos_inds is not empty, all of them can be used as positive samples;
+ # when the number of pos_inds is no more than num_expected (the maximum number of samples wanted), use pos_inds directly,
+ # otherwise randomly sample num_expected indices from them
+ if pos_inds.numel().numpy()[0] != 0:
+ pos_inds = pos_inds.squeeze()
+ if pos_inds.numel().numpy()[0] <= num_expected:
+ return pos_inds
+ else:
+ return self.random_choice(pos_inds, num_expected)
+
+ def _sample_neg(self, assign_result, num_expected, **kwargs):
+ """Randomly sample some negative samples."""
+ neg_inds = paddle.nonzero(assign_result.gt_inds == 0, as_tuple=False)
+ if neg_inds.numel().numpy()[0] != 0:
+ neg_inds = neg_inds.squeeze()
+ if (neg_inds.numel().numpy()[0]) <= num_expected.numpy()[0]:
+ return neg_inds
+ else:
+ return self.random_choice(neg_inds, num_expected)
diff --git a/paddlevideo/modeling/weight_init.py b/paddlevideo/modeling/weight_init.py
new file mode 100644
index 0000000000000000000000000000000000000000..4722895265b3bfa1962e959de9dfdbe3ced6d1fc
--- /dev/null
+++ b/paddlevideo/modeling/weight_init.py
@@ -0,0 +1,157 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+ # Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import paddle
+import paddle.nn.initializer as init
+import numpy as np
+from scipy import special
+
+
+def weight_init_(layer,
+ func,
+ weight_name=None,
+ bias_name=None,
+ bias_value=0.0,
+ **kwargs):
+ """
+ In-place params init function.
+ Usage:
+ .. code-block:: python
+
+ import paddle
+ import numpy as np
+
+ data = np.ones([3, 4], dtype='float32')
+ linear = paddle.nn.Linear(4, 4)
+ input = paddle.to_tensor(data)
+ print(linear.weight)
+ linear(input)
+
+ weight_init_(linear, 'Normal', 'fc_w0', 'fc_b0', std=0.01, mean=0.1)
+ print(linear.weight)
+ """
+
+ if hasattr(layer, 'weight') and layer.weight is not None:
+ getattr(init, func)(**kwargs)(layer.weight)
+ if weight_name is not None:
+ # override weight name
+ layer.weight.name = weight_name
+
+ if hasattr(layer, 'bias') and layer.bias is not None:
+ init.Constant(bias_value)(layer.bias)
+ if bias_name is not None:
+ # override bias name
+ layer.bias.name = bias_name
+
+
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+ def norm_cdf(x):
+ # Computes standard normal cumulative distribution function
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
+
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
+ print("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+ "The distribution of values may be incorrect.")
+
+ with paddle.no_grad():
+ # Values are generated by using a truncated uniform distribution and
+ # then using the inverse CDF for the normal distribution.
+ # Get upper and lower cdf values
+ l = norm_cdf((a - mean) / std)
+ u = norm_cdf((b - mean) / std)
+
+ # Uniformly fill tensor with values from [l, u], then translate to [2l-1, 2u-1].
+ tmp = np.random.uniform(2 * l - 1, 2 * u - 1,
+ size=list(tensor.shape)).astype(np.float32)
+
+ # Use inverse cdf transform for normal distribution to get truncated
+ # standard normal
+ tmp = special.erfinv(tmp)
+
+ # Transform to proper mean, std
+ tmp *= (std * math.sqrt(2.0))
+ tmp += mean
+
+ # Clamp to ensure it's in the proper range
+ tmp = np.clip(tmp, a, b)
+ tensor.set_value(paddle.to_tensor(tmp))
+
+ return tensor
+
+
+def _calculate_fan_in_and_fan_out(tensor):
+ dimensions = tensor.dim()
+ if dimensions < 2:
+ raise ValueError(
+ "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
+ )
+
+ num_input_fmaps = tensor.shape[1]
+ num_output_fmaps = tensor.shape[0]
+ receptive_field_size = 1
+ if tensor.dim() > 2:
+ receptive_field_size = tensor[0][0].numel()
+ fan_in = num_input_fmaps * receptive_field_size
+ fan_out = num_output_fmaps * receptive_field_size
+
+ return fan_in, fan_out
+
+
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
+
+
+def kaiming_normal_(tensor, a=0., mode='fan_in', nonlinearity='leaky_relu'):
+ def _calculate_correct_fan(tensor, mode):
+ mode = mode.lower()
+ valid_modes = ['fan_in', 'fan_out']
+ if mode not in valid_modes:
+ raise ValueError(
+ "Mode {} not supported, please use one of {}".format(
+ mode, valid_modes))
+
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
+ return fan_in if mode == 'fan_in' else fan_out
+
+ def calculate_gain(nonlinearity, param=None):
+ linear_fns = [
+ 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',
+ 'conv_transpose2d', 'conv_transpose3d'
+ ]
+ if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
+ return 1
+ elif nonlinearity == 'tanh':
+ return 5.0 / 3
+ elif nonlinearity == 'relu':
+ return math.sqrt(2.0)
+ elif nonlinearity == 'leaky_relu':
+ if param is None:
+ negative_slope = 0.01
+ elif not isinstance(param, bool) and isinstance(
+ param, int) or isinstance(param, float):
+ negative_slope = param
+ else:
+ raise ValueError(
+ "negative_slope {} not a valid number".format(param))
+ return math.sqrt(2.0 / (1 + negative_slope**2))
+ else:
+ raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
+
+ fan = _calculate_correct_fan(tensor, mode)
+ gain = calculate_gain(nonlinearity, a)
+ std = gain / math.sqrt(fan)
+ with paddle.no_grad():
+ paddle.nn.initializer.Normal(0, std)(tensor)
+ return tensor
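
A short usage sketch for trunc_normal_, the in-place truncated-normal re-initialisation above (it relies on scipy for erfinv); the layer size is illustrative only:

    import paddle

    linear = paddle.nn.Linear(768, 384)          # example size
    trunc_normal_(linear.weight, std=0.02)       # resample weights, truncated to [-2, 2]
    print(float(paddle.std(linear.weight)))      # roughly 0.02
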
diff --git a/paddlevideo/solver/__init__.py b/paddlevideo/solver/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..01cf9cdd763493911412d8943bf19066dd52e88b
--- /dev/null
+++ b/paddlevideo/solver/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import build_optimizer
+from .lr import build_lr
diff --git a/paddlevideo/solver/custom_lr.py b/paddlevideo/solver/custom_lr.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbf8d742af7ac9387ddccca70efca68e3a4f7f57
--- /dev/null
+++ b/paddlevideo/solver/custom_lr.py
@@ -0,0 +1,338 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from paddle.optimizer.lr import *
+import numpy as np
+"""
+PaddleVideo Learning Rate Schedule:
+You can use paddle.optimizer.lr
+or define your custom_lr in this file.
+"""
+
+
+class CustomWarmupCosineDecay(LRScheduler):
+ r"""
+ We combine warmup with a stepwise-updated cosine schedule, as used in the SlowFast model.
+
+ Args:
+ warmup_start_lr (float): start learning rate used in warmup stage.
+ warmup_epochs (int): the number of warmup epochs.
+ cosine_base_lr (float|int, optional): base learning rate in the cosine schedule.
+ max_epoch (int): total training epochs.
+ num_iters (int): the number of iterations in each epoch.
+ last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+ verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+ Returns:
+ ``CustomWarmupCosineDecay`` instance to schedule learning rate.
+ """
+
+ def __init__(self,
+ warmup_start_lr,
+ warmup_epochs,
+ cosine_base_lr,
+ max_epoch,
+ num_iters,
+ last_epoch=-1,
+ verbose=False):
+ self.warmup_start_lr = warmup_start_lr
+ self.warmup_epochs = warmup_epochs
+ self.cosine_base_lr = cosine_base_lr
+ self.max_epoch = max_epoch
+ self.num_iters = num_iters
+ # call step() in base class; last_lr/last_epoch/base_lr will be updated
+ super(CustomWarmupCosineDecay, self).__init__(last_epoch=last_epoch,
+ verbose=verbose)
+
+ def step(self, epoch=None):
+ """
+ ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .
+ The new learning rate will take effect on next ``optimizer.step`` .
+ Args:
+ epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
+ Returns:
+ None
+ """
+ if epoch is None:
+ if self.last_epoch == -1:
+ self.last_epoch += 1
+ else:
+ self.last_epoch += 1 / self.num_iters # update step with iters
+ else:
+ self.last_epoch = epoch
+ self.last_lr = self.get_lr()
+
+ if self.verbose:
+ print('Epoch {}: {} set learning rate to {}.'.format(
+ self.last_epoch, self.__class__.__name__, self.last_lr))
+
+ def _lr_func_cosine(self, cur_epoch, cosine_base_lr, max_epoch):
+ return cosine_base_lr * (math.cos(math.pi * cur_epoch / max_epoch) +
+ 1.0) * 0.5
+
+ def get_lr(self):
+ """Define lr policy"""
+ lr = self._lr_func_cosine(self.last_epoch, self.cosine_base_lr,
+ self.max_epoch)
+ lr_end = self._lr_func_cosine(self.warmup_epochs, self.cosine_base_lr,
+ self.max_epoch)
+
+ # Perform warm up.
+ if self.last_epoch < self.warmup_epochs:
+ lr_start = self.warmup_start_lr
+ alpha = (lr_end - lr_start) / self.warmup_epochs
+ lr = self.last_epoch * alpha + lr_start
+ return lr
+
+
+class CustomWarmupPiecewiseDecay(LRScheduler):
+ r"""
+ This scheduler combines warmup and relative stepwise decay, as used in the SlowFast model.
+
+ Args:
+ warmup_start_lr (float): start learning rate used in warmup stage.
+ warmup_epochs (int): the number of warmup epochs.
+ step_base_lr (float|int, optional): base learning rate in the step schedule.
+ max_epoch (int): total training epochs.
+ num_iters (int): the number of iterations in each epoch.
+ last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+ verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+ Returns:
+ ``CustomWarmupPiecewiseDecay`` instance to schedule learning rate.
+ """
+
+ def __init__(self,
+ warmup_start_lr,
+ warmup_epochs,
+ step_base_lr,
+ lrs,
+ gamma,
+ steps,
+ max_epoch,
+ num_iters,
+ last_epoch=0,
+ verbose=False):
+ self.warmup_start_lr = warmup_start_lr
+ self.warmup_epochs = warmup_epochs
+ self.step_base_lr = step_base_lr
+ self.lrs = lrs
+ self.gamma = gamma
+ self.steps = steps
+ self.max_epoch = max_epoch
+ self.num_iters = num_iters
+ self.last_epoch = last_epoch
+ self.last_lr = self.warmup_start_lr # used in first iter
+ self.verbose = verbose
+ self._var_name = None
+
+ def step(self, epoch=None, rebuild=False):
+ """
+ ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .
+ The new learning rate will take effect on next ``optimizer.step`` .
+ Args:
+ epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
+ Returns:
+ None
+ """
+ if epoch is None:
+ if not rebuild:
+ self.last_epoch += 1 / self.num_iters # update step with iters
+ else:
+ self.last_epoch = epoch
+ self.last_lr = self.get_lr()
+
+ if self.verbose:
+ print(
+ 'step Epoch {}: {} set learning rate to {}.self.num_iters={}, 1/self.num_iters={}'
+ .format(self.last_epoch, self.__class__.__name__, self.last_lr,
+ self.num_iters, 1 / self.num_iters))
+
+ def _lr_func_steps_with_relative_lrs(self, cur_epoch, lrs, base_lr, steps,
+ max_epoch):
+ # get step index
+ steps = steps + [max_epoch]
+ for ind, step in enumerate(steps):
+ if cur_epoch < step:
+ break
+ if self.verbose:
+ print(
+ '_lr_func_steps_with_relative_lrs, cur_epoch {}: {}, steps {}, ind {}, step{}, max_epoch{}'
+ .format(cur_epoch, self.__class__.__name__, steps, ind, step,
+ max_epoch))
+
+ return lrs[ind - 1] * base_lr
+
+ def get_lr(self):
+ """Define lr policy"""
+ lr = self._lr_func_steps_with_relative_lrs(
+ self.last_epoch,
+ self.lrs,
+ self.step_base_lr,
+ self.steps,
+ self.max_epoch,
+ )
+ lr_end = self._lr_func_steps_with_relative_lrs(
+ self.warmup_epochs,
+ self.lrs,
+ self.step_base_lr,
+ self.steps,
+ self.max_epoch,
+ )
+
+ # Perform warm up.
+ if self.last_epoch < self.warmup_epochs:
+ lr_start = self.warmup_start_lr
+ alpha = (lr_end - lr_start) / self.warmup_epochs
+ lr = self.last_epoch * alpha + lr_start
+ if self.verbose:
+ print(
+ 'get_lr, Epoch {}: {}, lr {}, lr_end {}, self.lrs{}, self.step_base_lr{}, self.steps{}, self.max_epoch{}'
+ .format(self.last_epoch, self.__class__.__name__, lr, lr_end,
+ self.lrs, self.step_base_lr, self.steps,
+ self.max_epoch))
+
+ return lr
+
+
+class CustomPiecewiseDecay(PiecewiseDecay):
+
+ def __init__(self, **kargs):
+ kargs.pop('num_iters')
+ super().__init__(**kargs)
+
+
+class CustomWarmupCosineStepDecay(LRScheduler):
+
+ def __init__(self,
+ warmup_iters,
+ warmup_ratio=0.1,
+ min_lr=0,
+ base_lr=3e-5,
+ max_epoch=30,
+ last_epoch=-1,
+ num_iters=None,
+ verbose=False):
+
+ self.warmup_ratio = warmup_ratio
+ self.min_lr = min_lr
+ self.warmup_epochs = warmup_iters
+ self.warmup_iters = warmup_iters * num_iters
+ self.cnt_iters = 0
+ self.cnt_epoch = 0
+ self.num_iters = num_iters
+ self.tot_iters = max_epoch * num_iters
+ self.max_epoch = max_epoch
+ self.cosine_base_lr = base_lr # initial lr for all param groups
+ self.regular_lr = self.get_regular_lr()
+ super().__init__(last_epoch=last_epoch, verbose=verbose)
+
+ def annealing_cos(self, start, end, factor, weight=1):
+ cos_out = math.cos(math.pi * factor) + 1
+ return end + 0.5 * weight * (start - end) * cos_out
+
+ def get_regular_lr(self):
+ progress = self.cnt_epoch
+ max_progress = self.max_epoch
+ target_lr = self.min_lr
+ return self.annealing_cos(self.cosine_base_lr, target_lr, progress /
+ max_progress) # self.cosine_base_lr
+
+ def get_warmup_lr(self, cur_iters):
+ k = (1 - cur_iters / self.warmup_iters) * (1 - self.warmup_ratio)
+ warmup_lr = self.regular_lr * (1 - k) # 3e-5 * (1-k)
+ return warmup_lr
+
+ def step(self, epoch=None):
+ self.regular_lr = self.get_regular_lr()
+ self.last_lr = self.get_lr()
+ self.cnt_epoch = (self.cnt_iters +
+ 1) // self.num_iters # update step with iters
+ self.cnt_iters += 1
+
+ if self.verbose:
+ print('Epoch {}: {} set learning rate to {}.'.format(
+ self.last_epoch, self.__class__.__name__, self.last_lr))
+
+ def get_lr(self):
+ """Define lr policy"""
+ cur_iter = self.cnt_iters
+ if cur_iter >= self.warmup_iters:
+ return self.regular_lr
+ else:
+ warmup_lr = self.get_warmup_lr(cur_iter)
+ return warmup_lr
+
+
+class CustomWarmupAdjustDecay(LRScheduler):
+ r"""
+ We combine warmup with a stepwise decay based on epoch boundaries.
+
+ Args:
+ step_base_lr (float): start learning rate used in warmup stage.
+ warmup_epochs (int): the number epochs of warmup.
+ lr_decay_rate (float|int, optional): base learning rate decay rate.
+ boundaries (list[int]): epoch boundaries at which the learning rate is decayed.
+ last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+ verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+ Returns:
+ ``CustomWarmupAdjustDecay`` instance to schedule learning rate.
+ """
+
+ def __init__(self,
+ step_base_lr,
+ warmup_epochs,
+ lr_decay_rate,
+ boundaries,
+ num_iters=None,
+ last_epoch=-1,
+ verbose=False):
+ self.step_base_lr = step_base_lr
+ self.warmup_epochs = warmup_epochs
+ self.lr_decay_rate = lr_decay_rate
+ self.boundaries = boundaries
+ self.num_iters = num_iters
+ # call step() in base class; last_lr/last_epoch/base_lr will be updated
+ super(CustomWarmupAdjustDecay, self).__init__(last_epoch=last_epoch,
+ verbose=verbose)
+
+ def step(self, epoch=None):
+ """
+ ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .
+ The new learning rate will take effect on next ``optimizer.step`` .
+ Args:
+ epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
+ Returns:
+ None
+ """
+ if epoch is None:
+ if self.last_epoch == -1:
+ self.last_epoch += 1
+ else:
+ self.last_epoch += 1 / self.num_iters # update step with iters
+ else:
+ self.last_epoch = epoch
+
+ self.last_lr = self.get_lr()
+
+ if self.verbose:
+ print('Epoch {}: {} set learning rate to {}.'.format(
+ self.last_epoch, self.__class__.__name__, self.last_lr))
+
+ def get_lr(self):
+ if self.last_epoch < self.warmup_epochs:
+ lr = self.step_base_lr * (self.last_epoch + 1) / self.warmup_epochs
+ else:
+ lr = self.step_base_lr * (self.lr_decay_rate**np.sum(
+ self.last_epoch >= np.array(self.boundaries)))
+ return lr
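+
+
+# Minimal usage sketch (values below are illustrative, not taken from a shipped config):
+#   sched = CustomWarmupAdjustDecay(step_base_lr=0.1, warmup_epochs=5,
+#                                   lr_decay_rate=0.1, boundaries=[20, 30],
+#                                   num_iters=100)
+# The lr ramps up linearly to 0.1 over the first 5 epochs, then is multiplied by
+# 0.1 each time the (fractional) epoch crosses 20, and again at 30.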
diff --git a/paddlevideo/solver/lr.py b/paddlevideo/solver/lr.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a56fad16e9ea33395e98998629b6fae958f7353
--- /dev/null
+++ b/paddlevideo/solver/lr.py
@@ -0,0 +1,52 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+
+from paddle.optimizer.lr import LRScheduler
+
+from . import custom_lr
+
+
+def build_lr(cfg: Dict, num_iters: int) -> LRScheduler:
+ """Build a learning rate scheduler accroding to ```OPTIMIZER``` configuration, and it always pass into the optimizer.
+ In configuration:
+ learning_rate:
+ name: 'PiecewiseDecay'
+ boundaries: [20, 60]
+ values: [0.00025, 0.000025, 0.0000025]
+
+ Args:
+ cfg (Dict): learning rate configuration.
+        num_iters (int): the number of iterations per epoch, which may be used when calculating the learning rate.
+
+ Returns:
+ LRScheduler: learning rate scheduler.
+ """
+
+ cfg_copy = cfg.copy()
+
+    # when 'learning_rate' is itself a nested scheduler config, build it recursively
+ if cfg_copy.get('learning_rate') and isinstance(cfg_copy['learning_rate'],
+ dict):
+ cfg_copy['learning_rate'] = build_lr(
+ cfg_copy['learning_rate'],
+            num_iters)  # an iter_step that only appears in the inner config is not supported
+
+ lr_name = cfg_copy.pop('name')
+ if cfg_copy.get('iter_step'):
+ cfg_copy['num_iters'] = num_iters
+ cfg_copy.pop('iter_step')
+
+ return getattr(custom_lr, lr_name)(**cfg_copy)
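+
+
+# Typical call site (see tasks/train.py): lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader)).
+# When the config sets `iter_step`, the flag is popped above and replaced by the concrete
+# `num_iters`, so a scheduler can convert warmup epochs into warmup iterations.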
diff --git a/paddlevideo/solver/optimizer.py b/paddlevideo/solver/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..46ff916c46b7dae925d67d97b792b7f5fd8aab46
--- /dev/null
+++ b/paddlevideo/solver/optimizer.py
@@ -0,0 +1,132 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Dict
+
+import paddle
+from paddle.optimizer.lr import LRScheduler
+from paddle.regularizer import L1Decay, L2Decay
+from paddlevideo.utils import get_logger
+
+
+def build_optimizer(cfg: Dict,
+ lr_scheduler: LRScheduler,
+ model: paddle.nn.Layer,
+ use_amp: bool = False,
+ amp_level: str = None) -> paddle.optimizer.Optimizer:
+ """Build an optimizer and learning rate scheduler to optimize parameters accroding to ```OPTIMIZER``` field in configuration.
+
+ In configuration:
+ OPTIMIZER:
+ name: Momentum
+ momentum: 0.9
+ weight_decay: 0.001
+ or
+
+ OPTIMIZER:
+ name: Momentum
+ momentum: 0.9
+ weight_decay:
+ name: "L1"
+ value: 0.001
+
+    The Momentum optimizer will be applied to optimize the network and an L1Decay regularizer will be applied to reduce overfitting.
+
+ OPTIMIZER:
+ name: Adam
+ weight_decay:
+ name: "L2"
+ value: 0.001
+
+    The Adam optimizer will be applied to optimize the network and an L2Decay regularizer will be applied to reduce overfitting.
+
+ Refer to ```https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/regularizer/L2Decay_en.html``` for more details.
+
+ Args:
+ cfg (Dict): optimizer configuration.
+ lr_scheduler (LRScheduler): learning rate scheduler.
+        model (paddle.nn.Layer): model which contains the parameters to be optimized.
+        use_amp (bool, optional): whether to use automatic mixed precision. Defaults to False.
+ amp_level (str, optional): amp level when amp is enabled. Defaults to None.
+
+
+ Returns:
+ paddle.optimizer.Optimizer: an optimizer for the input model.
+ """
+ logger = get_logger("paddlevideo")
+ cfg_copy = cfg.copy()
+ # NOTE: check none and illegal cfg!!!
+ opt_name = cfg_copy.pop('name')
+ # deal with weight decay
+ if cfg_copy.get('weight_decay'):
+        if isinstance(cfg_copy.get('weight_decay'),
+                      float):  # just a float factor
+            cfg_copy['weight_decay'] = cfg_copy.get('weight_decay')
+        elif 'L1' in cfg_copy.get('weight_decay').get(
+                'name').upper():  # L1 weight decay with its float factor
+            cfg_copy['weight_decay'] = L1Decay(
+                cfg_copy.get('weight_decay').get('value'))
+        elif 'L2' in cfg_copy.get('weight_decay').get(
+                'name').upper():  # L2 weight decay with its float factor
+            cfg_copy['weight_decay'] = L2Decay(
+                cfg_copy.get('weight_decay').get('value'))
+ else:
+ raise ValueError
+
+ # deal with grad clip
+ if cfg_copy.get('grad_clip'):
+        if isinstance(cfg_copy.get('grad_clip'), float):  # just a float factor
+            cfg_copy['grad_clip'] = cfg_copy.get('grad_clip')
+ elif 'global' in cfg_copy.get('grad_clip').get('name').lower():
+ cfg_copy['grad_clip'] = paddle.nn.ClipGradByGlobalNorm(
+ cfg_copy.get('grad_clip').get('value'))
+ else:
+ raise ValueError
+
+    # Exclude selected parameters from weight decay, needed for optimizers such as AdamW
+ if cfg_copy.get('no_weight_decay_name'):
+ no_weight_decay_name = cfg_copy.pop('no_weight_decay_name')
+ no_weight_decay_name_list = no_weight_decay_name.split(' ')
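+        # e.g. no_weight_decay_name: "pos_embed cls_token" (illustrative names) excludes
+        # every parameter whose name contains one of these substrings from weight decay.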
+
+ # NOTE: use param.name not name
+ no_weight_decay_param_list = [
+ param.name for name, param in model.named_parameters()
+ if any(key_word in name for key_word in no_weight_decay_name_list)
+ ] # get the full param name of no weight decay
+
+ _apply_decay_param_fun = lambda name: name not in no_weight_decay_param_list
+ cfg_copy['apply_decay_param_fun'] = _apply_decay_param_fun
+        logger.info(
+            f"No weight decay param list ({len(no_weight_decay_param_list)}): "
+            f"{no_weight_decay_param_list}")
+
+ cfg_copy.pop('learning_rate')
+
+ # set multi_precision
+ optimizer_setting = {
+ 'learning_rate': lr_scheduler,
+ 'parameters': model.parameters(),
+ **cfg_copy
+ }
+    # getfullargspec replaces the deprecated getargspec
+    optimizer_init_args = inspect.getfullargspec(
+        getattr(paddle.optimizer, opt_name).__init__).args
+ if use_amp and amp_level == "O2" and "multi_precision" in optimizer_init_args:
+ # support "multi_precision" arg in optimizer's __init__ function.
+ optimizer_setting.update({"multi_precision": True})
+ logger.info(
+ "Set multi_precision=True for optimizer when use_amp=True and amp_level='O2'"
+ )
+
+ return getattr(paddle.optimizer, opt_name)(**optimizer_setting)
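+
+
+# Typical call site (see tasks/train.py):
+#   optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model,
+#                               use_amp=use_amp, amp_level=amp_level)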
diff --git a/paddlevideo/tasks/__init__.py b/paddlevideo/tasks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d43f0955d20ed75c1b3438503b7b56a1819b034
--- /dev/null
+++ b/paddlevideo/tasks/__init__.py
@@ -0,0 +1,20 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .train import train_model
+from .test import test_model
+from .train_dali import train_dali
+from .train_multigrid import train_model_multigrid
+
+__all__ = ['train_model', 'test_model', 'train_dali', 'train_model_multigrid']
diff --git a/paddlevideo/tasks/test.py b/paddlevideo/tasks/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b4d89cf00f9aafd8cc46cd7d8c330907aa8223f
--- /dev/null
+++ b/paddlevideo/tasks/test.py
@@ -0,0 +1,87 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddlevideo.utils import get_logger, load
+
+from ..loader.builder import build_dataloader, build_dataset
+from ..metrics import build_metric
+from ..modeling.builder import build_model
+
+logger = get_logger("paddlevideo")
+
+
+@paddle.no_grad()
+def test_model(cfg, weights, parallel=True):
+ """Test model entry
+
+ Args:
+ cfg (dict): configuration.
+ weights (str): weights path to load.
+ parallel (bool): Whether to do multi-cards testing. Default: True.
+
+ """
+ # 1. Construct model.
+ if cfg.MODEL.get('backbone') and cfg.MODEL.backbone.get('pretrained'):
+ cfg.MODEL.backbone.pretrained = '' # disable pretrain model init
+ model = build_model(cfg.MODEL)
+
+ if parallel:
+ model = paddle.DataParallel(model)
+
+ # 2. Construct dataset and dataloader.
+ cfg.DATASET.test.test_mode = True
+ dataset = build_dataset((cfg.DATASET.test, cfg.PIPELINE.test))
+ batch_size = cfg.DATASET.get("test_batch_size", 8)
+
+ if cfg.get('use_npu'):
+ places = paddle.set_device('npu')
+ else:
+ places = paddle.set_device('gpu')
+
+ # default num worker: 0, which means no subprocess will be created
+ num_workers = cfg.DATASET.get('num_workers', 0)
+ num_workers = cfg.DATASET.get('test_num_workers', num_workers)
+ dataloader_setting = dict(batch_size=batch_size,
+ num_workers=num_workers,
+ places=places,
+ drop_last=False,
+ shuffle=False)
+
+    # CFBI (VOS) iterates over the dataset directly instead of using a dataloader
+    data_loader = dataset if cfg.model_name in ['CFBI'] else build_dataloader(
+        dataset, **dataloader_setting)
+
+ model.eval()
+
+ state_dicts = load(weights)
+ model.set_state_dict(state_dicts)
+
+ # add params to metrics
+ cfg.METRIC.data_size = len(dataset)
+ cfg.METRIC.batch_size = batch_size
+ Metric = build_metric(cfg.METRIC)
+
+ if cfg.MODEL.framework == "FastRCNN":
+ Metric.set_dataset_info(dataset.info, len(dataset))
+
+ for batch_id, data in enumerate(data_loader):
+        if cfg.model_name in ['CFBI']:
+            # For the VOS task, the dataset yields whole videos and the
+            # dataloader yields the frames of each video.
+ Metric.update(batch_id, data, model)
+ else:
+ outputs = model(data, mode='test')
+ Metric.update(batch_id, data, outputs)
+ Metric.accumulate()
diff --git a/paddlevideo/tasks/train.py b/paddlevideo/tasks/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..90da6efc4ee7ca0b4012406ada6729eb6bbb503f
--- /dev/null
+++ b/paddlevideo/tasks/train.py
@@ -0,0 +1,398 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import time
+
+import paddle
+import paddle.amp as amp
+import paddle.distributed as dist
+import paddle.distributed.fleet as fleet
+from paddlevideo.utils import (add_profiler_step, build_record, get_logger,
+ load, log_batch, log_epoch, mkdir, save)
+
+from ..loader.builder import build_dataloader, build_dataset
+from ..metrics.ava_utils import collect_results_cpu
+from ..modeling.builder import build_model
+from ..solver import build_lr, build_optimizer
+from ..utils import do_preciseBN
+
+
+def train_model(cfg,
+ weights=None,
+ parallel=True,
+ validate=True,
+ use_amp=False,
+ amp_level=None,
+ max_iters=None,
+ use_fleet=False,
+ profiler_options=None):
+ """Train model entry
+
+ Args:
+ cfg (dict): configuration.
+ weights (str, optional): weights path for finetuning. Defaults to None.
+ parallel (bool, optional): whether multi-cards training. Defaults to True.
+ validate (bool, optional): whether to do evaluation. Defaults to True.
+ use_amp (bool, optional): whether to use automatic mixed precision during training. Defaults to False.
+        amp_level (str, optional): amp optimization level, must be 'O1' or 'O2' when use_amp is True. Defaults to None.
+ max_iters (int, optional): max running iters in an epoch. Defaults to None.
+ use_fleet (bool, optional): whether to use fleet. Defaults to False.
+ profiler_options (str, optional): configuration for the profiler function. Defaults to None.
+
+ """
+ if use_fleet:
+ fleet.init(is_collective=True)
+
+ logger = get_logger("paddlevideo")
+ batch_size = cfg.DATASET.get('batch_size', 8)
+ valid_batch_size = cfg.DATASET.get('valid_batch_size', batch_size)
+
+ # gradient accumulation settings
+ use_gradient_accumulation = cfg.get('GRADIENT_ACCUMULATION', None)
+ if use_gradient_accumulation and dist.get_world_size() >= 1:
+ global_batch_size = cfg.GRADIENT_ACCUMULATION.get(
+ 'global_batch_size', None)
+ num_gpus = dist.get_world_size()
+
+ assert isinstance(
+ global_batch_size, int
+ ), f"global_batch_size must be int, but got {type(global_batch_size)}"
+ assert batch_size <= global_batch_size, \
+ f"global_batch_size({global_batch_size}) must not be less than batch_size({batch_size})"
+
+        cur_global_batch_size = batch_size * num_gpus  # number of samples processed across all GPUs in one step
+ assert global_batch_size % cur_global_batch_size == 0, \
+ f"The global batchsize({global_batch_size}) must be divisible by cur_global_batch_size({cur_global_batch_size})"
+ cfg.GRADIENT_ACCUMULATION[
+ "num_iters"] = global_batch_size // cur_global_batch_size
+ # The number of iterations required to reach the global batchsize
+ logger.info(
+ f"Using gradient accumulation training strategy, "
+ f"global_batch_size={global_batch_size}, "
+ f"num_gpus={num_gpus}, "
+ f"num_accumulative_iters={cfg.GRADIENT_ACCUMULATION.num_iters}")
+
+ if cfg.get('use_npu'):
+ places = paddle.set_device('npu')
+ else:
+ places = paddle.set_device('gpu')
+
+ # default num worker: 0, which means no subprocess will be created
+ num_workers = cfg.DATASET.get('num_workers', 0)
+ valid_num_workers = cfg.DATASET.get('valid_num_workers', num_workers)
+ model_name = cfg.model_name
+ output_dir = cfg.get("output_dir", f"./output/{model_name}")
+ mkdir(output_dir)
+
+ # 1. Construct model
+ model = build_model(cfg.MODEL)
+
+ # 2. Construct dataset and dataloader for training and evaluation
+ train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train))
+ train_dataloader_setting = dict(batch_size=batch_size,
+ num_workers=num_workers,
+ collate_fn_cfg=cfg.get('MIX', None),
+ places=places)
+ train_loader = build_dataloader(train_dataset, **train_dataloader_setting)
+
+ if validate:
+ valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid))
+ validate_dataloader_setting = dict(
+ batch_size=valid_batch_size,
+ num_workers=valid_num_workers,
+ places=places,
+ drop_last=False,
+ shuffle=cfg.DATASET.get(
+ 'shuffle_valid',
+ False) # NOTE: attention_LSTM needs to shuffle valid data.
+ )
+ valid_loader = build_dataloader(valid_dataset,
+ **validate_dataloader_setting)
+
+ # 3. Construct learning rate scheduler(lr) and optimizer
+ lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))
+ optimizer = build_optimizer(cfg.OPTIMIZER,
+ lr,
+ model=model,
+ use_amp=use_amp,
+ amp_level=amp_level)
+
+ # 4. Construct scalar and convert parameters for amp(optional)
+ if use_amp:
+ scaler = amp.GradScaler(init_loss_scaling=2.0**16,
+ incr_every_n_steps=2000,
+ decr_every_n_nan_or_inf=1)
+ # convert model parameters to fp16 when amp_level is O2(pure fp16)
+ model, optimizer = amp.decorate(models=model,
+ optimizers=optimizer,
+ level=amp_level,
+ save_dtype='float32')
+ # NOTE: save_dtype is set to float32 now.
+ logger.info(f"Training in amp mode, amp_level={amp_level}.")
+ else:
+ assert amp_level is None, f"amp_level must be None when training in fp32 mode, but got {amp_level}."
+ logger.info("Training in fp32 mode.")
+
+ # 5. Resume(optional)
+ resume_epoch = cfg.get("resume_epoch", 0)
+ if resume_epoch:
+ filename = osp.join(output_dir,
+ model_name + f"_epoch_{resume_epoch:05d}")
+ resume_model_dict = load(filename + '.pdparams')
+ resume_opt_dict = load(filename + '.pdopt')
+ model.set_state_dict(resume_model_dict)
+ optimizer.set_state_dict(resume_opt_dict)
+ logger.info("Resume from checkpoint: {}".format(filename))
+
+ # 6. Finetune(optional)
+ if weights:
+ assert resume_epoch == 0, f"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it."
+ model_dict = load(weights)
+ model.set_state_dict(model_dict)
+ logger.info("Finetune from checkpoint: {}".format(weights))
+
+ # 7. Parallelize(optional)
+ if parallel:
+ model = paddle.DataParallel(model)
+
+ if use_fleet:
+ model = fleet.distributed_model(model)
+ optimizer = fleet.distributed_optimizer(optimizer)
+
+ # 8. Train Model
+ best = 0.0
+ for epoch in range(0, cfg.epochs):
+ if epoch < resume_epoch:
+ logger.info(
+ f"| epoch: [{epoch + 1}] <= resume_epoch: [{resume_epoch}], continue..."
+ )
+ continue
+ model.train()
+
+ record_list = build_record(cfg.MODEL)
+ tic = time.time()
+ for i, data in enumerate(train_loader):
+ """Next two line of code only used in test_tipc,
+ ignore it most of the time"""
+ if max_iters is not None and i >= max_iters:
+ break
+
+ record_list['reader_time'].update(time.time() - tic)
+
+            # Collect performance information when profiler_options is activated
+ add_profiler_step(profiler_options)
+
+ # 8.1 forward
+ # AMP #
+ if use_amp:
+ with amp.auto_cast(custom_black_list={"reduce_mean"},
+ level=amp_level):
+ outputs = model(data, mode='train')
+ avg_loss = outputs['loss']
+ if use_gradient_accumulation:
+                    # clear gradients when the epoch begins
+ if i == 0:
+ optimizer.clear_grad()
+ # Loss normalization
+ avg_loss /= cfg.GRADIENT_ACCUMULATION.num_iters
+ # Loss scaling
+ scaled = scaler.scale(avg_loss)
+ # 8.2 backward
+ scaled.backward()
+ # 8.3 minimize
+ if (i + 1) % cfg.GRADIENT_ACCUMULATION.num_iters == 0:
+ scaler.minimize(optimizer, scaled)
+ optimizer.clear_grad()
+ else: # general case
+ # Loss scaling
+ scaled = scaler.scale(avg_loss)
+ # 8.2 backward
+ scaled.backward()
+ # 8.3 minimize
+ scaler.minimize(optimizer, scaled)
+ optimizer.clear_grad()
+ else:
+ outputs = model(data, mode='train')
+ avg_loss = outputs['loss']
+ if use_gradient_accumulation:
+                    # clear gradients when the epoch begins
+ if i == 0:
+ optimizer.clear_grad()
+ # Loss normalization
+ avg_loss /= cfg.GRADIENT_ACCUMULATION.num_iters
+ # 8.2 backward
+ avg_loss.backward()
+ # 8.3 minimize
+ if (i + 1) % cfg.GRADIENT_ACCUMULATION.num_iters == 0:
+ optimizer.step()
+ optimizer.clear_grad()
+ else: # general case
+ # 8.2 backward
+ avg_loss.backward()
+ # 8.3 minimize
+ optimizer.step()
+ optimizer.clear_grad()
+
+ # log record
+ record_list['lr'].update(optimizer.get_lr(), batch_size)
+ for name, value in outputs.items():
+ if name in record_list:
+ record_list[name].update(value, batch_size)
+
+ record_list['batch_time'].update(time.time() - tic)
+ tic = time.time()
+
+ if i % cfg.get("log_interval", 10) == 0:
+ ips = "ips: {:.5f} instance/sec.".format(
+ batch_size / record_list["batch_time"].val)
+ log_batch(record_list, i, epoch + 1, cfg.epochs, "train", ips)
+
+ # learning rate iter step
+ if cfg.OPTIMIZER.learning_rate.get("iter_step"):
+ lr.step()
+
+ # learning rate epoch step
+ if not cfg.OPTIMIZER.learning_rate.get("iter_step"):
+ lr.step()
+
+ ips = "avg_ips: {:.5f} instance/sec.".format(
+ batch_size * record_list["batch_time"].count /
+ record_list["batch_time"].sum)
+ log_epoch(record_list, epoch + 1, "train", ips)
+
+ def evaluate(best):
+ model.eval()
+ results = []
+ record_list = build_record(cfg.MODEL)
+ record_list.pop('lr')
+ tic = time.time()
+ if parallel:
+ rank = dist.get_rank()
+ # single_gpu_test and multi_gpu_test
+ for i, data in enumerate(valid_loader):
+ """Next two line of code only used in test_tipc,
+ ignore it most of the time"""
+ if max_iters is not None and i >= max_iters:
+ break
+
+ if use_amp:
+ with amp.auto_cast(custom_black_list={"reduce_mean"},
+ level=amp_level):
+ outputs = model(data, mode='valid')
+ else:
+ outputs = model(data, mode='valid')
+
+ if cfg.MODEL.framework == "FastRCNN":
+ results.extend(outputs)
+
+ # log_record
+ if cfg.MODEL.framework != "FastRCNN":
+ for name, value in outputs.items():
+ if name in record_list:
+ record_list[name].update(value, batch_size)
+
+ record_list['batch_time'].update(time.time() - tic)
+ tic = time.time()
+
+ if i % cfg.get("log_interval", 10) == 0:
+ ips = "ips: {:.5f} instance/sec.".format(
+ valid_batch_size / record_list["batch_time"].val)
+ log_batch(record_list, i, epoch + 1, cfg.epochs, "val", ips)
+
+ if cfg.MODEL.framework == "FastRCNN":
+ if parallel:
+ results = collect_results_cpu(results, len(valid_dataset))
+ if not parallel or (parallel and rank == 0):
+ eval_res = valid_dataset.evaluate(results)
+ for name, value in eval_res.items():
+ record_list[name].update(value, valid_batch_size)
+
+ ips = "avg_ips: {:.5f} instance/sec.".format(
+ valid_batch_size * record_list["batch_time"].count /
+ record_list["batch_time"].sum)
+ log_epoch(record_list, epoch + 1, "val", ips)
+
+ best_flag = False
+ if cfg.MODEL.framework == "FastRCNN" and (not parallel or
+ (parallel and rank == 0)):
+ if record_list["mAP@0.5IOU"].val > best:
+ best = record_list["mAP@0.5IOU"].val
+ best_flag = True
+ return best, best_flag
+
+ # forbest2, cfg.MODEL.framework != "FastRCNN":
+ for top_flag in ['hit_at_one', 'top1', 'rmse', "F1@0.50"]:
+ if record_list.get(top_flag):
+ if top_flag != 'rmse' and record_list[top_flag].avg > best:
+ best = record_list[top_flag].avg
+ best_flag = True
+ elif top_flag == 'rmse' and (
+ best == 0.0 or record_list[top_flag].avg < best):
+ best = record_list[top_flag].avg
+ best_flag = True
+
+ return best, best_flag
+
+ # use precise bn to improve acc
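+        # (Precise BN re-estimates BatchNorm running statistics over a number of training
+        # batches without updating the weights, which typically yields more accurate
+        # statistics for evaluation.)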
+ if cfg.get("PRECISEBN") and (epoch % cfg.PRECISEBN.preciseBN_interval
+ == 0 or epoch == cfg.epochs - 1):
+ do_preciseBN(
+ model, train_loader, parallel,
+ min(cfg.PRECISEBN.num_iters_preciseBN, len(train_loader)),
+ use_amp, amp_level)
+
+ # 9. Validation
+ if validate and (epoch % cfg.get("val_interval", 1) == 0
+ or epoch == cfg.epochs - 1):
+ with paddle.no_grad():
+ best, save_best_flag = evaluate(best)
+ # save best
+ if save_best_flag:
+ save(optimizer.state_dict(),
+ osp.join(output_dir, model_name + "_best.pdopt"))
+ save(model.state_dict(),
+ osp.join(output_dir, model_name + "_best.pdparams"))
+ if model_name == "AttentionLstm":
+ logger.info(
+ f"Already save the best model (hit_at_one){best}")
+ elif cfg.MODEL.framework == "FastRCNN":
+ logger.info(
+ f"Already save the best model (mAP@0.5IOU){int(best * 10000) / 10000}"
+ )
+ elif cfg.MODEL.framework == "DepthEstimator":
+ logger.info(
+ f"Already save the best model (rmse){int(best * 10000) / 10000}"
+ )
+ elif cfg.MODEL.framework in ['MSTCN', 'ASRF']:
+ logger.info(
+ f"Already save the best model (F1@0.50){int(best * 10000) / 10000}"
+ )
+ else:
+ logger.info(
+ f"Already save the best model (top1 acc){int(best * 10000) / 10000}"
+ )
+
+ # 10. Save model and optimizer
+ if epoch % cfg.get("save_interval", 1) == 0 or epoch == cfg.epochs - 1:
+ save(
+ optimizer.state_dict(),
+ osp.join(output_dir,
+ model_name + f"_epoch_{epoch + 1:05d}.pdopt"))
+ save(
+ model.state_dict(),
+ osp.join(output_dir,
+ model_name + f"_epoch_{epoch + 1:05d}.pdparams"))
+
+ logger.info(f'training {model_name} finished')
diff --git a/paddlevideo/tasks/train_dali.py b/paddlevideo/tasks/train_dali.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dd0a20f53fdb8035aad302461a03c3e5d34c01b
--- /dev/null
+++ b/paddlevideo/tasks/train_dali.py
@@ -0,0 +1,143 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+import os.path as osp
+
+import paddle
+from ..modeling.builder import build_model
+from ..solver import build_lr, build_optimizer
+from ..utils import do_preciseBN
+from paddlevideo.utils import get_logger, coloring
+from paddlevideo.utils import (AverageMeter, build_record, log_batch, log_epoch,
+ save, load, mkdir)
+from paddlevideo.loader import TSN_Dali_loader, get_input_data
+"""
+DALI-based training is currently only supported for the TSN model.
+"""
+
+
+def train_dali(cfg, weights=None, parallel=True):
+ """Train model entry
+
+ Args:
+ cfg (dict): configuration.
+ weights (str): weights path for finetuning.
+ parallel (bool): Whether multi-cards training. Default: True.
+
+ """
+
+ logger = get_logger("paddlevideo")
+ batch_size = cfg.DALI_LOADER.get('batch_size', 8)
+ places = paddle.set_device('gpu')
+ model_name = cfg.model_name
+ output_dir = cfg.get("output_dir", f"./output/{model_name}")
+ mkdir(output_dir)
+
+ # 1. Construct model
+ model = build_model(cfg.MODEL)
+ if parallel:
+ model = paddle.DataParallel(model)
+
+ # 2. Construct dali dataloader
+ train_loader = TSN_Dali_loader(cfg.DALI_LOADER).build_dali_reader()
+
+ # 3. Construct solver.
+ lr = build_lr(cfg.OPTIMIZER.learning_rate, None)
+ optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model)
+
+ # Resume
+ resume_epoch = cfg.get("resume_epoch", 0)
+ if resume_epoch:
+ filename = osp.join(output_dir,
+ model_name + f"_epoch_{resume_epoch:05d}")
+ resume_model_dict = load(filename + '.pdparams')
+ resume_opt_dict = load(filename + '.pdopt')
+ model.set_state_dict(resume_model_dict)
+ optimizer.set_state_dict(resume_opt_dict)
+
+ # Finetune:
+ if weights:
+ assert resume_epoch == 0, f"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it."
+ model_dict = load(weights)
+ model.set_state_dict(model_dict)
+
+ # 4. Train Model
+ for epoch in range(0, cfg.epochs):
+ if epoch < resume_epoch:
+ logger.info(
+ f"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... "
+ )
+ continue
+ model.train()
+ record_list = build_record(cfg.MODEL)
+ tic = time.time()
+ for i, data in enumerate(train_loader):
+ data = get_input_data(data)
+ record_list['reader_time'].update(time.time() - tic)
+ # 4.1 forward
+ outputs = model(data, mode='train')
+ # 4.2 backward
+ avg_loss = outputs['loss']
+ avg_loss.backward()
+ # 4.3 minimize
+ optimizer.step()
+ optimizer.clear_grad()
+
+ # log record
+ record_list['lr'].update(optimizer._global_learning_rate(),
+ batch_size)
+ for name, value in outputs.items():
+ record_list[name].update(value, batch_size)
+
+ record_list['batch_time'].update(time.time() - tic)
+ tic = time.time()
+
+ if i % cfg.get("log_interval", 10) == 0:
+ ips = "ips: {:.5f} instance/sec.".format(
+ batch_size / record_list["batch_time"].val)
+ log_batch(record_list, i, epoch + 1, cfg.epochs, "train", ips)
+
+ # learning rate iter step
+ if cfg.OPTIMIZER.learning_rate.get("iter_step"):
+ lr.step()
+
+ # learning rate epoch step
+ if not cfg.OPTIMIZER.learning_rate.get("iter_step"):
+ lr.step()
+
+ ips = "ips: {:.5f} instance/sec.".format(
+ batch_size * record_list["batch_time"].count /
+ record_list["batch_time"].sum)
+ log_epoch(record_list, epoch + 1, "train", ips)
+
+ # use precise bn to improve acc
+ if cfg.get("PRECISEBN") and (epoch % cfg.PRECISEBN.preciseBN_interval
+ == 0 or epoch == cfg.epochs - 1):
+ do_preciseBN(
+ model, train_loader, parallel,
+ min(cfg.PRECISEBN.num_iters_preciseBN, len(train_loader)))
+
+ # 5. Save model and optimizer
+ if epoch % cfg.get("save_interval", 1) == 0 or epoch == cfg.epochs - 1:
+ save(
+ optimizer.state_dict(),
+ osp.join(output_dir,
+ model_name + f"_epoch_{epoch+1:05d}.pdopt"))
+ save(
+ model.state_dict(),
+ osp.join(output_dir,
+ model_name + f"_epoch_{epoch+1:05d}.pdparams"))
+
+ logger.info(f'training {model_name} finished')
diff --git a/paddlevideo/tasks/train_multigrid.py b/paddlevideo/tasks/train_multigrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..66e0065a71738e4aaf0c240aa75207077a882028
--- /dev/null
+++ b/paddlevideo/tasks/train_multigrid.py
@@ -0,0 +1,335 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+import os.path as osp
+
+import paddle
+import paddle.distributed as dist
+
+from ..loader.builder import build_dataloader, build_dataset
+from ..modeling.builder import build_model
+from ..solver import build_lr, build_optimizer
+from ..utils import do_preciseBN
+from paddlevideo.utils import get_logger, coloring
+from paddlevideo.utils import (AverageMeter, build_record, log_batch, log_epoch,
+ save, load, mkdir)
+from paddlevideo.utils.multigrid import MultigridSchedule, aggregate_sub_bn_stats, subn_load, subn_save, is_eval_epoch
+
+
+def construct_loader(cfg, places, validate, precise_bn, num_iters_precise_bn,
+ world_size):
+ batch_size = cfg.DATASET.get('batch_size', 2)
+ train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train))
+ precise_bn_dataloader_setting = dict(
+ batch_size=batch_size,
+ num_workers=cfg.DATASET.get('num_workers', 0),
+ places=places,
+ )
+ if precise_bn:
+ cfg.DATASET.train.num_samples_precise_bn = num_iters_precise_bn * batch_size * world_size
+ precise_bn_dataset = build_dataset(
+ (cfg.DATASET.train, cfg.PIPELINE.train))
+ precise_bn_loader = build_dataloader(precise_bn_dataset,
+ **precise_bn_dataloader_setting)
+ cfg.DATASET.train.num_samples_precise_bn = None
+ else:
+ precise_bn_loader = None
+
+ if cfg.MULTIGRID.SHORT_CYCLE:
+ # get batch size list in short cycle schedule
+ bs_factor = [
+ int(
+ round((float(
+ cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'])
+ / (s * cfg.MULTIGRID.default_crop_size))**2))
+ for s in cfg.MULTIGRID.short_cycle_factors
+ ]
+ batch_sizes = [
+ batch_size * bs_factor[0],
+ batch_size * bs_factor[1],
+ batch_size,
+ ]
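+        # Illustrative example: with short_cycle_factors=[0.5, 0.707] and a target_size equal
+        # to the default crop size, bs_factor is roughly [4, 2], so the short cycle alternates
+        # batch sizes [4*batch_size, 2*batch_size, batch_size] with correspondingly smaller inputs.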
+ train_dataloader_setting = dict(
+ batch_size=batch_sizes,
+ multigrid=True,
+ num_workers=cfg.DATASET.get('num_workers', 0),
+ places=places,
+ )
+ else:
+ train_dataloader_setting = precise_bn_dataloader_setting
+
+ train_loader = build_dataloader(train_dataset, **train_dataloader_setting)
+ if validate:
+ valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid))
+ validate_dataloader_setting = dict(batch_size=batch_size,
+ num_workers=cfg.DATASET.get(
+ 'num_workers', 0),
+ places=places,
+ drop_last=False,
+ shuffle=False)
+ valid_loader = build_dataloader(valid_dataset,
+ **validate_dataloader_setting)
+ else:
+ valid_loader = None
+
+ return train_loader, valid_loader, precise_bn_loader
+
+
+def build_trainer(cfg, places, parallel, validate, precise_bn,
+ num_iters_precise_bn, world_size):
+ """
+ Build training model and its associated tools, including optimizer,
+ dataloaders and meters.
+ Args:
+ cfg (CfgNode): configs.
+ Returns:
+ model: training model.
+ optimizer: optimizer.
+ train_loader: training data loader.
+        val_loader: validation data loader.
+ precise_bn_loader: training data loader for computing
+ precise BN.
+ """
+ model = build_model(cfg.MODEL)
+ if parallel:
+ model = paddle.DataParallel(model)
+
+ train_loader, valid_loader, precise_bn_loader = \
+ construct_loader(cfg,
+ places,
+ validate,
+ precise_bn,
+ num_iters_precise_bn,
+ world_size,
+ )
+
+ lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))
+ optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model)
+
+ return (
+ model,
+ lr,
+ optimizer,
+ train_loader,
+ valid_loader,
+ precise_bn_loader,
+ )
+
+
+def train_model_multigrid(cfg, world_size=1, validate=True):
+ """Train model entry
+
+ Args:
+ cfg (dict): configuration.
+        world_size (int): number of training processes/cards; parallel training is used when world_size != 1. Default: 1.
+        validate (bool): Whether to do evaluation. Default: True.
+
+ """
+ # Init multigrid.
+ multigrid = None
+ if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:
+ multigrid = MultigridSchedule()
+ cfg = multigrid.init_multigrid(cfg)
+ if cfg.MULTIGRID.LONG_CYCLE:
+ cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)
+ multi_save_epoch = [i[-1] - 1 for i in multigrid.schedule]
+
+ parallel = world_size != 1
+ logger = get_logger("paddlevideo")
+ batch_size = cfg.DATASET.get('batch_size', 2)
+
+ if cfg.get('use_npu'):
+ places = paddle.set_device('npu')
+ else:
+ places = paddle.set_device('gpu')
+
+ model_name = cfg.model_name
+ output_dir = cfg.get("output_dir", f"./output/{model_name}")
+ mkdir(output_dir)
+ local_rank = dist.ParallelEnv().local_rank
+ precise_bn = cfg.get("PRECISEBN")
+ num_iters_precise_bn = cfg.PRECISEBN.num_iters_preciseBN
+
+ # 1. Construct model
+ model = build_model(cfg.MODEL)
+ if parallel:
+ model = paddle.DataParallel(model)
+
+ # 2. Construct dataloader
+ train_loader, valid_loader, precise_bn_loader = \
+ construct_loader(cfg,
+ places,
+ validate,
+ precise_bn,
+ num_iters_precise_bn,
+ world_size,
+ )
+
+ # 3. Construct optimizer
+ lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))
+    optimizer = build_optimizer(cfg.OPTIMIZER,
+                                lr,
+                                model=model)
+
+ # Resume
+ resume_epoch = cfg.get("resume_epoch", 0)
+ if resume_epoch:
+ filename = osp.join(
+ output_dir,
+ model_name + str(local_rank) + '_' + f"{resume_epoch:05d}")
+ subn_load(model, filename, optimizer)
+
+ # 4. Train Model
+ best = 0.
+ total_epochs = int(cfg.epochs * cfg.MULTIGRID.epoch_factor)
+ for epoch in range(total_epochs):
+ if epoch < resume_epoch:
+ logger.info(
+ f"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... "
+ )
+ continue
+
+ if cfg.MULTIGRID.LONG_CYCLE:
+ cfg, changed = multigrid.update_long_cycle(cfg, epoch)
+ if changed:
+ logger.info("====== Rebuild model/optimizer/loader =====")
+ (
+ model,
+ lr,
+ optimizer,
+ train_loader,
+ valid_loader,
+ precise_bn_loader,
+ ) = build_trainer(cfg, places, parallel, validate, precise_bn,
+ num_iters_precise_bn, world_size)
+
+                # load checkpoint after re-building the model
+                if epoch != 0:
+                    # no need to subtract 1 from epoch here; 1 was added when saving
+ filename = osp.join(
+ output_dir,
+ model_name + str(local_rank) + '_' + f"{(epoch):05d}")
+ subn_load(model, filename, optimizer)
+                    # reset lr.last_epoch to the current epoch instead of the value restored from the checkpoint
+ lr.last_epoch = epoch
+ lr.step(rebuild=True)
+
+ model.train()
+ record_list = build_record(cfg.MODEL)
+ tic = time.time()
+ for i, data in enumerate(train_loader):
+ record_list['reader_time'].update(time.time() - tic)
+ # 4.1 forward
+ outputs = model(data, mode='train')
+ # 4.2 backward
+ avg_loss = outputs['loss']
+ avg_loss.backward()
+ # 4.3 minimize
+ optimizer.step()
+ optimizer.clear_grad()
+
+ # log record
+ record_list['lr'].update(
+ optimizer._global_learning_rate().numpy()[0], batch_size)
+ for name, value in outputs.items():
+ record_list[name].update(value.numpy()[0], batch_size)
+ record_list['batch_time'].update(time.time() - tic)
+ tic = time.time()
+
+ if i % cfg.get("log_interval", 10) == 0:
+ ips = "ips: {:.5f} instance/sec.".format(
+ batch_size / record_list["batch_time"].val)
+ log_batch(record_list, i, epoch + 1, total_epochs, "train", ips)
+
+ # learning rate iter step
+ if cfg.OPTIMIZER.learning_rate.get("iter_step"):
+ lr.step()
+
+ # learning rate epoch step
+ if not cfg.OPTIMIZER.learning_rate.get("iter_step"):
+ lr.step()
+
+ ips = "ips: {:.5f} instance/sec.".format(
+ batch_size * record_list["batch_time"].count /
+ record_list["batch_time"].sum)
+ log_epoch(record_list, epoch + 1, "train", ips)
+
+ def evaluate(best):
+ model.eval()
+ record_list = build_record(cfg.MODEL)
+ record_list.pop('lr')
+ tic = time.time()
+ for i, data in enumerate(valid_loader):
+ outputs = model(data, mode='valid')
+
+ # log_record
+ for name, value in outputs.items():
+ record_list[name].update(value.numpy()[0], batch_size)
+
+ record_list['batch_time'].update(time.time() - tic)
+ tic = time.time()
+
+ if i % cfg.get("log_interval", 10) == 0:
+ ips = "ips: {:.5f} instance/sec.".format(
+ batch_size / record_list["batch_time"].val)
+ log_batch(record_list, i, epoch + 1, total_epochs, "val",
+ ips)
+
+ ips = "ips: {:.5f} instance/sec.".format(
+ batch_size * record_list["batch_time"].count /
+ record_list["batch_time"].sum)
+ log_epoch(record_list, epoch + 1, "val", ips)
+
+ best_flag = False
+ if record_list.get('top1') and record_list['top1'].avg > best:
+ best = record_list['top1'].avg
+ best_flag = True
+ return best, best_flag
+
+ # use precise bn to improve acc
+ if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule):
+ logger.info(f"do precise BN in {epoch+1} ...")
+ do_preciseBN(model, precise_bn_loader, parallel,
+ min(num_iters_precise_bn, len(precise_bn_loader)))
+
+ # aggregate sub_BN stats
+ logger.info("Aggregate sub_BatchNorm stats...")
+ aggregate_sub_bn_stats(model)
+
+ # 5. Validation
+ if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule):
+ logger.info(f"eval in {epoch+1} ...")
+ with paddle.no_grad():
+ best, save_best_flag = evaluate(best)
+ # save best
+ if save_best_flag:
+ save(optimizer.state_dict(),
+ osp.join(output_dir, model_name + "_best.pdopt"))
+ save(model.state_dict(),
+ osp.join(output_dir, model_name + "_best.pdparams"))
+ logger.info(
+ f"Already save the best model (top1 acc){int(best * 10000) / 10000}"
+ )
+
+ # 6. Save model and optimizer
+ if is_eval_epoch(
+ cfg, epoch,
+ total_epochs, multigrid.schedule) or epoch % cfg.get(
+ "save_interval", 10) == 0 or epoch in multi_save_epoch:
+ logger.info("[Save parameters] ======")
+ subn_save(output_dir, model_name + str(local_rank) + '_', epoch + 1,
+ model, optimizer)
+
+ logger.info(f'training {model_name} finished')
diff --git a/paddlevideo/utils/__init__.py b/paddlevideo/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d18561d76e9424e06f596a2baa92ca2b7fe430cc
--- /dev/null
+++ b/paddlevideo/utils/__init__.py
@@ -0,0 +1,24 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .registry import Registry
+from .build_utils import build
+from .config import *
+from .logger import setup_logger, coloring, get_logger
+from .record import AverageMeter, build_record, log_batch, log_epoch
+from .dist_utils import get_dist_info, main_only
+from .save_load import save, load, load_ckpt, mkdir
+from .precise_bn import do_preciseBN
+from .profiler import add_profiler_step
+__all__ = ['Registry', 'build']
diff --git a/paddlevideo/utils/build_utils.py b/paddlevideo/utils/build_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c0ca46bae477837411a010459a4ed08549d2ee
--- /dev/null
+++ b/paddlevideo/utils/build_utils.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def build(cfg, registry, key='name'):
+ """Build a module from config dict.
+ Args:
+ cfg (dict): Config dict. It should at least contain the key.
+        registry (Registry): The registry to search the type in.
+        key (str): the config key that names the type. Default: 'name'.
+ Returns:
+ obj: The constructed object.
+ """
+
+ assert isinstance(cfg, dict) and key in cfg
+
+ cfg_copy = cfg.copy()
+ obj_type = cfg_copy.pop(key)
+
+ obj_cls = registry.get(obj_type)
+ if obj_cls is None:
+ raise KeyError('{} is not in the {} registry'.format(
+ obj_type, registry.name))
+ return obj_cls(**cfg_copy)
diff --git a/paddlevideo/utils/config.py b/paddlevideo/utils/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4d794116f5011486f5c0fc9276681d36fdcf531
--- /dev/null
+++ b/paddlevideo/utils/config.py
@@ -0,0 +1,174 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import yaml
+from paddlevideo.utils.logger import coloring, get_logger, setup_logger
+
+__all__ = ['get_config']
+
+logger = setup_logger("./", name="paddlevideo", level="INFO")
+
+
+class AttrDict(dict):
+ def __getattr__(self, key):
+ return self[key]
+
+ def __setattr__(self, key, value):
+ if key in self.__dict__:
+ self.__dict__[key] = value
+ else:
+ self[key] = value
+
+
+def create_attr_dict(yaml_config):
+ from ast import literal_eval
+ for key, value in yaml_config.items():
+ if type(value) is dict:
+ yaml_config[key] = value = AttrDict(value)
+ if isinstance(value, str):
+ try:
+ value = literal_eval(value)
+ except BaseException:
+ pass
+ if isinstance(value, AttrDict):
+ create_attr_dict(yaml_config[key])
+ else:
+ yaml_config[key] = value
+
+
+def parse_config(cfg_file):
+ """Load a config file into AttrDict"""
+ with open(cfg_file, 'r') as fopen:
+ yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader))
+ create_attr_dict(yaml_config)
+ return yaml_config
+
+
+def print_dict(d, delimiter=0):
+ """
+    Recursively visualize a dict, indenting
+    according to the nesting of its keys.
+ """
+ placeholder = "-" * 60
+ for k, v in sorted(d.items()):
+ if isinstance(v, dict):
+ logger.info("{}{} : ".format(delimiter * " ", coloring(k,
+ "HEADER")))
+ print_dict(v, delimiter + 4)
+ elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):
+ logger.info("{}{} : ".format(delimiter * " ",
+ coloring(str(k), "HEADER")))
+ for value in v:
+ print_dict(value, delimiter + 4)
+ else:
+ logger.info("{}{} : {}".format(delimiter * " ",
+ coloring(k, "HEADER"),
+ coloring(v, "OKGREEN")))
+
+ if k.isupper():
+ logger.info(placeholder)
+
+
+def print_config(config):
+ """
+ visualize configs
+ Arguments:
+ config: configs
+ """
+ print_dict(config)
+
+
+def check_config(config):
+ """
+ Check config
+ """
+ pass
+
+
+def override(dl, ks, v):
+ """
+ Recursively replace dict of list
+ Args:
+ dl(dict or list): dict or list to be replaced
+ ks(list): list of keys
+ v(str): value to be replaced
+ """
+ def str2num(v):
+ try:
+ return eval(v)
+ except Exception:
+ return v
+
+    assert isinstance(dl, (list, dict)), ("{} should be a list or a dict".format(dl))
+    assert len(ks) > 0, ('length of keys should be larger than 0')
+ if isinstance(dl, list):
+ k = str2num(ks[0])
+ if len(ks) == 1:
+ assert k < len(dl), ('index({}) out of range({})'.format(k, dl))
+ dl[k] = str2num(v)
+ else:
+ override(dl[k], ks[1:], v)
+ else:
+ if len(ks) == 1:
+ #assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl))
+ if not ks[0] in dl:
+                logger.warning('A new field ({}) detected!'.format(ks[0]))
+ dl[ks[0]] = str2num(v)
+ else:
+ assert ks[0] in dl, (
+ '({}) doesn\'t exist in {}, a new dict field is invalid'.format(
+ ks[0], dl))
+ override(dl[ks[0]], ks[1:], v)
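+
+
+# Illustrative example (hypothetical keys):
+#   override(config, ['OPTIMIZER', 'learning_rate', 'iter_step'], 'True')
+# walks the nested dicts/lists along the keys and sets the leaf value,
+# converting numeric/boolean-looking strings via str2num.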
+
+
+def override_config(config, options=None):
+ """
+ Recursively override the config
+ Args:
+ config(dict): dict to be replaced
+ options(list): list of pairs(key0.key1.idx.key2=value)
+ such as: [
+                    'epochs=20',
+ 'PIPELINE.train.transform.1.ResizeImage.resize_short=300'
+ ]
+ Returns:
+ config(dict): replaced config
+ """
+ if options is not None:
+ for opt in options:
+ assert isinstance(opt,
+ str), ("option({}) should be a str".format(opt))
+ assert "=" in opt, (
+ "option({}) should contain a ="
+ "to distinguish between key and value".format(opt))
+ pair = opt.split('=')
+            assert len(pair) == 2, ("there can only be one = in the option")
+ key, value = pair
+ keys = key.split('.')
+ override(config, keys, value)
+
+ return config
+
+
+def get_config(fname, overrides=None, show=True):
+ """
+ Read config from file
+ """
+    assert os.path.exists(fname), ('config file({}) does not exist'.format(fname))
+ config = parse_config(fname)
+ override_config(config, overrides)
+ if show:
+ print_config(config)
+ check_config(config)
+ return config
diff --git a/paddlevideo/utils/dist_utils.py b/paddlevideo/utils/dist_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7659e88c127aa2312777c5cdd3d0afbdcdf07e6c
--- /dev/null
+++ b/paddlevideo/utils/dist_utils.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+
+import paddle
+import paddle.distributed as dist
+
+def get_dist_info():
+ world_size = dist.get_world_size()
+ rank = dist.get_rank()
+ return rank, world_size
+
+def main_only(func):
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ rank, _ = get_dist_info()
+ if rank == 0:
+ return func(*args, **kwargs)
+ return wrapper
diff --git a/paddlevideo/utils/logger.py b/paddlevideo/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9791b89b68ddd302e32f350ab7b96eda9f4ce36
--- /dev/null
+++ b/paddlevideo/utils/logger.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import datetime
+
+from paddle.distributed import ParallelEnv
+
+
+
+Color = {
+ 'RED': '\033[31m',
+ 'HEADER': '\033[35m', # deep purple
+ 'PURPLE': '\033[95m', # purple
+ 'OKBLUE': '\033[94m',
+ 'OKGREEN': '\033[92m',
+ 'WARNING': '\033[93m',
+ 'FAIL': '\033[91m',
+ 'ENDC': '\033[0m'
+}
+
+
+def coloring(message, color="OKGREEN"):
+ assert color in Color.keys()
+    # note: the env var is a string, so explicitly treat "False"/"0" as disabled
+    if str(os.environ.get('COLORING', True)).lower() not in ('false', '0'):
+        return Color[color] + str(message) + Color["ENDC"]
+    else:
+        return message
+
+
+logger_initialized = []
+
+
+def setup_logger(output=None, name="paddlevideo", level="INFO"):
+ """
+ Initialize the paddlevideo logger and set its verbosity level to "INFO".
+ Args:
+ output (str): a file name or a directory to save log. If None, will not save log file.
+ If ends with ".txt" or ".log", assumed to be a file name.
+            Otherwise, logs will be saved to `output/.log.txt`.
+ name (str): the root module name of this logger
+ Returns:
+ logging.Logger: a logger
+ """
+ def time_zone(sec, fmt):
+ real_time = datetime.datetime.now()
+ return real_time.timetuple()
+ logging.Formatter.converter = time_zone
+
+ logger = logging.getLogger(name)
+ if level == "INFO":
+ logger.setLevel(logging.INFO)
+ elif level=="DEBUG":
+ logger.setLevel(logging.DEBUG)
+ logger.propagate = False
+
+ if level == "DEBUG":
+ plain_formatter = logging.Formatter(
+ "[%(asctime)s] %(name)s %(levelname)s: %(message)s",
+ datefmt="%m/%d %H:%M:%S")
+ else:
+ plain_formatter = logging.Formatter(
+ "[%(asctime)s] %(message)s",
+ datefmt="%m/%d %H:%M:%S")
+ # stdout logging: master only
+ local_rank = ParallelEnv().local_rank
+ if local_rank == 0:
+ ch = logging.StreamHandler(stream=sys.stdout)
+ ch.setLevel(logging.DEBUG)
+ formatter = plain_formatter
+ ch.setFormatter(formatter)
+ logger.addHandler(ch)
+
+ # file logging: all workers
+ if output is not None:
+ if output.endswith(".txt") or output.endswith(".log"):
+ filename = output
+ else:
+ filename = os.path.join(output, ".log.txt")
+ if local_rank > 0:
+ filename = filename + ".rank{}".format(local_rank)
+
+ # PathManager.mkdirs(os.path.dirname(filename))
+ os.makedirs(os.path.dirname(filename), exist_ok=True)
+
+ # fh = logging.StreamHandler(_cached_log_stream(filename)
+ fh = logging.FileHandler(filename, mode='a')
+ fh.setLevel(logging.DEBUG)
+ fh.setFormatter(plain_formatter)
+ logger.addHandler(fh)
+ logger_initialized.append(name)
+ return logger
+
+
+def get_logger(name, output=None):
+ logger = logging.getLogger(name)
+ if name in logger_initialized:
+ return logger
+
+    return setup_logger(name=name, output=output)
diff --git a/paddlevideo/utils/multigrid/__init__.py b/paddlevideo/utils/multigrid/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..10295b59b7ccce4504b73a9b315f836d6c07a777
--- /dev/null
+++ b/paddlevideo/utils/multigrid/__init__.py
@@ -0,0 +1,10 @@
+from .multigrid import MultigridSchedule
+from .batchnorm_helper import get_norm, aggregate_sub_bn_stats
+from .short_sampler import DistributedShortSampler
+from .save_load_helper import subn_save, subn_load
+from .interval_helper import is_eval_epoch
+
+__all__ = [
+ 'MultigridSchedule', 'get_norm', 'aggregate_sub_bn_stats',
+ 'DistributedShortSampler', 'subn_save', 'subn_load', 'is_eval_epoch'
+]
diff --git a/paddlevideo/utils/multigrid/batchnorm_helper.py b/paddlevideo/utils/multigrid/batchnorm_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..e39b067d863ec7fe49111fbd7cfcbe1f4a28e3b5
--- /dev/null
+++ b/paddlevideo/utils/multigrid/batchnorm_helper.py
@@ -0,0 +1,142 @@
+from functools import partial
+import paddle
+
+
+def get_norm(bn_norm_type, bn_num_splits):
+ """
+ Args:
+        bn_norm_type (str): type of normalization, "batchnorm" or "sub_batchnorm".
+        bn_num_splits (int): number of splits used by SubBatchNorm3D.
+    Returns:
+        nn.Layer: the normalization layer class (or a partial constructor for SubBatchNorm3D).
+ """
+ if bn_norm_type == "batchnorm":
+ return paddle.nn.BatchNorm3D
+ elif bn_norm_type == "sub_batchnorm":
+ return partial(SubBatchNorm3D, num_splits=bn_num_splits)
+ else:
+ raise NotImplementedError(
+ "Norm type {} is not supported".format(bn_norm_type))
+
+
+def aggregate_sub_bn_stats(model):
+ """
+ Recursively find all SubBN modules and aggregate sub-BN stats.
+ Args:
+ model (nn.Layer): model to be aggregate sub-BN stats
+ Returns:
+ count (int): number of SubBN module found.
+ """
+ count = 0
+ for child in model.children():
+ if isinstance(child, SubBatchNorm3D):
+ child.aggregate_stats()
+ count += 1
+ else:
+ count += aggregate_sub_bn_stats(child)
+ return count
+
+
+class SubBatchNorm3D(paddle.nn.Layer):
+ """
+    Implemented with Paddle 2.0.
+    The standard BN layer computes stats across all examples in a GPU. In some
+    cases it is desirable to compute stats across only a subset of examples.
+    SubBatchNorm3D splits the batch dimension into N splits and runs BN on
+    each of them separately, so that the stats are computed on each subset of
+    examples (1/N of the batch) independently. During evaluation, it aggregates
+    the stats from all splits into one BN.
+ """
+ def __init__(self, num_splits, **args):
+ """
+ Args:
+ num_splits (int): number of splits.
+            args (dict): remaining BatchNorm3D keyword arguments (num_features, weight_attr, bias_attr, ...).
+ """
+ super(SubBatchNorm3D, self).__init__()
+ self.num_splits = num_splits
+ self.num_features = args["num_features"]
+ self.weight_attr = args["weight_attr"]
+ self.bias_attr = args["bias_attr"]
+
+ # Keep only one set of weight and bias (outside).
+ if self.weight_attr == False:
+ self.weight = self.create_parameter(
+ attr=None,
+ shape=[self.num_features],
+ default_initializer=paddle.nn.initializer.Constant(1.0))
+ self.weight.stop_gradient = True
+ else:
+ self.weight = self.create_parameter(
+ attr=self.weight_attr,
+ shape=[self.num_features],
+ default_initializer=paddle.nn.initializer.Constant(1.0))
+ self.weight.stop_gradient = self.weight_attr is not None \
+ and self.weight_attr.learning_rate == 0.
+
+ if self.bias_attr == False:
+ self.bias = self.create_parameter(attr=None,
+ shape=[self.num_features],
+ is_bias=True)
+ self.bias.stop_gradient = True
+ else:
+ self.bias = self.create_parameter(attr=self.bias_attr,
+ shape=[self.num_features],
+ is_bias=True)
+ self.bias.stop_gradient = self.bias_attr is not None \
+ and self.bias_attr.learning_rate == 0.
+
+ # set weights and bias fixed (inner).
+ args["weight_attr"] = False
+ args["bias_attr"] = False
+ self.bn = paddle.nn.BatchNorm3D(**args)
+ # update number of features used in split_bn
+ args["num_features"] = self.num_features * self.num_splits
+ self.split_bn = paddle.nn.BatchNorm3D(**args)
+
+ def _get_aggregated_mean_std(self, means, stds, n):
+ """
+        Calculate the aggregated mean and variance.
+        Uses the law of total variance to merge the statistics of the splits:
+        mean = (1/n) * sum_i mean_i,
+        var  = (1/n) * sum_i var_i + (1/n) * sum_i (mean_i - mean)^2.
+        Args:
+            means (tensor): per-split running means.
+            stds (tensor): per-split running variances (despite the name).
+            n (int): number of splits.
+ """
+ mean = paddle.sum(paddle.reshape(means, (n, -1)), axis=0) / n
+ std = (paddle.sum(paddle.reshape(stds, (n, -1)), axis=0) / n +
+ paddle.sum(paddle.reshape(
+ paddle.pow((paddle.reshape(means, (n, -1)) - mean), 2),
+ (n, -1)),
+ axis=0) / n)
+ return mean, std
+
+ def aggregate_stats(self):
+ """
+ Synchronize running_mean, and running_var to self.bn.
+ Call this before eval, then call model.eval();
+ When eval, forward function will call self.bn instead of self.split_bn,
+ During this time the running_mean, and running_var of self.bn has been obtained from
+ self.split_bn.
+ """
+ if self.split_bn.training:
+ bn_mean_tensor, bn_variance_tensor = self._get_aggregated_mean_std(
+ self.split_bn._mean,
+ self.split_bn._variance,
+ self.num_splits,
+ )
+ self.bn._mean.set_value(bn_mean_tensor)
+ self.bn._variance.set_value(bn_variance_tensor)
+
+ def forward(self, x):
+ if self.training:
+ n, c, t, h, w = x.shape
+ x = paddle.reshape(
+ x, (n // self.num_splits, c * self.num_splits, t, h, w))
+ x = self.split_bn(x)
+ x = paddle.reshape(x, (n, c, t, h, w))
+ else:
+ x = self.bn(x)
+ x = paddle.multiply(x, paddle.reshape(self.weight, (-1, 1, 1, 1)))
+ x = paddle.add(x, paddle.reshape(self.bias, (-1, 1, 1, 1)))
+ return x
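+
+
+# Hedged usage sketch (shapes and num_splits are illustrative, not taken from a
+# real config): SubBatchNorm3D is trained with per-split stats that are merged
+# before evaluation. In the training pipeline, aggregate_sub_bn_stats(model) is
+# called on the whole model and invokes aggregate_stats() on every
+# SubBatchNorm3D it finds.
+#
+#     sub_bn = SubBatchNorm3D(num_splits=2, num_features=16,
+#                             weight_attr=None, bias_attr=None)
+#     x = paddle.randn([8, 16, 4, 56, 56])   # batch of 8 -> 2 splits of 4
+#     sub_bn.train()
+#     y = sub_bn(x)                # stats computed per split via split_bn
+#     sub_bn.aggregate_stats()     # merge split stats into the inner bn
+#     sub_bn.eval()
+#     y_eval = sub_bn(x)           # eval path uses the aggregated bn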
diff --git a/paddlevideo/utils/multigrid/interval_helper.py b/paddlevideo/utils/multigrid/interval_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..2df4bc702df4f8f22f12223ced5de5d7ff2babf0
--- /dev/null
+++ b/paddlevideo/utils/multigrid/interval_helper.py
@@ -0,0 +1,19 @@
+def is_eval_epoch(cfg, cur_epoch, total_epochs, multigrid_schedule):
+ """
+ Determine if the model should be evaluated at the current epoch.
+ Args:
+        cfg (CfgNode): config object; the evaluation period is taken from
+            cfg.MULTIGRID.EVAL_FREQ.
+        cur_epoch (int): current epoch index.
+        total_epochs (int): total number of training epochs.
+        multigrid_schedule (List): schedule for multigrid training.
+ """
+ if cur_epoch + 1 == total_epochs:
+ return True
+ if multigrid_schedule is not None:
+ prev_epoch = 0
+ for s in multigrid_schedule:
+ if cur_epoch < s[-1]:
+ period = max(
+ (s[-1] - prev_epoch) // cfg.MULTIGRID.EVAL_FREQ + 1, 1)
+ return (s[-1] - 1 - cur_epoch) % period == 0
+            prev_epoch = s[-1]
+    return False
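+
+
+# Illustrative sketch (schedule entries and EVAL_FREQ are hypothetical): with a
+# long-cycle schedule whose stages end at epochs 60, 120 and 180, evaluation is
+# triggered every `period` epochs counted back from each stage boundary, and
+# always at the final epoch.
+#
+#     schedule = [(0, [4, 8, 158], 60), (1, [2, 16, 224], 120), (2, [1, 32, 224], 180)]
+#     for ep in range(180):
+#         if is_eval_epoch(cfg, ep, total_epochs=180, multigrid_schedule=schedule):
+#             pass  # run validation at this epoch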
diff --git a/paddlevideo/utils/multigrid/multigrid.py b/paddlevideo/utils/multigrid/multigrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..a296a0608d682e56bd503953284faa403c5f97c8
--- /dev/null
+++ b/paddlevideo/utils/multigrid/multigrid.py
@@ -0,0 +1,233 @@
+"""Functions for multigrid training."""
+
+import numpy as np
+
+
+class MultigridSchedule(object):
+ """
+ This class defines multigrid training schedule and update cfg accordingly.
+ """
+ def init_multigrid(self, cfg):
+ """
+ Update cfg based on multigrid settings.
+ Args:
+ cfg (configs): configs that contains training and multigrid specific
+ hyperparameters.
+ Returns:
+ cfg (configs): the updated cfg.
+ """
+ self.schedule = None
+ # We may modify cfg.DATASET.batch_size, cfg.PIPELINE.train.decode_sampler.num_frames, and
+ # cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] during training, so we store their original
+ # value in cfg and use them as global variables.
+ cfg.MULTIGRID.default_batch_size = cfg.DATASET.batch_size # total bs,64
+ cfg.MULTIGRID.default_temporal_size = cfg.PIPELINE.train.decode_sampler.num_frames # 32
+ cfg.MULTIGRID.default_crop_size = cfg.PIPELINE.train.transform[1][
+ 'MultiCrop']['target_size'] # 224
+
+ if cfg.MULTIGRID.LONG_CYCLE:
+ self.schedule = self.get_long_cycle_schedule(cfg)
+ cfg.OPTIMIZER.learning_rate.steps = [0] + [
+ s[-1] for s in self.schedule
+ ]
+ # Fine-tuning phase.
+ cfg.OPTIMIZER.learning_rate.steps[-1] = (
+ cfg.OPTIMIZER.learning_rate.steps[-2] +
+ cfg.OPTIMIZER.learning_rate.steps[-1]) // 2
+ cfg.OPTIMIZER.learning_rate.lrs = [
+ cfg.OPTIMIZER.learning_rate.gamma**s[0] * s[1][0]
+ for s in self.schedule
+ ]
+ # Fine-tuning phase.
+ cfg.OPTIMIZER.learning_rate.lrs = cfg.OPTIMIZER.learning_rate.lrs[:-1] + [
+ cfg.OPTIMIZER.learning_rate.lrs[-2],
+ cfg.OPTIMIZER.learning_rate.lrs[-1],
+ ]
+
+ cfg.OPTIMIZER.learning_rate.max_epoch = self.schedule[-1][-1]
+
+ elif cfg.MULTIGRID.SHORT_CYCLE:
+ cfg.OPTIMIZER.learning_rate.steps = [
+ int(s * cfg.MULTIGRID.epoch_factor)
+ for s in cfg.OPTIMIZER.learning_rate.steps
+ ]
+            cfg.OPTIMIZER.learning_rate.max_epoch = int(
+                cfg.OPTIMIZER.learning_rate.max_epoch *
+                cfg.MULTIGRID.epoch_factor)
+ return cfg
+
+ def update_long_cycle(self, cfg, cur_epoch):
+ """
+ Before every epoch, check if long cycle shape should change. If it
+ should, update cfg accordingly.
+ Args:
+ cfg (configs): configs that contains training and multigrid specific
+ hyperparameters.
+ cur_epoch (int): current epoch index.
+ Returns:
+ cfg (configs): the updated cfg.
+ changed (bool): whether to change long cycle shape at this epoch
+ """
+ base_b, base_t, base_s = get_current_long_cycle_shape(
+ self.schedule, cur_epoch)
+ if base_s != cfg.PIPELINE.train.transform[1]['MultiCrop'][
+ 'target_size'] or base_t != cfg.PIPELINE.train.decode_sampler.num_frames:
+ #NOTE Modify
+ # no need to modify, used by pool_size in head, None when multigrid
+ # cfg.MODEL.head.num_frames = base_t
+ # cfg.MODEL.head.crop_size = base_s
+ cfg.PIPELINE.train.decode_sampler.num_frames = base_t
+ cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] = base_s
+ cfg.DATASET.batch_size = base_b * cfg.MULTIGRID.default_batch_size #change bs
+
+ bs_factor = (float(cfg.DATASET.batch_size) /
+ cfg.MULTIGRID.bn_base_size)
+
+ if bs_factor == 1: #single bs == bn_base_size (== 8)
+ cfg.MODEL.backbone.bn_norm_type = "batchnorm"
+ else:
+ cfg.MODEL.backbone.bn_norm_type = "sub_batchnorm"
+ cfg.MODEL.backbone.bn_num_splits = int(bs_factor)
+
+ cfg.MULTIGRID.long_cycle_sampling_rate = cfg.PIPELINE.train.decode_sampler.sampling_rate * (
+ cfg.MULTIGRID.default_temporal_size // base_t)
+ print("Long cycle updates:")
+ print("\tbn_norm_type: {}".format(cfg.MODEL.backbone.bn_norm_type))
+ if cfg.MODEL.backbone.bn_norm_type == "sub_batchnorm":
+ print("\tbn_num_splits: {}".format(
+ cfg.MODEL.backbone.bn_num_splits))
+ print("\tTRAIN.batch_size[single card]: {}".format(
+ cfg.DATASET.batch_size))
+ print("\tDATA.NUM_FRAMES x LONG_CYCLE_SAMPLING_RATE: {}x{}".format(
+ cfg.PIPELINE.train.decode_sampler.num_frames,
+ cfg.MULTIGRID.long_cycle_sampling_rate))
+ print("\tDATA.train_crop_size: {}".format(
+ cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']))
+ return cfg, True
+ else:
+ return cfg, False
+
+ def get_long_cycle_schedule(self, cfg):
+ """
+ Based on multigrid hyperparameters, define the schedule of a long cycle.
+ Args:
+ cfg (configs): configs that contains training and multigrid specific
+ hyperparameters.
+ Returns:
+            schedule (list): Specifies a list of long cycle base shapes and their
+ corresponding training epochs.
+ """
+
+ steps = cfg.OPTIMIZER.learning_rate.steps
+
+ default_size = float(
+ cfg.PIPELINE.train.decode_sampler.num_frames *
+ cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']**
+            2)  # e.g. 32 * 224 * 224, i.e. T * S * S
+ default_iters = steps[-1] # 196
+
+ # Get shapes and average batch size for each long cycle shape.
+ avg_bs = []
+ all_shapes = []
+ # for t_factor, s_factor in cfg.MULTIGRID.long_cycle_factors:
+ for item in cfg.MULTIGRID.long_cycle_factors:
+ t_factor, s_factor = item["value"]
+ base_t = int(
+ round(cfg.PIPELINE.train.decode_sampler.num_frames * t_factor))
+ base_s = int(
+ round(
+ cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']
+ * s_factor))
+ if cfg.MULTIGRID.SHORT_CYCLE:
+ shapes = [
+ [
+ base_t,
+ cfg.MULTIGRID.default_crop_size *
+ cfg.MULTIGRID.short_cycle_factors[0],
+ ],
+ [
+ base_t,
+ cfg.MULTIGRID.default_crop_size *
+ cfg.MULTIGRID.short_cycle_factors[1],
+ ],
+ [base_t, base_s],
+ ] #first two is short_cycle, last is the base long_cycle
+ else:
+ shapes = [[base_t, base_s]]
+
+ # (T, S) -> (B, T, S)
+ shapes = [[
+ int(round(default_size / (s[0] * s[1] * s[1]))), s[0], s[1]
+ ] for s in shapes]
+ avg_bs.append(np.mean([s[0] for s in shapes]))
+ all_shapes.append(shapes)
+
+ # Get schedule regardless of cfg.MULTIGRID.epoch_factor.
+ total_iters = 0
+ schedule = []
+ for step_index in range(len(steps) - 1):
+ step_epochs = steps[step_index + 1] - steps[step_index]
+
+ for long_cycle_index, shapes in enumerate(all_shapes):
+                # ensure each long-cycle shape runs the same number of iterations
+ cur_epochs = (step_epochs * avg_bs[long_cycle_index] /
+ sum(avg_bs))
+
+ # get cur_iters from cur_epochs
+ cur_iters = cur_epochs / avg_bs[long_cycle_index]
+ total_iters += cur_iters
+ schedule.append((step_index, shapes[-1], cur_epochs))
+
+ iter_saving = default_iters / total_iters # ratio between default iters and real iters
+
+ final_step_epochs = cfg.OPTIMIZER.learning_rate.max_epoch - steps[-1]
+
+ # We define the fine-tuning phase to have the same amount of iteration
+ # saving as the rest of the training.
+    # final_step_epochs / iter_saving makes the fine-tuning phase use the same number of iterations as training
+ ft_epochs = final_step_epochs / iter_saving * avg_bs[-1]
+
+ # schedule.append((step_index + 1, all_shapes[-1][2], ft_epochs))
+ schedule.append((step_index + 1, all_shapes[-1][-1], ft_epochs))
+
+        # Obtain the final schedule given the desired cfg.MULTIGRID.epoch_factor.
+ x = (cfg.OPTIMIZER.learning_rate.max_epoch *
+ cfg.MULTIGRID.epoch_factor / sum(s[-1] for s in schedule))
+
+ final_schedule = []
+ total_epochs = 0
+ for s in schedule:
+ epochs = s[2] * x
+ total_epochs += epochs
+ final_schedule.append((s[0], s[1], int(round(total_epochs))))
+ print_schedule(final_schedule)
+ return final_schedule
+
+
+def print_schedule(schedule):
+ """
+ Log schedule.
+ """
+ print(
+ "Long_cycle_index\tBase_shape(bs_factor,temporal_size,crop_size)\tEpochs"
+ )
+ for s in schedule:
+ print("{}\t\t\t{}\t\t\t\t\t{}".format(s[0], s[1], s[2]))
+
+
+def get_current_long_cycle_shape(schedule, epoch):
+ """
+ Given a schedule and epoch index, return the long cycle base shape.
+ Args:
+        schedule (list): long cycle schedule as returned by
+            MultigridSchedule.get_long_cycle_schedule.
+        epoch (int): current epoch index.
+ Returns:
+ shapes (list): A list describing the base shape in a long cycle:
+ [batch size relative to default,
+ number of frames, spatial dimension].
+ """
+ for s in schedule:
+ if epoch < s[-1]:
+ return s[1]
+ return schedule[-1][1]
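+
+
+# Hedged usage sketch (the cfg fields follow the layout assumed above; the loop
+# body is schematic): this mirrors how a multigrid-aware training loop would
+# consume the schedule.
+#
+#     mg = MultigridSchedule()
+#     cfg = mg.init_multigrid(cfg)          # rewrites lr steps/lrs and max_epoch
+#     for epoch in range(cfg.OPTIMIZER.learning_rate.max_epoch):
+#         cfg, changed = mg.update_long_cycle(cfg, epoch)
+#         if changed:
+#             pass  # re-build the dataloader/model for the new (B, T, S) shape
+#         base_b, base_t, base_s = get_current_long_cycle_shape(mg.schedule, epoch)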
diff --git a/paddlevideo/utils/multigrid/save_load_helper.py b/paddlevideo/utils/multigrid/save_load_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..94a52d58b1deebafc41f78aff6e9a95e1322b71f
--- /dev/null
+++ b/paddlevideo/utils/multigrid/save_load_helper.py
@@ -0,0 +1,237 @@
+import os
+import numpy as np
+import paddle
+import copy
+
+
+def sub_to_normal_bn(sd):
+ """
+    When saving, convert Sub-BN parameters to normal BN parameters in a state dict.
+ There are two copies of BN layers in a Sub-BN implementation: `bn.bn` and
+ `bn.split_bn`. `bn.split_bn` is used during training and
+ "compute_precise_bn". Before saving or evaluation, its stats are copied to
+ `bn.bn`. We rename `bn.bn` to `bn` and store it to be consistent with normal
+ BN layers.
+ Args:
+ sd (OrderedDict): a dict of parameters which might contain Sub-BN
+ parameters.
+    Returns:
+        None. ``sd`` is modified in place, with Sub-BN parameter keys renamed
+        to normal BN parameter keys.
+ """
+ modifications = [
+ ("bn.bn._mean", "bn._mean"),
+ ("bn.bn._variance", "bn._variance"),
+ ]
+ to_remove = ["bn.bn.", ".split_bn."]
+ key_list = list(sd.keys()) #odict_keys to list
+ for key in key_list:
+ for before, after in modifications:
+ if key.endswith(before):
+ new_key = key.split(before)[0] + after
+ sd[new_key] = sd.pop(key)
+
+ for rm in to_remove:
+ if rm in key and key in sd:
+ del sd[key]
+
+
+def normal_to_sub_bn(checkpoint_sd, model_sd):
+ """
+    When loading, convert BN parameters to Sub-BN parameters if the model contains Sub-BNs.
+ Args:
+ checkpoint_sd (OrderedDict): source dict of parameters.
+ model_sd (OrderedDict): target dict of parameters.
+ Returns:
+ new_sd (OrderedDict): converted dict of parameters.
+ """
+ for key in model_sd:
+ if key not in checkpoint_sd:
+ # not to replace bn.weight and bn.bias
+ if "bn.split_bn." in key and "bn.weight" not in key and "bn.bias" not in key:
+ load_key = key.replace("bn.split_bn.", "bn.")
+ bn_key = key.replace("bn.split_bn.", "bn.bn.")
+ checkpoint_sd[key] = checkpoint_sd.pop(load_key)
+ checkpoint_sd[bn_key] = checkpoint_sd[key]
+
+ # match the shape of bn.split_bn._xx
+ # model_sd: split_bn.rm.shape = num_feature*num_split
+ # checkpoint_sd: split_bn.rm.shape = bn.rm.shape = num_feature
+ for key in model_sd:
+ if key in checkpoint_sd:
+ model_blob_shape = model_sd[key].shape #bn.split_bn
+ c2_blob_shape = checkpoint_sd[key].shape #bn.bn
+
+ if (len(model_blob_shape) == 1 and len(c2_blob_shape) == 1
+ and model_blob_shape[0] > c2_blob_shape[0]
+ and model_blob_shape[0] % c2_blob_shape[0] == 0):
+ before_shape = checkpoint_sd[key].shape
+ checkpoint_sd[key] = np.concatenate(
+ [checkpoint_sd[key]] *
+ (model_blob_shape[0] // c2_blob_shape[0]))
+                if 'split_bn' not in key:  # the split_bn expansion is expected, no need to log it
+ print("{} {} -> {}".format(key, before_shape,
+ checkpoint_sd[key].shape))
+ return checkpoint_sd
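+
+# Minimal illustration (hypothetical key and shapes): a plain-BN checkpoint entry
+# is repeated along axis 0 until it matches the split_bn shape expected by the
+# re-built model (num_features * num_splits).
+#
+#     model_sd = {"bn.split_bn._mean": paddle.zeros([128])}   # 64 features x 2 splits
+#     ckpt_sd = {"bn.split_bn._mean": np.zeros([64])}         # plain BN stats
+#     out = normal_to_sub_bn(ckpt_sd, model_sd)
+#     out["bn.split_bn._mean"].shape                          # -> (128,)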
+
+
+def mapping_opt_dict(opt_dict, model_key_list):
+ """
+    Map optimizer-state keys onto the parameter names of a re-built model.
+    Paddle auto-numbers layer names (e.g. conv3d_1.w -> conv3d_2.w), and the
+    prefix may also change between bn and sub_bn. When the model is re-built
+    (as in multigrid training), the new parameter names therefore no longer
+    match the names stored in the saved optimizer state, and setting the old
+    state dict on the new optimizer would fail because keys such as conv3d_2
+    cannot be found (only conv3d_1 exists). This function shifts the numeric
+    suffixes (and swaps the bn/sub_bn prefix when needed) so the keys line up.
+    Args:
+        opt_dict: optimizer state dict, holding parameter names and their
+            accumulated gradient statistics.
+        model_key_list: list of parameter names of the re-built model.
+    Returns:
+        optimizer state dict with remapped keys.
+ """
+ def get_name_info(PNAME, PN_key_list, key_list):
+ min_index = float('inf')
+ max_index = 0
+ for name in PN_key_list[1:]:
+ for key in key_list:
+ if name in key:
+ index = int(key.split('.')[0].split(name)[-1])
+ if index < min_index:
+ min_index = index
+ if index > max_index:
+ max_index = index
+ num_name = max_index - min_index + 1
+ PNAME[name].append((min_index, max_index, num_name))
+ min_index = float('inf')
+ max_index = 0
+
+ PNAME = {
+ "LR_Scheduler": [],
+ "conv3d_": [],
+ "linear_": [],
+ "sub_batch_norm3d_": [],
+ "batch_norm3d_": [],
+ }
+
+ pd_key_list = list(opt_dict.keys())
+ print("The number of parameters in saved optimizer state dict = {}".format(
+ len(pd_key_list)))
+ print("The number of parameters in re-build model list = {}".format(
+ len(model_key_list)))
+    # index 0 is "LR_Scheduler"; the remaining entries are layer-name prefixes
+ PN_key_list = list(PNAME.keys())
+
+ # get the number of each PNAME
+ get_name_info(PNAME, PN_key_list, pd_key_list)
+ get_name_info(PNAME, PN_key_list, model_key_list)
+ print("[Parameters info] prefix: min_index, max_index, number_params: \n",
+ PNAME)
+
+ # whether to change name of bn layer
+ change_name = False
+ if PNAME["sub_batch_norm3d_"][0][-1] == -float('inf'):
+ PN_key_list.remove("sub_batch_norm3d_")
+ if PNAME["sub_batch_norm3d_"][1][-1] != -float('inf'):
+ print(
+                "Optimizer state dict was saved with bn, but the re-built model uses sub_bn; renaming keys!"
+ )
+ change_name = True
+ else:
+            print("Optimizer state dict was saved with bn, and the re-built model uses bn")
+ else:
+ PN_key_list.remove("batch_norm3d_")
+ if PNAME["sub_batch_norm3d_"][1][-1] == -float('inf'):
+ print(
+                "Optimizer state dict was saved with sub_bn, but the re-built model uses bn; renaming keys!"
+ )
+ change_name = True
+ else:
+ print(
+                "Optimizer state dict was saved with sub_bn, and the re-built model uses sub_bn")
+
+ #update key name
+ # sub_bn -> bn name mapping, pre-define dict
+ change_dict = {
+ "sub_batch_norm3d_": "batch_norm3d_",
+ "batch_norm3d_": "sub_batch_norm3d_"
+ }
+ for key in pd_key_list:
+ for name in PN_key_list[1:]:
+ if key.startswith(name):
+ start = change_dict[name] if (
+ change_name and "batch_norm" in name) else name
+ str_index = key.split('.')[0].split(name)[-1]
+ index = int(str_index)
+ new_index = str(index +
+ (PNAME[start][1][0] - PNAME[name][0][0]))
+ end = key.split('.')[-1]
+ update_key = start + new_index + '.' + end
+ opt_dict[update_key] = opt_dict.pop(key)
+
+ return opt_dict
+
+
+def subn_save(save_dir, name_prefix, epoch, video_model, optimizer):
+ if not os.path.isdir(save_dir):
+ os.makedirs(save_dir)
+ model_path = os.path.join(save_dir, name_prefix + "{:05d}".format(epoch))
+ model_dict = video_model.state_dict()
+ sub_to_normal_bn(model_dict)
+ opti_dict = optimizer.state_dict()
+ paddle.save(model_dict, model_path + '.pdparams')
+ paddle.save(opti_dict, model_path + '.pdopt')
+ print('[Saved Epoch {} parameters and optimizer state ]'.format(epoch))
+
+
+def subn_load(model, ck_path, optimizer=None):
+ """
+ Load the checkpoint from the given file.
+ Args:
+ model (model): model to load the weights from the checkpoint.
+ optimizer (optim, optional): optimizer to load the historical state.
+ ck_path (str): checkpoint path
+    Returns:
+        None. The weights (and optionally the optimizer state) are loaded in place.
+ """
+
+ assert os.path.exists(ck_path + ".pdparams"), \
+        "Given checkpoint {}.pdparams does not exist.".format(ck_path)
+    print("load checkpoint from {}.pdparams".format(ck_path))
+
+ model_dict = model.state_dict()
+ checkpoint_dict = paddle.load(ck_path + ".pdparams")
+ # checkpoint_dict = copy.deepcopy(checkpoint_dict_orig) #not modify when multi card
+ pre_train_dict = normal_to_sub_bn(checkpoint_dict, model_dict)
+
+ # Match pre-trained weights that have same shape as current model.
+ pre_train_dict_match = {
+ k: v
+ for k, v in pre_train_dict.items()
+ if k in model_dict and tuple(v.shape) == tuple(model_dict[k].shape)
+ }
+
+ # Weights that do not have match from the pre-trained model.
+ not_load_layers = [
+ k for k in model_dict.keys() if k not in pre_train_dict_match.keys()
+ ]
+ # Log weights that are not loaded with the pre-trained weights.
+ if not_load_layers:
+ for k in not_load_layers:
+ if 'bn.weight' not in k and 'bn.bias' not in k:
+ print("Network weights {} not loaded.".format(k))
+
+ # Load pre-trained weights.
+ model.set_state_dict(pre_train_dict_match)
+
+ if optimizer:
+ assert os.path.exists(ck_path + ".pdopt"), \
+            "Given checkpoint {}.pdopt does not exist.".format(ck_path)
+        print("load checkpoint from {}.pdopt".format(ck_path))
+ opt_dict = paddle.load(ck_path + ".pdopt")
+ # get parameters that required gradient from re-build model
+ model_key_list = []
+ for param in model.parameters():
+            if not param.stop_gradient:
+ model_key_list.append(param.name)
+
+ new_opt_dict = mapping_opt_dict(opt_dict, model_key_list)
+ optimizer.set_state_dict(new_opt_dict)
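+
+
+# Hedged usage sketch (paths, prefix and epoch number are placeholders): a
+# typical save/resume round-trip with the helpers above.
+#
+#     subn_save("output/SlowFast/", "SlowFast_epoch_", 10, model, optimizer)
+#     # ... later, after re-building the model/optimizer for a new multigrid shape:
+#     subn_load(model, "output/SlowFast/SlowFast_epoch_00010", optimizer)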
diff --git a/paddlevideo/utils/multigrid/short_sampler.py b/paddlevideo/utils/multigrid/short_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..0004dace4f33bee731c9163bf4d2b876413eb952
--- /dev/null
+++ b/paddlevideo/utils/multigrid/short_sampler.py
@@ -0,0 +1,147 @@
+from __future__ import print_function
+from __future__ import division
+
+import numpy as np
+import math
+
+from paddle.io import BatchSampler
+
+__all__ = ["DistributedShortSampler"]
+
+
+class DistributedShortSampler(BatchSampler):
+ """Sampler that restricts data loading to a subset of the dataset.
+    In such cases, each process can pass a DistributedShortSampler instance
+    as a DataLoader batch sampler and load a subset of the original dataset
+    that is exclusive to it.
+    .. note::
+        The batch size changes dynamically following the short cycle schedule.
+
+    Args:
+        dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement
+                     or other python object which implemented
+                     `__len__` for BatchSampler to get sample
+                     number of data source.
+        batch_sizes(list): list of batch sizes within one short cycle.
+        num_replicas(int, optional): process number in distributed training.
+            If :attr:`num_replicas` is None, :attr:`num_replicas` will be
+            retrieved from :code:`paddle.distributed.ParallelEnv`.
+            Default None.
+        rank(int, optional): the rank of the current process among :attr:`num_replicas`
+            processes. If :attr:`rank` is None, :attr:`rank` is retrieved from
+            :code:`paddle.distributed.ParallelEnv`. Default None.
+        shuffle(bool): whether to shuffle indices before generating
+            batch indices. Default False.
+        drop_last(bool): whether to drop the last incomplete batch when the
+            dataset size is not divisible by the batch size. Default False.
+ """
+ def __init__(self,
+ dataset,
+ batch_sizes,
+ num_replicas=None,
+ rank=None,
+ shuffle=False,
+ drop_last=False):
+ self.dataset = dataset
+
+        assert all(isinstance(batch_size, int) and batch_size > 0 for batch_size in batch_sizes), \
+            "every batch_size should be a positive integer"
+ self.batch_sizes = batch_sizes
+ self.len_batch_sizes = len(self.batch_sizes)
+ assert isinstance(shuffle, bool), \
+ "shuffle should be a boolean value"
+ self.shuffle = shuffle
+ assert isinstance(drop_last, bool), \
+            "drop_last should be a boolean value"
+
+ from paddle.distributed import ParallelEnv
+
+ if num_replicas is not None:
+ assert isinstance(num_replicas, int) and num_replicas > 0, \
+ "num_replicas should be a positive integer"
+ self.nranks = num_replicas
+ else:
+ self.nranks = ParallelEnv().nranks
+
+ if rank is not None:
+ assert isinstance(rank, int) and rank >= 0, \
+ "rank should be a non-negative integer"
+ self.local_rank = rank
+ else:
+ self.local_rank = ParallelEnv().local_rank
+
+ self.drop_last = drop_last
+ self.epoch = 0
+ self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))
+ self.total_size = self.num_samples * self.nranks
+
+ def __iter__(self):
+ num_samples = len(self.dataset)
+ indices = np.arange(num_samples).tolist()
+ indices += indices[:(self.total_size -
+                             len(indices))]  # pad with leading indices so every rank gets the same count
+ assert len(indices) == self.total_size
+ if self.shuffle:
+ np.random.RandomState(self.epoch).shuffle(indices)
+ self.epoch += 1
+
+ # subsample
+ def _get_indices_by_batch_size(indices):
+ total_batch_size = sum(self.batch_sizes)
+ subsampled_indices = []
+ last_batch_size = self.total_size % (
+ total_batch_size * self.nranks) #number samples of last batch
+ assert last_batch_size % self.nranks == 0
+ last_local_batch_size = last_batch_size // self.nranks
+
+ for i in range(self.local_rank * total_batch_size,
+ len(indices) - last_batch_size,
+ total_batch_size * self.nranks):
+ subsampled_indices.extend(indices[i:i + total_batch_size])
+
+ indices = indices[len(indices) - last_batch_size:]
+ subsampled_indices.extend(
+ indices[self.local_rank *
+ last_local_batch_size:(self.local_rank + 1) *
+ last_local_batch_size])
+ return subsampled_indices
+
+ if self.nranks > 1:
+ indices = _get_indices_by_batch_size(indices)
+
+        assert len(indices) == self.num_samples  # number of indices assigned to each card
+ _sample_iter = iter(indices)
+
+ batch_indices = []
+ counter = 0
+ batch_size = self.batch_sizes[0]
+ for idx in _sample_iter:
+ batch_indices.append(
+ (idx, counter %
+                 self.len_batch_sizes))  # (index, short-cycle id) consumed by the dataset's __getitem__
+ if len(batch_indices) == batch_size:
+ yield batch_indices
+ counter += 1
+ batch_size = self.batch_sizes[counter % self.len_batch_sizes]
+ batch_indices = []
+ if not self.drop_last and len(batch_indices) > 0:
+ yield batch_indices
+
+ def __len__(self):
+ avg_batch_size = sum(self.batch_sizes) / float(self.len_batch_sizes)
+ if self.drop_last:
+ return int(np.floor(self.num_samples / avg_batch_size))
+ else:
+ return int(np.ceil(self.num_samples / avg_batch_size))
+
+ def set_epoch(self, epoch):
+ """
+ Sets the epoch number. When :attr:`shuffle=True`, this number is used
+        as the seed of the random number generator. By default, users do not
+        need to set this; all replicas (workers) use a different random
+        ordering for each epoch. If the same number is set at each epoch, this
+        sampler will yield the same ordering at all epochs.
+ Arguments:
+ epoch (int): Epoch number.
+ """
+ self.epoch = epoch
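+
+
+# Hedged usage sketch (dataset and batch sizes are illustrative): the sampler
+# yields (index, short_cycle_idx) tuples, which the dataset's __getitem__ can use
+# to pick the matching crop size for that batch.
+#
+#     from paddle.io import DataLoader
+#     sampler = DistributedShortSampler(dataset,
+#                                       batch_sizes=[32, 16, 8],
+#                                       shuffle=True,
+#                                       drop_last=True)
+#     loader = DataLoader(dataset, batch_sampler=sampler, num_workers=4)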
diff --git a/paddlevideo/utils/precise_bn.py b/paddlevideo/utils/precise_bn.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e6660d4c3b23ac95784662fb9d1eaca8aa9aed5
--- /dev/null
+++ b/paddlevideo/utils/precise_bn.py
@@ -0,0 +1,99 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import itertools
+
+from paddlevideo.utils import get_logger
+logger = get_logger("paddlevideo")
+"""
+Implement precise bn, which is useful for improving accuracy.
+"""
+
+
+@paddle.no_grad() # speed up and save CUDA memory
+def do_preciseBN(model,
+ data_loader,
+ parallel,
+ num_iters=200,
+ use_amp=False,
+ amp_level=None):
+ """
+ Recompute and update the batch norm stats to make them more precise. During
+ training both BN stats and the weight are changing after every iteration, so
+ the running average can not precisely reflect the actual stats of the
+ current model.
+ In this function, the BN stats are recomputed with fixed weights, to make
+ the running average more precise. Specifically, it computes the true average
+ of per-batch mean/variance instead of the running average.
+ This is useful to improve validation accuracy.
+ Args:
+ model: the model whose bn stats will be recomputed
+        data_loader: an iterator that produces data to feed to the model
+        num_iters: number of iterations used to compute the stats.
+    Return:
+        None. The BN layers of `model` are updated in place with precise
+        mean and variance.
+ """
+ bn_layers_list = [
+ m for m in model.sublayers()
+ if any((isinstance(m, bn_type)
+ for bn_type in (paddle.nn.BatchNorm1D, paddle.nn.BatchNorm2D,
+ paddle.nn.BatchNorm3D))) and m.training
+ ]
+ if len(bn_layers_list) == 0:
+ return
+
+    # moving_mean = moving_mean * momentum + batch_mean * (1. - momentum)
+ # we set momentum=0. to get the true mean and variance during forward
+ momentum_actual = [bn._momentum for bn in bn_layers_list]
+ for bn in bn_layers_list:
+ bn._momentum = 0.
+
+ running_mean = [paddle.zeros_like(bn._mean)
+                    for bn in bn_layers_list]  # pre-allocated accumulators
+ running_var = [paddle.zeros_like(bn._variance) for bn in bn_layers_list]
+
+ ind = -1
+ for ind, data in enumerate(itertools.islice(data_loader, num_iters)):
+ logger.info("doing precise BN {} / {}...".format(ind + 1, num_iters))
+
+ if parallel:
+ if use_amp:
+ with paddle.amp.auto_cast(custom_black_list={"reduce_mean"},
+ level=amp_level):
+ model._layers.train_step(data)
+ else:
+ model._layers.train_step(data)
+ else:
+ if use_amp:
+ with paddle.amp.auto_cast(custom_black_list={"reduce_mean"},
+ level=amp_level):
+ model.train_step(data)
+ else:
+ model.train_step(data)
+
+ for i, bn in enumerate(bn_layers_list):
+ # Accumulates the bn stats.
+ running_mean[i] += (bn._mean - running_mean[i]) / (ind + 1)
+ running_var[i] += (bn._variance - running_var[i]) / (ind + 1)
+
+ assert ind == num_iters - 1, (
+ "update_bn_stats is meant to run for {} iterations, but the dataloader stops at {} iterations."
+ .format(num_iters, ind))
+
+ # Sets the precise bn stats.
+ for i, bn in enumerate(bn_layers_list):
+ bn._mean.set_value(running_mean[i])
+ bn._variance.set_value(running_var[i])
+ bn._momentum = momentum_actual[i]
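+
+
+# Hedged usage sketch (loader and model are placeholders): typically called at
+# the end of selected training epochs, before validation, so that the BN running
+# stats reflect the final weights.
+#
+#     do_preciseBN(model, train_loader, parallel=False,
+#                  num_iters=min(200, len(train_loader)))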
diff --git a/paddlevideo/utils/profiler.py b/paddlevideo/utils/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..a75018907f203e33df387889ff735c36273a2143
--- /dev/null
+++ b/paddlevideo/utils/profiler.py
@@ -0,0 +1,109 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+import paddle
+
+# A global variable to record the number of calling times for profiler
+# functions. It is used to specify the tracing range of training steps.
+_profiler_step_id = 0
+
+# A global variable to avoid parsing from string every time.
+_profiler_options = None
+
+
+class ProfilerOptions(object):
+ """
+ Use a string to initialize a ProfilerOptions.
+    The string should be in the format: "key1=value1;key2=value2;key3=value3".
+ For example:
+ "profile_path=model.profile"
+ "batch_range=[50, 60]; profile_path=model.profile"
+ "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
+
+ ProfilerOptions supports following key-value pair:
+      batch_range    - an integer list, e.g. [100, 110].
+      state          - a string, the optional values are 'CPU', 'GPU' or 'All'.
+      sorted_key     - a string, the optional values are 'calls', 'total',
+                       'max', 'min' or 'ave'.
+ tracer_option - a string, the optional values are 'Default', 'OpDetail',
+ 'AllOpDetail'.
+ profile_path - a string, the path to save the serialized profile data,
+ which can be used to generate a timeline.
+ exit_on_finished - a boolean.
+ """
+ def __init__(self, options_str):
+ assert isinstance(options_str, str)
+
+ self._options = {
+ 'batch_range': [10, 20],
+ 'state': 'All',
+ 'sorted_key': 'total',
+ 'tracer_option': 'Default',
+ 'profile_path': '/tmp/profile',
+ 'exit_on_finished': True
+ }
+ self._parse_from_string(options_str)
+
+ def _parse_from_string(self, options_str):
+ for kv in options_str.replace(' ', '').split(';'):
+ key, value = kv.split('=')
+ if key == 'batch_range':
+ value_list = value.replace('[', '').replace(']', '').split(',')
+ value_list = list(map(int, value_list))
+ if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
+ 1] > value_list[0]:
+ self._options[key] = value_list
+ elif key == 'exit_on_finished':
+ self._options[key] = value.lower() in ("yes", "true", "t", "1")
+ elif key in [
+ 'state', 'sorted_key', 'tracer_option', 'profile_path'
+ ]:
+ self._options[key] = value
+
+ def __getitem__(self, name):
+ if self._options.get(name, None) is None:
+ raise ValueError(
+ "ProfilerOptions does not have an option named %s." % name)
+ return self._options[name]
+
+
+def add_profiler_step(options_str: str = None) -> None:
+ """Enable the operator-level timing using PaddlePaddle's profiler.
+    The profiler uses an independent variable to count the profiler steps.
+ One call of this function is treated as a profiler step.
+
+ Args:
+ options_str (str, optional): a string to initialize the ProfilerOptions. Defaults to None.
+ """
+ if options_str is None:
+ return
+
+ global _profiler_step_id
+ global _profiler_options
+
+ if _profiler_options is None:
+ _profiler_options = ProfilerOptions(options_str)
+
+ if _profiler_step_id == _profiler_options['batch_range'][0]:
+ paddle.utils.profiler.start_profiler(_profiler_options['state'],
+ _profiler_options['tracer_option'])
+ elif _profiler_step_id == _profiler_options['batch_range'][1]:
+ paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'],
+ _profiler_options['profile_path'])
+ if _profiler_options['exit_on_finished']:
+ sys.exit(0)
+
+ _profiler_step_id += 1
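+
+
+# Hedged usage sketch (the option string is illustrative; any subset of the
+# documented keys may be passed):
+#
+#     for step, batch in enumerate(train_loader):
+#         add_profiler_step("batch_range=[10, 20]; profile_path=/tmp/profile")
+#         ...  # normal training step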
diff --git a/paddlevideo/utils/record.py b/paddlevideo/utils/record.py
new file mode 100644
index 0000000000000000000000000000000000000000..13e35e81810459d0eae868aaac875d0f078cbad4
--- /dev/null
+++ b/paddlevideo/utils/record.py
@@ -0,0 +1,148 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+
+import paddle
+
+from .logger import coloring, get_logger
+
+logger = get_logger("paddlevideo")
+
+__all__ = ['AverageMeter', 'build_record', 'log_batch', 'log_epoch']
+
+
+def build_record(cfg):
+ record_list = [
+ ("loss", AverageMeter('loss', '7.5f')),
+ ("lr", AverageMeter('lr', 'f', need_avg=False)),
+ ]
+    if 'Recognizer1D' in cfg.framework:  # TODO: metrics are selected by substrings of the framework name
+ record_list.append(("hit_at_one", AverageMeter("hit_at_one", '.5f')))
+ record_list.append(("perr", AverageMeter("perr", '.5f')))
+ record_list.append(("gap", AverageMeter("gap", '.5f')))
+ elif 'Recognizer' in cfg.framework:
+ record_list.append(("top1", AverageMeter("top1", '.5f')))
+ record_list.append(("top5", AverageMeter("top5", '.5f')))
+ elif 'FastRCNN' in cfg.framework:
+ record_list.append(
+ ("recall@thr=0.5", AverageMeter("recall@thr=0.5", '.5f')))
+ record_list.append(("prec@thr=0.5", AverageMeter("prec@thr=0.5",
+ '.5f')))
+ record_list.append(("recall@top3", AverageMeter("recall@top3", '.5f')))
+ record_list.append(("prec@top3", AverageMeter("prec@top3", '.5f')))
+ record_list.append(("recall@top5", AverageMeter("recall@top5", '.5f')))
+ record_list.append(("prec@top5", AverageMeter("prec@top5", '.5f')))
+ record_list.append(("mAP@0.5IOU", AverageMeter("mAP@0.5IOU", '.5f')))
+ elif 'DepthEstimator' in cfg.framework:
+ record_list.append(("abs_rel", AverageMeter("abs_rel", '.5f')))
+ record_list.append(("sq_rel", AverageMeter("sq_rel", '.5f')))
+ record_list.append(("rmse", AverageMeter("rmse", '.5f')))
+ record_list.append(("rmse_log", AverageMeter("rmse_log", '.5f')))
+ record_list.append(("a1", AverageMeter("a1", '.5f')))
+ record_list.append(("a2", AverageMeter("a2", '.5f')))
+ record_list.append(("a3", AverageMeter("a3", '.5f')))
+ record_list.append(("losses_day", AverageMeter("losses_day", '.5f')))
+ record_list.append(("losses_night", AverageMeter("losses_night",
+ '.5f')))
+ elif 'MSTCN' in cfg.framework or 'ASRF' in cfg.framework:
+ record_list.append(("F1@0.50", AverageMeter("F1@0.50", '.5f')))
+
+ record_list.append(("batch_time", AverageMeter('batch_cost', '.5f')))
+ record_list.append(("reader_time", AverageMeter('reader_cost', '.5f')))
+ record_list = OrderedDict(record_list)
+ return record_list
+
+
+class AverageMeter(object):
+ """
+ Computes and stores the average and current value
+ """
+
+ def __init__(self, name='', fmt='f', need_avg=True):
+ self.name = name
+ self.fmt = fmt
+ self.need_avg = need_avg
+ self.reset()
+
+ def reset(self):
+ """ reset """
+ self.val = 0
+ self.avg = 0
+ self.sum = 0
+ self.count = 0
+
+ def update(self, val, n=1):
+ """ update """
+ if isinstance(val, paddle.Tensor):
+ val = val.numpy()[0]
+ self.val = val
+ self.sum += val * n
+ self.count += n
+ self.avg = self.sum / self.count
+
+ @property
+ def total(self):
+ return '{self.name}_sum: {self.sum:{self.fmt}}'.format(self=self)
+
+ @property
+ def total_minute(self):
+ return '{self.name}_sum: {s:{self.fmt}} min'.format(s=self.sum / 60,
+ self=self)
+
+ @property
+ def mean(self):
+ return '{self.name}_avg: {self.avg:{self.fmt}}'.format(
+ self=self) if self.need_avg else ''
+
+ @property
+ def value(self):
+ return '{self.name}: {self.val:{self.fmt}}'.format(self=self)
+
+
+def log_batch(metric_list, batch_id, epoch_id, total_epoch, mode, ips):
+ batch_cost = str(metric_list['batch_time'].value) + ' sec,'
+ reader_cost = str(metric_list['reader_time'].value) + ' sec,'
+
+ metric_values = []
+ for m in metric_list:
+ if not (m == 'batch_time' or m == 'reader_time'):
+ metric_values.append(metric_list[m].value)
+ metric_str = ' '.join([str(v) for v in metric_values])
+ epoch_str = "epoch:[{:>3d}/{:<3d}]".format(epoch_id, total_epoch)
+ step_str = "{:s} step:{:<4d}".format(mode, batch_id)
+
+ logger.info("{:s} {:s} {:s} {:s} {:s} {}".format(
+ coloring(epoch_str, "HEADER") if batch_id == 0 else epoch_str,
+ coloring(step_str, "PURPLE"), coloring(metric_str, 'OKGREEN'),
+ coloring(batch_cost, "OKGREEN"), coloring(reader_cost, 'OKGREEN'), ips))
+
+
+def log_epoch(metric_list, epoch, mode, ips):
+ batch_cost = 'avg_' + str(metric_list['batch_time'].value) + ' sec,'
+ reader_cost = 'avg_' + str(metric_list['reader_time'].value) + ' sec,'
+ batch_sum = str(metric_list['batch_time'].total) + ' sec,'
+
+ metric_values = []
+ for m in metric_list:
+ if not (m == 'batch_time' or m == 'reader_time'):
+ metric_values.append(metric_list[m].mean)
+ metric_str = ' '.join([str(v) for v in metric_values])
+
+ end_epoch_str = "END epoch:{:<3d}".format(epoch)
+
+ logger.info("{:s} {:s} {:s} {:s} {:s} {:s} {}".format(
+ coloring(end_epoch_str, "RED"), coloring(mode, "PURPLE"),
+ coloring(metric_str, "OKGREEN"), coloring(batch_cost, "OKGREEN"),
+ coloring(reader_cost, "OKGREEN"), coloring(batch_sum, "OKGREEN"), ips))
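+
+
+# Hedged usage sketch (cfg, optimizer and ips are placeholders; metric values are
+# illustrative):
+#
+#     record_list = build_record(cfg.MODEL)   # cfg.MODEL.framework selects the metrics
+#     record_list['loss'].update(avg_loss, batch_size)
+#     record_list['lr'].update(optimizer.get_lr(), batch_size)
+#     log_batch(record_list, batch_id, epoch + 1, total_epochs, "train", ips)
+#     ...
+#     log_epoch(record_list, epoch + 1, "train", ips)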
diff --git a/paddlevideo/utils/registry.py b/paddlevideo/utils/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..81b76bd51f55013cb45f2b923c3e518bfb218d53
--- /dev/null
+++ b/paddlevideo/utils/registry.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class Registry(object):
+ """
+ The registry that provides name -> object mapping, to support third-party users' custom modules.
+
+ To register an object:
+
+ .. code-block:: python
+
+ BACKBONES = Registry('backbone')
+ @BACKBONES.register()
+ class ResNet:
+ pass
+ Or:
+ .. code-block:: python
+
+ BACKBONES = Registry('backbone')
+ class ResNet:
+ pass
+ BACKBONES.register(ResNet)
+
+ Usage: To build a module.
+
+ .. code-block:: python
+ backbone_name = "ResNet"
+ b = BACKBONES.get(backbone_name)()
+
+ """
+ def __init__(self, name):
+ """
+ Args:
+ name (str): the name of this registry
+ """
+ self._name = name
+ self._obj_map = {}
+
+ def __contains__(self, key):
+ return self._obj_map.get(key) is not None
+
+ def _do_register(self, name, obj):
+ assert (
+ name not in self._obj_map
+ ), "An object named '{}' was already registered in '{}' registry!".format(
+ name, self._name)
+ self._obj_map[name] = obj
+
+ def register(self, obj=None, name=None):
+ """
+        Register the given object under the name `obj.__name__`.
+ Can be used as either a decorator or not. See docstring of this class for usage.
+ """
+ if obj is None:
+ # used as a decorator
+ def deco(func_or_class, name=name):
+ if name is None:
+ name = func_or_class.__name__
+ self._do_register(name, func_or_class)
+ return func_or_class
+
+ return deco
+
+ # used as a function call
+ if name is None:
+ name = obj.__name__
+ self._do_register(name, obj)
+
+ def get(self, name):
+ """Get the registry record.
+
+ Args:
+ name (str): The class name.
+
+ Returns:
+ ret: The class.
+ """
+ ret = self._obj_map.get(name)
+ if ret is None:
+ raise KeyError(
+ "No object named '{}' found in '{}' registry!".format(
+ name, self._name))
+
+ return ret
diff --git a/paddlevideo/utils/save_load.py b/paddlevideo/utils/save_load.py
new file mode 100644
index 0000000000000000000000000000000000000000..71465cbb5f5a44d3741eeca287587e9c073714de
--- /dev/null
+++ b/paddlevideo/utils/save_load.py
@@ -0,0 +1,302 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import os.path as osp
+import time
+
+import paddle
+import paddle.nn.functional as F
+from paddlevideo.utils import get_logger, main_only
+from tqdm import tqdm
+
+
+def pretrain_swin_param_trans(model, state_dicts):
+ # delete classifier's params
+ if 'head.fc' + '.weight' in state_dicts:
+ del state_dicts['head.fc' + '.weight']
+ if 'head.fc' + '.bias' in state_dicts:
+ del state_dicts['head.fc' + '.bias']
+
+ state_dicts = {
+ k.replace('backbone.', ''): v
+ for k, v in state_dicts.items()
+ }
+
+ if len(state_dicts) == len(model.state_dict()):
+ print("Load 3D weights")
+ return state_dicts
+
+ print("Load 2D weights")
+ relative_position_index_keys = [
+ k for k in state_dicts.keys() if "relative_position_index" in k
+ ]
+ for k in relative_position_index_keys:
+ del state_dicts[k]
+
+ # delete attn_mask since we always re-init it
+ attn_mask_keys = [k for k in state_dicts.keys() if "attn_mask" in k]
+ for k in attn_mask_keys:
+ del state_dicts[k]
+
+ state_dicts['patch_embed.proj.weight'] = state_dicts[
+ 'patch_embed.proj.weight'].unsqueeze(2).tile(
+ [1, 1, model.patch_size[0], 1, 1]) / model.patch_size[0]
+
+ # bicubic interpolate relative_position_bias_table if not match
+ relative_position_bias_table_keys = [
+ k for k in state_dicts.keys() if "relative_position_bias_table" in k
+ ]
+ total_len = len(relative_position_bias_table_keys)
+ with tqdm(total=total_len,
+ position=1,
+ bar_format='{desc}',
+ desc="Loading weights") as desc:
+ for key in tqdm(relative_position_bias_table_keys,
+ total=total_len,
+ position=0):
+ relative_position_bias_table_pretrained = state_dicts[key]
+ relative_position_bias_table_current = model.state_dict()[key]
+ L1, nH1 = relative_position_bias_table_pretrained.shape
+ L2, nH2 = relative_position_bias_table_current.shape
+ L2 = (2 * model.window_size[1] - 1) * (2 * model.window_size[2] - 1)
+ wd = model.window_size[0]
+ if nH1 != nH2:
+ desc.set_description(f"Error in loading {key}, skip")
+ else:
+ if L1 != L2:
+ S1 = int(L1**0.5)
+ relative_position_bias_table_pretrained_resized = paddle.nn.functional.interpolate(
+ relative_position_bias_table_pretrained.transpose(
+ [1, 0]).reshape([1, nH1, S1, S1]),
+ size=(2 * model.window_size[1] - 1,
+ 2 * model.window_size[2] - 1),
+ mode='bicubic')
+ relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.reshape(
+ [nH2, L2]).transpose([1, 0])
+ desc.set_description(f"Loading {key}")
+ state_dicts[key] = relative_position_bias_table_pretrained.tile(
+ [2 * wd - 1, 1])
+ time.sleep(0.01)
+ ret_str = "loading {:<20d} weights completed.".format(
+ len(model.state_dict()))
+ desc.set_description(ret_str)
+ return state_dicts
+
+
+def pretrain_vit_param_trans(model, state_dicts, num_patches, num_seg,
+ attention_type):
+ """
+ Convert ViT's pre-trained model parameters to a parameter dictionary that matches the existing model
+ """
+ if 'head' + '.weight' in state_dicts:
+ del state_dicts['head' + '.weight']
+ if 'head' + '.bias' in state_dicts:
+ del state_dicts['head' + '.bias']
+
+ total_len = len(model.state_dict())
+ if num_patches + 1 != state_dicts['pos_embed'].shape[1]:
+ pos_embed = state_dicts['pos_embed']
+ cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)
+ other_pos_embed = pos_embed[0,
+ 1:, :].unsqueeze(0).unsqueeze(1).transpose(
+ (0, 1, 3, 2))
+ new_pos_embed = F.interpolate(other_pos_embed,
+ size=(other_pos_embed.shape[-2],
+ num_patches),
+ mode='nearest')
+ new_pos_embed = new_pos_embed.squeeze(0).transpose((0, 2, 1))
+ new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), axis=1)
+ state_dicts['pos_embed'] = new_pos_embed
+ time.sleep(0.01)
+
+ if 'time_embed' in state_dicts and num_seg != state_dicts[
+ 'time_embed'].shape[1]:
+ time_embed = state_dicts['time_embed'].transpose((0, 2, 1)).unsqueeze(0)
+ new_time_embed = F.interpolate(time_embed,
+ size=(time_embed.shape[-2], num_seg),
+ mode='nearest')
+ state_dicts['time_embed'] = new_time_embed.squeeze(0).transpose(
+ (0, 2, 1))
+ time.sleep(0.01)
+ with tqdm(total=total_len,
+ position=1,
+ bar_format='{desc}',
+ desc="Loading weights") as desc:
+ if attention_type == 'divided_space_time':
+ new_state_dicts = state_dicts.copy()
+ for key in tqdm(state_dicts):
+ if 'blocks' in key and 'attn' in key:
+ desc.set_description("Loading %s" % key)
+ new_key = key.replace('attn', 'temporal_attn')
+ if not new_key in state_dicts:
+ new_state_dicts[new_key] = state_dicts[key]
+ else:
+ new_state_dicts[new_key] = state_dicts[new_key]
+ if 'blocks' in key and 'norm1' in key:
+ desc.set_description("Loading %s" % key)
+ new_key = key.replace('norm1', 'temporal_norm1')
+ if not new_key in state_dicts:
+ new_state_dicts[new_key] = state_dicts[key]
+ else:
+ new_state_dicts[new_key] = state_dicts[new_key]
+ time.sleep(0.01)
+ ret_str = "loading {:<20d} weights completed.".format(
+ len(model.state_dict()))
+ desc.set_description(ret_str)
+ return new_state_dicts
+
+
+def pretrain_resnet18_param_trans(model, loaded_dict):
+ encoder_dict = model.encoder.state_dict()
+ pose_encoder_dict = model.pose_encoder.state_dict()
+
+ names = ['encoder.', 'encoder_day.', 'encoder_night.']
+ for name in names:
+ total_len = len(loaded_dict.items())
+ with tqdm(total=total_len,
+ position=1,
+ bar_format='{desc}',
+ desc="Loading weights") as desc:
+ for key, value in tqdm(loaded_dict.items(),
+ total=total_len,
+ position=0):
+ key = str(name + key)
+ if key in encoder_dict:
+ encoder_dict[key] = value
+ desc.set_description('Loading %s' % key)
+ time.sleep(0.01)
+
+ num_input_images = 2
+ loaded_dict['conv1.weight'] = paddle.concat(
+ [loaded_dict['conv1.weight']] * num_input_images, 1) / num_input_images
+ total_len = len(loaded_dict.items())
+ with tqdm(total=total_len,
+ position=1,
+ bar_format='{desc}',
+ desc="Loading weights") as desc:
+ for name, value in tqdm(loaded_dict.items(),
+ total=total_len,
+ position=0):
+ name = str('encoder.' + name)
+ if name in pose_encoder_dict:
+ pose_encoder_dict[name] = value
+                desc.set_description('Loading %s' % name)
+ time.sleep(0.01)
+ ret_str = "loading {:<20d} weights completed.".format(
+ len(model.state_dict()))
+ desc.set_description(ret_str)
+ return encoder_dict, pose_encoder_dict
+
+
+# XXX(shipping): may need to load N times because different cards have different params.
+@main_only
+def load_ckpt(model, weight_path, **kargs):
+ """
+ 1. Load pre-trained model parameters
+ 2. Extract and convert from the pre-trained model to the parameters
+ required by the existing model
+ 3. Load the converted parameters of the existing model
+ """
+ #model.set_state_dict(state_dict)
+
+ if not osp.isfile(weight_path):
+ raise IOError(f'{weight_path} is not a checkpoint file')
+ #state_dicts = load(weight_path)
+
+ logger = get_logger("paddlevideo")
+ state_dicts = paddle.load(weight_path)
+ if 'ResnetEncoder' in str(model):
+ encoder_dict, pose_encoder_dict = pretrain_resnet18_param_trans(
+ model, state_dicts)
+ model.encoder.load_dict(encoder_dict)
+ model.pose_encoder.load_dict(pose_encoder_dict)
+ tmp = model.state_dict()
+ elif "VisionTransformer" in str(model): # For TimeSformer case
+ tmp = pretrain_vit_param_trans(model, state_dicts, kargs['num_patches'],
+ kargs['num_seg'],
+ kargs['attention_type'])
+ elif 'SwinTransformer3D' in str(model):
+ tmp = pretrain_swin_param_trans(model, state_dicts)
+ else:
+ tmp = {}
+ total_len = len(model.state_dict())
+ with tqdm(total=total_len,
+ position=1,
+ bar_format='{desc}',
+ desc="Loading weights") as desc:
+ for item in tqdm(model.state_dict(), total=total_len, position=0):
+ name = item
+ desc.set_description('Loading %s' % name)
+            if name not in state_dicts:  # checkpoint keys may carry a 'backbone.' prefix
+ if str('backbone.' + name) in state_dicts:
+ tmp[name] = state_dicts['backbone.' + name]
+            else:  # the key matches the checkpoint directly
+ tmp[name] = state_dicts[name]
+ time.sleep(0.01)
+ ret_str = "loading {:<20d} weights completed.".format(
+ len(model.state_dict()))
+ desc.set_description(ret_str)
+ model.set_state_dict(tmp)
+
+
+def mkdir(dir):
+ if not os.path.exists(dir):
+ # avoid error when train with multiple gpus
+ try:
+ os.makedirs(dir)
+        except Exception:
+ pass
+
+
+"""
+def save(state_dicts, file_name):
+ def convert(state_dict):
+ model_dict = {}
+
+ for k, v in state_dict.items():
+ if isinstance(
+ v,
+ (paddle.fluid.framework.Variable, paddle.fluid.core.VarBase)):
+ model_dict[k] = v.numpy()
+ else:
+ model_dict[k] = v
+
+ return model_dict
+
+ final_dict = {}
+ for k, v in state_dicts.items():
+ if isinstance(
+ v,
+ (paddle.fluid.framework.Variable, paddle.fluid.core.VarBase)):
+ final_dict = convert(state_dicts)
+ break
+ elif isinstance(v, dict):
+ final_dict[k] = convert(v)
+ else:
+ final_dict[k] = v
+
+ with open(file_name, 'wb') as f:
+ pickle.dump(final_dict, f, protocol=2)
+"""
+
+
+@main_only
+def save(obj, path):
+ paddle.save(obj, path)
+
+
+def load(file_name):
+ if not osp.isfile(file_name):
+ raise IOError(f'{file_name} not exist')
+ return paddle.load(file_name)
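+
+
+# Hedged usage sketch (the checkpoint path and kwargs are placeholders): load_ckpt
+# dispatches on the model's class name, so the extra kwargs are only needed for
+# the VisionTransformer/TimeSformer branch.
+#
+#     load_ckpt(model, "data/ViT_base_patch16_224_pretrained.pdparams",
+#               num_patches=196, num_seg=8, attention_type='divided_space_time')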
diff --git a/paddlevideo/version.py b/paddlevideo/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5b7f481f4afac92604a6b9f036eb93510069556
--- /dev/null
+++ b/paddlevideo/version.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ["paddlevideo_version"]
+paddlevideo_version = "0.0.1"
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0dbc253ea2e03081455670a19e5e0e75bfab4aa1
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+numpy
+pandas
+tqdm
+PyYAML>=5.1
+opencv-python==4.2.0.32
+decord==0.4.2
+av==8.0.3
+scipy==1.6.3
+scikit-image
diff --git a/run.sh b/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0f4fd0ec56c98a10bdb16f912226ae5e736c8756
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,86 @@
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+#export FLAGS_conv_workspace_size_limit=800 #MB
+#export FLAGS_cudnn_exhaustive_search=1
+#export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+
+start_time=$(date +%s)
+
+# run ava training
+# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=logdir.ava_part main.py --validate -w paddle.init_param.pdparams -c configs/detection/ava/ava_part.yaml
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=logdir.ava_all.1203 main.py --validate -w paddle.init_param.pdparams -c configs/detection/ava/ava_all.yaml
+
+# run adds training
+# python3.7 main.py --validate -c configs/estimation/adds/adds.yaml --seed 20
+
+# run tsm training
+# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_frames.yaml
+
+# run tsm amp training
+# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames.yaml
+
+# run tsm amp training, nhwc
+# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames_nhwc.yaml
+
+# run tsn training
+# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsn main.py --validate -c configs/recognition/tsn/tsn_k400_frames.yaml
+
+# run video-swin-transformer training
+# python3.7 -u -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin main.py --amp --validate -c configs/recognition/videoswin/videoswin_k400_videos.yaml
+
+# run slowfast training
+# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_slowfast main.py --validate -c configs/recognition/slowfast/slowfast.yaml
+
+# run slowfast multi-grid training
+# python3.7 -B -m paddle.distributed.launch --selected_gpus="0,1,2,3,4,5,6,7" --log_dir=log-slowfast main.py --validate --multigrid -c configs/recognition/slowfast/slowfast_multigrid.yaml
+
+# run bmn training
+# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_bmn main.py --validate -c configs/localization/bmn.yaml
+
+# run attention_lstm training
+# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_attetion_lstm main.py --validate -c configs/recognition/attention_lstm/attention_lstm_youtube-8m.yaml
+
+# run pp-tsm training
+python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml
+
+# run pp-tsn training
+# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --validate -c configs/recognition/pptsn/pptsn_k400_frames.yaml
+
+# run timesformer training
+# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --validate -c configs/recognition/timesformer/timesformer_k400_videos.yaml
+
+# run pp-timesformer training
+# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --validate -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml
+
+# run st-gcn training
+# python3.7 main.py -c configs/recognition/stgcn/stgcn_fsd.yaml
+
+# run agcn training
+# python3.7 main.py -c configs/recognition/agcn/agcn_fsd.yaml
+
+# run actbert training
+# python3.7 main.py --validate -c configs/multimodal/actbert/actbert.yaml
+
+# run tsn dali training
+# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsn main.py --train_dali -c configs/recognition/tsn/tsn_dali.yaml
+
+
+# test.sh
+# `example` is used as a placeholder; replace it with the real model name.
+# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_test main.py --test -c configs/example.yaml -w "output/example/example_best.pdparams"
+
+# NOTE: run bmn test, only support single card, bs=1
+# python3.7 main.py --test -c configs/localization/bmn.yaml -w output/BMN/BMN_epoch_00010.pdparams -o DATASET.batch_size=1
+
+# export_models script
+# `example` is used as a placeholder; replace it with the real model name.
+# python3.7 tools/export_model.py -c configs/example.yaml -p output/example/example_best.pdparams -o ./inference
+
+# predict script
+# `example` is used as a placeholder; replace it with the real model name.
+# python3.7 tools/predict.py -v example.avi --model_file "./inference/example.pdmodel" --params_file "./inference/example.pdiparams" --enable_benchmark=False --model="example" --num_seg=8
+
+end_time=$(date +%s)
+cost_time=$(( end_time - start_time ))
+echo "Time to train is $(($cost_time/60))min $(($cost_time%60))s"
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5f3e85ffcebdec9339ff32c26dfa2870c76926a
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from setuptools import setup
+from io import open
+
+with open('requirements.txt', encoding="utf-8-sig") as f:
+ requirements = f.readlines()
+
+def readme():
+ with open('docs/en/whl_en.md', encoding="utf-8-sig") as f:
+ README = f.read()
+ return README
+
+
+setup(
+ name='paddlevideo', #name of .whl file
+ packages=['ppvideo'], #install package name
+ package_dir={'ppvideo': ''},
+ include_package_data=True, #Accept all data files and directories matched by MANIFEST.in
+ install_requires=requirements,
+ entry_points={"console_scripts": ["ppvideo= ppvideo.tools.paddlevideo_clas:main"]},
+ version='0.0.1',
+ license='Apache License 2.0',
+ description='Awesome Video toolkits based on PaddlePaddle ',
+ long_description=readme(),
+ long_description_content_type='text/markdown',
+ url='https://github.com/PaddlePaddle/PaddleVideo',
+ download_url='https://github.com/PaddlePaddle/PaddleVideo.git',
+ keywords=[
+ 'A treasure chest for video understanding powered by PaddlePaddle.'
+ ],
+ classifiers=[
+ 'Intended Audience :: Developers', 'Operating System :: OS Independent',
+ 'Natural Language :: Chinese (Simplified)',
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.2',
+ 'Programming Language :: Python :: 3.3',
+ 'Programming Language :: Python :: 3.4',
+ 'Programming Language :: Python :: 3.5',
+ 'Programming Language :: Python :: 3.6',
+ 'Programming Language :: Python :: 3.7', 'Topic :: Utilities'
+ ],)
\ No newline at end of file
diff --git a/test_tipc/README.md b/test_tipc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e80c6e9c562035e1ad3ba7b8e76195868de84e32
--- /dev/null
+++ b/test_tipc/README.md
@@ -0,0 +1,122 @@
+
+# PaddlePaddle Training and Inference Pipeline Certification (TIPC)
+
+## 1. Introduction
+
+In addition to basic model training and prediction, PaddlePaddle provides high-performance inference and deployment tools for multiple devices and platforms. This document provides the Training and Inference Pipeline Certification (TIPC) information and test tools for all models in PaddleVideo, so users can check which training/inference/deployment chains each model supports and run one-click tests.
+
+
+

+
+
+## 2. Coverage Summary
+
+The supported chains are summarized below. A filled-in entry means the feature can be tested with this tool in one click; an empty entry means support is in progress.
+
+**Field descriptions:**
+- Basic training and inference: model training and Paddle Inference Python prediction.
+- More training modes: multi-machine multi-GPU (TODO) and mixed precision.
+- Model compression (TODO): pruning, offline/online quantization, and distillation.
+- Other inference deployments: Paddle Inference C++ prediction, Paddle Serving deployment (TODO), Paddle-Lite deployment (TODO), etc.
+
+For more detailed support information on inference acceleration features such as MKL-DNN and TensorRT, see the [more tutorials](#more) section of each test tool.
+
+| Algorithm | Model name | Model type | Basic training & inference | More training modes | Model compression | Other inference deployments |
+| :--- | :--- | :----: | :--------: | :---- | :---- | :---- |
+| PP-TSM | pptsm_k400_frames_uniform | Action recognition | Supported | Mixed precision | - | Paddle Inference: C++ |
+| PP-TSN | pptsn_k400_videos | Action recognition | Supported | Mixed precision | - | Paddle Inference: C++ |
+| AGCN | agcn_fsd | Action recognition | Supported | Mixed precision | - | - |
+| STGCN | stgcn_fsd | Action recognition | Supported | Mixed precision | - | - |
+| TimeSformer | timesformer_k400_videos | Action recognition | Supported | Mixed precision | - | - |
+| SlowFast | slowfast | Action recognition | Supported | Mixed precision | - | - |
+| TSM | tsm_k400_frames | Action recognition | Supported | Mixed precision | - | - |
+| TSN | tsn_k400_frames | Action recognition | Supported | Mixed precision | - | - |
+| AttentionLSTM | attention_lstm_youtube8m | Action recognition | Supported | Mixed precision | - | - |
+| BMN | bmn | Temporal action localization | Supported | Mixed precision | - | - |
+
+
+
+## 3. Test Tool Overview
+### Directory layout
+
+```shell
+test_tipc/
+├── configs/                            # configuration files
+│   ├── PP-TSM/
+│   │   ├── train_infer_python.txt      # config for PP-TSM python training + inference on Linux (basic chain)
+│   │   └── train_amp_infer_python.txt  # config for PP-TSM python training + inference on Linux (mixed-precision chain)
+│   ├── PP-TSN/
+│   │   ├── train_infer_python.txt      # config for PP-TSN python training + inference on Linux (basic chain)
+│   │   └── train_amp_infer_python.txt  # config for PP-TSN python training + inference on Linux (mixed-precision chain)
+│   ├── ...
+│   └── ...
+├── results/                            # pre-saved prediction results used as ground truth for accuracy comparison
+│   ├── PP-TSM/
+│   │   ├── python_ppvideo_PP-TSM_results_fp16.txt  # pre-saved fp16 results of PP-TSM python inference
+│   │   └── python_ppvideo_PP-TSM_results_fp32.txt  # pre-saved fp32 results of PP-TSM python inference
+│   ├── PP-TSN/
+│   │   ├── python_ppvideo_PP-TSN_results_fp16.txt  # pre-saved fp16 results of PP-TSN python inference
+│   │   └── python_ppvideo_PP-TSN_results_fp32.txt  # pre-saved fp32 results of PP-TSN python inference
+│   ├── PP-TSN_CPP/
+│   │   ├── cpp_ppvideo_PP-TSN_results_fp16.txt     # pre-saved fp16 results of PP-TSN C++ inference
+│   │   └── cpp_ppvideo_PP-TSN_results_fp32.txt     # pre-saved fp32 results of PP-TSN C++ inference
+│   ├── ...
+│   └── ...
+├── prepare.sh                          # downloads the data and models required to run test_*.sh
+├── docs/                               # detailed documentation for each TIPC feature
+├── test_train_inference_python.sh      # main script for testing python training and inference
+├── test_inference_cpp.sh               # main script for testing C++ inference
+├── compare_results.py                  # checks whether predictions in the logs match the pre-saved results within the allowed tolerance
+└── README.md                           # this document
+```
+
+### Test workflow overview
+
+With this tool you can check which features are supported and whether the prediction results match the expected values. The workflow is summarized as follows:
+
+
+
+
+
+
+1. Run prepare.sh to download the data and models required for testing;
+2. Run the test script `test_*.sh` for the feature under test; it produces logs that show whether each configuration ran successfully;
+3. Use `compare_results.py` to compare the predictions in the logs with the pre-saved results under the results directory and check that the prediction accuracy is within the expected tolerance.
+
+Testing a single feature takes only two commands. **To test a different model or feature, just swap in the corresponding configuration file.** The command format is:
+```shell
+# Step 1: prepare the data
+# Format: bash + script + arg 1: config file + arg 2: mode
+bash test_tipc/prepare.sh configs/[model_name]/[params_file_name] [Mode]
+
+# Step 2: run the test
+# Format: bash + script + arg 1: config file + arg 2: mode
+bash test_tipc/test_train_inference_python.sh configs/[model_name]/[params_file_name] [Mode]
+```
+
+For example, to test the `lite_train_lite_infer` mode of the basic training-and-inference chain, run:
+```shell
+# prepare the data
+bash test_tipc/prepare.sh ./test_tipc/configs/PP-TSM/train_infer_python.txt 'lite_train_lite_infer'
+# run the test
+bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/PP-TSM/train_infer_python.txt 'lite_train_lite_infer'
+```
+For more details about this example, see the [basic training and inference tutorial](./docs/test_train_inference_python.md).
+
+### Configuration file naming convention
+All configuration files used for model testing are stored under the `configs` directory and are named according to the following rules:
+
+1. The basic training-and-inference config is simply named `train_infer_python.txt`, meaning **single-machine training without mixed precision plus python inference on Linux**. Its full name would be `train_linux_gpu_normal_normal_infer_python_linux_gpu_cpu.txt`; because this config is used so frequently, the name has been shortened.
+
+2. Other configs that include training follow the pattern `train_<training hardware (linux_gpu/linux_dcu/...)>_<multi-machine (fleet/normal)>_<mixed precision (amp/normal)>_<inference mode (infer/lite/serving/js)>_<language (cpp/python/java)>_<inference hardware (linux_gpu/mac/jetson/opencl_arm_gpu/...)>.txt`. For example, the multi-machine multi-GPU + mixed-precision chain on Linux GPU corresponds to `train_linux_gpu_fleet_amp_infer_python_linux_gpu_cpu.txt`, and basic training and inference on Linux DCU corresponds to `train_linux_dcu_normal_normal_infer_python_linux_dcu.txt`.
+
+3. Inference-only configs (e.g. serving or lite) follow the pattern `model_<training hardware (linux_gpu/linux_dcu/...)>_<multi-machine (fleet/normal)>_<mixed precision (amp/normal)>_<(infer/lite/serving/js)>_<language (cpp/python/java)>_<inference hardware (linux_gpu/mac/jetson/opencl_arm_gpu/...)>.txt`. Compared with rule 2, only the first field changes from train to model; the model is downloaded directly at test time, and "training hardware" here indicates the environment in which the tested model was trained.
+
+**With this naming convention, the configuration file for any scenario and feature to be tested can be located directly from the sub-directory name and the file name.** A minimal usage sketch is shown below.
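+
+As a quick illustration (a sketch only — it assumes the mixed-precision config listed in the directory tree above is driven by the same two commands as the basic example earlier), the PP-TSM AMP config name can be decoded and run as follows:
+
+```shell
+# decoding the (shortened) config name train_amp_infer_python.txt:
+#   train        -> the chain includes training
+#   amp          -> mixed-precision training
+#   infer_python -> Paddle Inference prediction via Python
+
+# prepare the data and models for the lite_train_lite_infer mode
+bash test_tipc/prepare.sh ./test_tipc/configs/PP-TSM/train_amp_infer_python.txt 'lite_train_lite_infer'
+# run the test with the same config and mode
+bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/PP-TSM/train_amp_infer_python.txt 'lite_train_lite_infer'
+```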
+
+
+
+## 4. Start Testing
+The feature tests cover training-related options such as mixed precision, pruning, and quantization, as well as inference-related options such as MKL-DNN and TensorRT. See the links below for more details and usage tutorials:
+- [test_train_inference_python usage](docs/test_train_inference_python.md): tests basic Python-based model training, evaluation, and inference.
+- [test_amp_train_inference_python usage](docs/test_train_amp_inference_python.md): tests **mixed-precision** Python-based model training, evaluation, and inference.
+- [test_inference_cpp usage](docs/test_inference_cpp.md): tests C++-based model inference.
diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..74d77405deab0a92961ada491c9dd92dd7996120
--- /dev/null
+++ b/test_tipc/benchmark_train.sh
@@ -0,0 +1,286 @@
+#!/bin/bash
+source test_tipc/common_func.sh
+
+# set env
+python=python
+export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3`
+export model_commit=$(git log|head -n1|awk '{print $2}')
+export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`)
+export frame_version=${str_tmp%%.post*}
+export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`)
+
+# BENCHMARK_ROOT='.' # only for self-test
+
+# run benchmark script
+# Usage:
+#   bash test_tipc/benchmark_train.sh config.txt benchmark_train params
+# or
+#   bash test_tipc/benchmark_train.sh config.txt benchmark_train
+
+function func_parser_params(){
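+ # Split a "key=value" string on "=" and echo the value part.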
+ strs=$1
+ IFS="="
+ array=(${strs})
+ tmp=${array[1]}
+ echo ${tmp}
+}
+
+function func_sed_params(){
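+ # Overwrite line $2 of file $1 with "key:$3", where key is whatever precedes
+ # the first ":" on that line (only the value part is replaced).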
+ filename=$1
+ line=$2
+ param_value=$3
+ params=`sed -n "${line}p" $filename`
+ IFS=":"
+ array=(${params})
+ key=${array[0]}
+ value=${array[1]}
+ if [[ $value =~ 'benchmark_train' ]];then
+ IFS='='
+ _val=(${value})
+ param_value="${param_value}"
+ fi
+ new_params="${key}:${param_value}"
+ IFS=";"
+ cmd="sed -i '${line}s/.*/${new_params}/' '${filename}'"
+ eval $cmd
+}
+
+function set_gpu_id(){
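+ # Convert a device descriptor such as N1C4 (M machines, P GPUs in total)
+ # into the per-node GPU id list, e.g. N1C4 -> "0,1,2,3", N2C8 -> "0,1,2,3".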
+ string=$1
+ _str=${string:1:6}
+ IFS="C"
+ arr=(${_str})
+ M=${arr[0]}
+ P=${arr[1]}
+ gn=`expr $P - 1`
+ gpu_num=`expr $gn / $M`
+ seq=`seq -s "," 0 $gpu_num`
+ echo $seq
+}
+
+function get_repo_name(){
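+ # Echo the basename of the current working directory; it is used as the
+ # repository name in benchmark log file names.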
+ IFS=";"
+ cur_dir=$(pwd)
+ IFS="/"
+ arr=(${cur_dir})
+ echo ${arr[-1]}
+}
+
+FILENAME=$1
+# copy FILENAME as new
+new_filename="./test_tipc/benchmark_train.txt"
+cmd=`yes|cp $FILENAME $new_filename`
+FILENAME=$new_filename
+# MODE must be one of ['benchmark_train']
+MODE=$2
+PARAMS=$3
+# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_null_DP_N1C1
+IFS=$'\n'
+# parser params from train_benchmark.txt
+dataline=`cat $FILENAME`
+# parser params
+IFS=$'\n'
+lines=(${dataline})
+model_name=$(func_parser_value "${lines[1]}")
+
+# get the line number of the 'train_benchmark_params' section
+line_num=`grep -n "train_benchmark_params" $FILENAME | cut -d ":" -f 1`
+# for train log parser
+batch_size=$(func_parser_value "${lines[line_num]}")
+line_num=`expr $line_num + 1`
+fp_items=$(func_parser_value "${lines[line_num]}")
+line_num=`expr $line_num + 1`
+epoch=$(func_parser_value "${lines[line_num]}")
+
+line_num=`expr $line_num + 1`
+profile_option_key=$(func_parser_key "${lines[line_num]}")
+profile_option_params=$(func_parser_value "${lines[line_num]}")
+profile_option="${profile_option_key}:${profile_option_params}"
+
+line_num=`expr $line_num + 1`
+flags_value=$(func_parser_value "${lines[line_num]}")
+# set flags
+IFS=";"
+flags_list=(${flags_value})
+for _flag in ${flags_list[*]}; do
+ cmd="export ${_flag}"
+ eval $cmd
+done
+
+# set log_name
+repo_name=$(get_repo_name )
+SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log
+mkdir -p "${SAVE_LOG}/benchmark_log/"
+status_log="${SAVE_LOG}/benchmark_log/results.log"
+
+# Line numbers in the copied config file whose values are replaced below.
+line_python=3
+line_gpuid=4
+line_precision=6
+line_epoch=7
+line_batchsize=9
+line_profile=12
+line_eval_py=24
+line_eval_py_2=25
+line_export_py=38
+line_export_py_2=28
+line_export_py_3=30
+line_norm_train=16
+
+func_sed_params "$FILENAME" "${line_eval_py}" "null"
+func_sed_params "$FILENAME" "${line_eval_py_2}" "null"
+func_sed_params "$FILENAME" "${line_export_py}" "null"
+func_sed_params "$FILENAME" "${line_export_py_2}" "null"
+func_sed_params "$FILENAME" "${line_export_py_3}" "null"
+func_sed_params "$FILENAME" "${line_python}" "$python"
+
+# append --max_iters=30 and -o log_interval=1 to the training command so the run produces enough log data
+set_log_interval_cmd="sed -i '${line_norm_train}s/.*/& --max_iters=30 -o log_interval=1/' '${FILENAME}'"
+eval $set_log_interval_cmd
+
+# remove --validate; benchmarking does not need validation
+remove_validate_cmd="sed -i '${line_norm_train}s/--validate//' '${FILENAME}'"
+eval $remove_validate_cmd
+
+# if params
+if [ -z "$PARAMS" ] ;then
+    # PARAMS was not provided; fall back to the defaults below.
+ IFS="|"
+ batch_size_list=(${batch_size})
+ fp_items_list=(${fp_items})
+ device_num_list=(N1C4)
+ run_mode="DP"
+else
+ # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num}
+ IFS="_"
+ params_list=(${PARAMS})
+ model_type=${params_list[0]}
+ batch_size=${params_list[1]}
+ batch_size=`echo ${batch_size} | tr -cd "[0-9]" `
+ precision=${params_list[2]}
+ run_mode=${params_list[3]}
+ device_num=${params_list[4]}
+ IFS=";"
+
+ if [ ${precision} = "null" ];then
+ precision="fp32"
+ fi
+
+ fp_items_list=($precision)
+ batch_size_list=($batch_size)
+ device_num_list=($device_num)
+fi
+
+log_interval='--log_interval 1'
+IFS="|"
+for batch_size in ${batch_size_list[*]}; do
+ for precision in ${fp_items_list[*]}; do
+ for device_num in ${device_num_list[*]}; do
+ # sed batchsize and precision
+ func_sed_params "$FILENAME" "${line_precision}" "$precision"
+ func_sed_params "$FILENAME" "${line_batchsize}" "$batch_size"
+ func_sed_params "$FILENAME" "${line_epoch}" "$epoch"
+ gpu_id=$(set_gpu_id $device_num)
+
+ if [ ${#gpu_id} -le 1 ];then
+ log_path="$SAVE_LOG/profiling_log"
+ mkdir -p $log_path
+ log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_profiling"
+ func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id
+ # set profile_option params
+ tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"`
+
+ # for models which need to accumulate gradient.
+ if [[ ${model_name} =~ "TimeSformer" ]]; then
+ global_bs=`expr ${batch_size} \* ${device_num:3:4} \* 8`
+ modify_global_bs_cmd="sed -i '${line_norm_train}s/.*/& -o GRADIENT_ACCUMULATION.global_batch_size=${global_bs}/' '${FILENAME}'"
+ eval $modify_global_bs_cmd
+ fi
+
+ # run test_train_inference_python.sh
+ cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
+ echo $cmd
+ eval $cmd
+ eval "cat ${log_path}/${log_name}"
+
+ # without profile
+ log_path="$SAVE_LOG/train_log"
+ speed_log_path="$SAVE_LOG/index"
+ mkdir -p $log_path
+ mkdir -p $speed_log_path
+ log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log"
+ speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed"
+ func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null
+
+ cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
+ echo $cmd
+ job_bt=`date '+%Y%m%d%H%M%S'`
+ eval $cmd
+ job_et=`date '+%Y%m%d%H%M%S'`
+ export model_run_time=$((${job_et}-${job_bt}))
+ eval "cat ${log_path}/${log_name}"
+
+ # parser log
+ _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
+ cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
+ --speed_log_file '${speed_log_path}/${speed_log_name}' \
+ --model_name ${_model_name} \
+ --base_batch_size ${batch_size} \
+ --run_mode ${run_mode} \
+ --fp_item ${precision} \
+ --keyword ips: \
+ --skip_steps 5 \
+ --device_num ${device_num} \
+ --speed_unit instance/sec \
+ --convergence_key loss: "
+ echo $cmd
+ eval $cmd
+ last_status=${PIPESTATUS[0]}
+ status_check $last_status "${cmd}" "${status_log}"
+ else
+ IFS=";"
+ unset CUDA_VISIBLE_DEVICES
+ log_path="$SAVE_LOG/train_log"
+ speed_log_path="$SAVE_LOG/index"
+ mkdir -p $log_path
+ mkdir -p $speed_log_path
+ log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log"
+ speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed"
+ func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id
+ func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null
+
+ # for models which need to accumulate gradient.
+ if [[ ${model_name} =~ "TimeSformer" ]]; then
+ global_bs=`expr ${batch_size} \* ${device_num:3:4} \* 8`
+ modify_global_bs_cmd="sed -i '${line_norm_train}s/.*/& -o GRADIENT_ACCUMULATION.global_batch_size=${global_bs}/' '${FILENAME}'"
+ eval $modify_global_bs_cmd
+ fi
+
+ cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
+ echo $cmd
+ job_bt=`date '+%Y%m%d%H%M%S'`
+ eval $cmd
+ job_et=`date '+%Y%m%d%H%M%S'`
+ export model_run_time=$((${job_et}-${job_bt}))
+ eval "cat ${log_path}/${log_name}"
+ # parser log
+ _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
+ cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
+ --speed_log_file '${speed_log_path}/${speed_log_name}' \
+ --model_name ${_model_name} \
+ --base_batch_size ${batch_size} \
+ --run_mode ${run_mode} \
+ --fp_item ${precision} \
+ --keyword ips: \
+ --skip_steps 5 \
+ --device_num ${device_num} \
+ --speed_unit instance/sec \
+ --convergence_key loss: "
+ echo $cmd
+ eval $cmd
+ last_status=${PIPESTATUS[0]}
+ status_check $last_status "${cmd}" "${status_log}"
+ fi
+ done
+ done
+done
diff --git a/test_tipc/common_func.sh b/test_tipc/common_func.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2c039e9642e8c125d1e73981e1d97e723b15a9ea
--- /dev/null
+++ b/test_tipc/common_func.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+function func_parser_key(){
+ strs=$1
+ IFS=":"
+ array=(${strs})
+ tmp=${array[0]}
+ echo ${tmp}
+}
+
+function func_parser_value(){
+ strs=$1
+ IFS=":"
+ array=(${strs})
+ tmp=${array[1]}
+ echo ${tmp}
+}
+
+function func_set_params(){
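+ # Echo "key=value", or a blank string when the key is "null" or the value is empty/null.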
+ key=$1
+ value=$2
+ if [ ${key}x = "null"x ];then
+ echo " "
+ elif [[ ${value} = "null" ]] || [[ ${value} = " " ]] || [ ${#value} -le 0 ];then
+ echo " "
+ else
+ echo "${key}=${value}"
+ fi
+}
+
+function func_parser_params(){
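+ # For a line of the form "key:mode1=val1|mode2=val2|...", echo the value
+ # whose mode matches the global $MODE variable.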
+ strs=$1
+ IFS=":"
+ array=(${strs})
+ key=${array[0]}
+ tmp=${array[1]}
+ IFS="|"
+ res=""
+ for _params in ${tmp[*]}; do
+ IFS="="
+ array=(${_params})
+ mode=${array[0]}
+ value=${array[1]}
+ if [[ ${mode} = ${MODE} ]]; then
+ IFS="|"
+ #echo $(func_set_params "${mode}" "${value}")
+ echo $value
+ break
+ fi
+ IFS="|"
+ done
+ echo ${res}
+}
+
+function status_check(){
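+ # Append a colored success/failure message for the given command, based on
+ # its exit code, to the run log.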
+ last_status=$1 # the exit code
+ run_command=$2
+ run_log=$3
+ if [ $last_status -eq 0 ]; then
+ echo -e "\033[33m Run successfully with command - ${run_command}! \033[0m" | tee -a ${run_log}
+ else
+ echo -e "\033[33m Run failed with command - ${run_command}! \033[0m" | tee -a ${run_log}
+ fi
+}
diff --git a/test_tipc/compare_results.py b/test_tipc/compare_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd8308dc9a00634c26abc0324cec12721f2da69c
--- /dev/null
+++ b/test_tipc/compare_results.py
@@ -0,0 +1,171 @@
+import numpy as np
+import os
+import subprocess
+import json
+import argparse
+import glob
+
+
+def init_args():
+ parser = argparse.ArgumentParser()
+ # params for testing assert allclose
+ parser.add_argument("--atol", type=float, default=1e-3)
+ parser.add_argument("--rtol", type=float, default=1e-3)
+ parser.add_argument("--gt_file", type=str, default="")
+ parser.add_argument("--log_file", type=str, default="")
+ parser.add_argument("--precision", type=str, default="fp32")
+ return parser
+
+
+def parse_args():
+ parser = init_args()
+ return parser.parse_args()
+
+
+def run_shell_command(cmd):
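+ """Run a shell command and return its stdout as text, or None if it fails."""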
+ p = subprocess.Popen(cmd,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ shell=True)
+ out, err = p.communicate()
+
+ if p.returncode == 0:
+ return out.decode('utf-8')
+ else:
+ return None
+
+
+def parser_results_from_log_by_name(log_path, names_list):
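+ """Extract metric values from an inference log.
+
+ For python inference logs, every line containing one of the requested names
+ contributes its last whitespace-separated token (float if it contains '.',
+ otherwise int). For cpp inference logs, a line containing every requested
+ name followed by ':' yields the predicted class (third token) and score
+ (last token).
+ """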
+ if not os.path.exists(log_path):
+ raise ValueError("The log file {} does not exists!".format(log_path))
+
+ if names_list is None or len(names_list) < 1:
+ return []
+
+ parser_results = {}
+ with open(log_path, 'r') as f:
+ lines = f.read().splitlines()
+ if 'python_infer' in log_path: # parse python inference
+ for line in lines:
+ split_items = line.replace('\t', ' ')
+ split_items = split_items.split(' ')
+ split_items = [item for item in split_items if len(item) > 0]
+ for name in names_list:
+ if name in line:
+ if '.' in split_items[-1]:
+ parser_results[name] = float(split_items[-1])
+ else:
+ parser_results[name] = int(split_items[-1])
+ else: # parse cpp inference
+ for line in lines:
+ split_items = line.replace('\t', ' ')
+ split_items = split_items.split(' ')
+ split_items = [item for item in split_items if len(item) > 0]
+ if all([(name + ':') in split_items for name in names_list]):
+ # print(split_items)
+ parser_results['class'] = int(split_items[2])
+ parser_results['score'] = float(split_items[-1])
+ return parser_results
+
+
+def load_gt_from_file(gt_file):
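+ """Parse a ground-truth result file into a dict of metric name -> value.
+
+ Handles recognition outputs ('top-1 class' / 'top-1 score'), BMN-style
+ proposal dicts containing 'score' and 'segment', and lines containing both
+ 'class:' and 'score:'.
+ """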
+ if not os.path.exists(gt_file):
+ raise ValueError("The log file {} does not exists!".format(gt_file))
+ with open(gt_file, 'r') as f:
+ data = f.readlines()
+ parser_gt = {}
+ for line in data:
+ if 'top-1 class' in line:
+ split_items = line.replace('\t', ' ')
+ split_items = split_items.split(' ')
+ split_items = [item for item in split_items if len(item) > 0]
+ parser_gt['top-1 class'] = int(split_items[-1])
+ elif 'top-1 score' in line:
+ split_items = line.replace('\t', ' ')
+ split_items = split_items.split(' ')
+ split_items = [item for item in split_items if len(item) > 0]
+ parser_gt['top-1 score'] = float(split_items[-1])
+ elif "score" in line and 'segment' in line:
+ location_dict = eval(line)
+ parser_gt[f"score_{len(parser_gt)}"] = location_dict['score']
+ parser_gt[f"segment_{len(parser_gt)}"] = location_dict['segment']
+ elif "class:" in line and "score:" in line:
+ split_items = line.replace('\t', ' ')
+ split_items = split_items.split(' ')
+ split_items = [item for item in split_items if len(item) > 0]
+ parser_gt['class'] = int(split_items[2])
+ parser_gt['score'] = float(split_items[-1])
+ return parser_gt
+
+
+def load_gt_from_txts(gt_file):
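+ """Load every ground-truth file matching the glob pattern and group the
+ parsed results by precision ('fp32'/'fp16'/'int8') inferred from the file
+ name."""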
+ gt_list = glob.glob(gt_file)
+ gt_collection = {}
+ for gt_f in gt_list:
+ gt_dict = load_gt_from_file(gt_f)
+ basename = os.path.basename(gt_f)
+ if "fp32" in basename:
+ gt_collection["fp32"] = [gt_dict, gt_f]
+ elif "fp16" in basename:
+ gt_collection["fp16"] = [gt_dict, gt_f]
+ elif "int8" in basename:
+ gt_collection["int8"] = [gt_dict, gt_f]
+ else:
+ continue
+ return gt_collection
+
+
+def collect_predict_from_logs(log_path, key_list):
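+ """Parse every log file matching the glob pattern and return a dict keyed
+ by log file basename."""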
+ log_list = glob.glob(log_path)
+ pred_collection = {}
+ for log_f in log_list:
+ pred_dict = parser_results_from_log_by_name(log_f, key_list)
+ key = os.path.basename(log_f)
+ pred_collection[key] = pred_dict
+
+ return pred_collection
+
+
+def testing_assert_allclose(dict_x, dict_y, atol=1e-7, rtol=1e-7):
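+ """Assert that every value in dict_x is numerically close to the value
+ stored under the same key in dict_y."""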
+ for k in dict_x:
+ np.testing.assert_allclose(np.array(dict_x[k]),
+ np.array(dict_y[k]),
+ atol=atol,
+ rtol=rtol)
+
+
+if __name__ == "__main__":
+ # Usage example:
+ # test python infer:
+ ## python3.7 test_tipc/compare_results.py --gt_file=./test_tipc/results/PP-TSM/*.txt --log_file=./test_tipc/output/PP-TSM/python_infer_*.log
+ # test cpp infer:
+ ## python3.7 test_tipc/compare_results.py --gt_file=./test_tipc/results/PP-TSM_CPP/*.txt --log_file=./test_tipc/output/PP-TSM_CPP/cpp_infer_*.log
+
+ args = parse_args()
+
+ gt_collection = load_gt_from_txts(args.gt_file)
+ key_list = gt_collection["fp32"][0].keys()
+ pred_collection = collect_predict_from_logs(args.log_file, key_list)
+ for filename in pred_collection.keys():
+ if "fp32" in filename:
+ gt_dict, gt_filename = gt_collection["fp32"]
+ elif "fp16" in filename:
+ gt_dict, gt_filename = gt_collection["fp16"]
+ elif "int8" in filename:
+ gt_dict, gt_filename = gt_collection["int8"]
+ else:
+ continue
+ pred_dict = pred_collection[filename]
+ try:
+ testing_assert_allclose(gt_dict,
+ pred_dict,
+ atol=args.atol,
+ rtol=args.rtol)
+ print(
+ "Assert allclose passed! The results of {} and {} are consistent!"
+ .format(filename, gt_filename))
+ except Exception as E:
+ print(E)
+ raise ValueError(
+ "The results of {} and the results of {} are inconsistent!".
+ format(filename, gt_filename))
diff --git a/test_tipc/configs/AGCN/train_amp_infer_python.txt b/test_tipc/configs/AGCN/train_amp_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0dc3a48c53e01fb861f5e292b1e586763d4925c3
--- /dev/null
+++ b/test_tipc/configs/AGCN/train_amp_infer_python.txt
@@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:AGCN
+python:python3.7
+gpu_list:0
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:null
+null:null
+train_model_name:null
+train_infer_video_dir:null
+null:null
+##
+trainer:amp_train
+norm_train:main.py -c configs/recognition/agcn/agcn_fsd.yaml --seed 1234 -o DATASET.train.file_path="data/fsd10/FSD_train_data.npy" -o DATASET.train.label_path="data/fsd10/FSD_train_label.npy" -o DATASET.test.file_path="data/fsd10/FSD_train_data.npy"
+pact_train:null
+fpgm_train:null
+distill_train:null
+amp_train:main.py --amp --amp_level='O2' -c configs/recognition/agcn/agcn_fsd.yaml --seed 1234 -o DATASET.train.file_path="data/fsd10/FSD_train_data.npy" -o DATASET.train.label_path="data/fsd10/FSD_train_label.npy" -o DATASET.test.file_path="data/fsd10/FSD_train_data.npy"
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/agcn/agcn_fsd.yaml -o DATASET.train.file_path="data/fsd10/FSD_train_data.npy" -o DATASET.train.label_path="data/fsd10/FSD_train_label.npy" -o DATASET.test.file_path="data/fsd10/FSD_train_data.npy"
+-w:./test_tipc/output/AGCN/AGCN_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/AGCN
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/agcn/agcn_fsd.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/AGCN_fsd.pdparams
+infer_export:tools/export_model.py -c configs/recognition/agcn/agcn_fsd.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/agcn/agcn_fsd.yaml
+--use_gpu:True|False
+--enable_mkldnn:False|True
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/fsd10/example_skeleton.npy
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[2, 350, 25, 1]}]
diff --git a/test_tipc/configs/AGCN/train_infer_python.txt b/test_tipc/configs/AGCN/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..53497191b2fc9a9ed6c755ea0518fb26254ba6ae
--- /dev/null
+++ b/test_tipc/configs/AGCN/train_infer_python.txt
@@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:AGCN
+python:python3.7
+gpu_list:0
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:null
+null:null
+train_model_name:null
+train_infer_video_dir:null
+null:null
+##
+trainer:norm_train
+norm_train:main.py -c configs/recognition/agcn/agcn_fsd.yaml --seed 1234 -o DATASET.train.file_path="data/fsd10/FSD_train_data.npy" -o DATASET.train.label_path="data/fsd10/FSD_train_label.npy" -o DATASET.test.file_path="data/fsd10/FSD_train_data.npy"
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/agcn/agcn_fsd.yaml -o DATASET.train.file_path="data/fsd10/FSD_train_data.npy" -o DATASET.train.label_path="data/fsd10/FSD_train_label.npy" -o DATASET.test.file_path="data/fsd10/FSD_train_data.npy"
+-w:./test_tipc/output/AGCN/AGCN_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/AGCN
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/agcn/agcn_fsd.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/AGCN_fsd.pdparams
+infer_export:tools/export_model.py -c configs/recognition/agcn/agcn_fsd.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/agcn/agcn_fsd.yaml
+--use_gpu:True|False
+--enable_mkldnn:False|True
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/fsd10/example_skeleton.npy
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[2, 350, 25, 1]}]
diff --git a/test_tipc/configs/AttentionLSTM/train_amp_infer_python.txt b/test_tipc/configs/AttentionLSTM/train_amp_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..01ff8a23dc9a14d6c5fce92054be3c609fe736bb
--- /dev/null
+++ b/test_tipc/configs/AttentionLSTM/train_amp_infer_python.txt
@@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:AttentionLSTM
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:64
+null:null
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/yt8m/train_small.list' -o DATASET.valid.file_path='data/yt8m/train_small.list' -o DATASET.test.file_path='data/yt8m/train_small.list'
+##
+trainer:amp_train
+norm_train:main.py --validate -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+amp_train:main.py --amp --amp_level='O2' --validate -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml --seed 1234
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml
+-w:./test_tipc/output/AttentionLSTM/AttentionLSTM_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/AttentionLSTM
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/AttentionLSTM_yt8.pdparams
+infer_export:tools/export_model.py -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.pkl
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[512, 1024]},{int32,[1]},{float32,[512, 1024]},{float32,[512, 128]},{int32,[1]},{float32,[512, 128]}]
diff --git a/test_tipc/configs/AttentionLSTM/train_infer_python.txt b/test_tipc/configs/AttentionLSTM/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0c43e5e61e168a6ce61903f8dcb55ff9cbdf0672
--- /dev/null
+++ b/test_tipc/configs/AttentionLSTM/train_infer_python.txt
@@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:AttentionLSTM
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:64
+null:null
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/yt8m/train_small.list' -o DATASET.valid.file_path='data/yt8m/train_small.list' -o DATASET.test.file_path='data/yt8m/train_small.list'
+##
+trainer:norm_train
+norm_train:main.py --validate -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml
+-w:./test_tipc/output/AttentionLSTM/AttentionLSTM_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/AttentionLSTM
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/AttentionLSTM_yt8.pdparams
+infer_export:tools/export_model.py -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.pkl
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[512, 1024]},{int32,[1]},{float32,[512, 1024]},{float32,[512, 128]},{int32,[1]},{float32,[512, 128]}]
diff --git a/test_tipc/configs/BMN/train_amp_infer_python.txt b/test_tipc/configs/BMN/train_amp_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a9c02b86e32c0caa603b33fd2ed5a05f4235961a
--- /dev/null
+++ b/test_tipc/configs/BMN/train_amp_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:BMN
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:16
+-o MODEL.backbone.pretrained:null
+train_model_name:null
+--profiler_options:null
+-o DATASET.train.file_path:null
+##
+trainer:amp_train
+norm_train:main.py --validate -c configs/localization/bmn.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+amp_train:main.py --amp --amp_level='O2' --validate -c configs/localization/bmn.yaml --seed 1234
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/localization/bmn.yaml
+-w:./test_tipc/output/BMN/BMN_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/BMN
+-p:null
+norm_export:tools/export_model.py -c configs/localization/bmn.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/BMN.pdparams
+infer_export:tools/export_model.py -c configs/localization/bmn.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/localization/bmn.yaml
+--use_gpu:True|False
+--enable_mkldnn:False
+--cpu_threads:1|6
+--batch_size:1
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example_feat.list
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================train_benchmark_params==========================
+batch_size:8
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_conv_workspace_size_limit=800
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[400, 100]}]
diff --git a/test_tipc/configs/BMN/train_infer_python.txt b/test_tipc/configs/BMN/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..28b49a716050dc61ae5de4048150a804cb272087
--- /dev/null
+++ b/test_tipc/configs/BMN/train_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:BMN
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:16
+-o MODEL.backbone.pretrained:null
+train_model_name:null
+--profiler_options:null
+-o DATASET.train.file_path:null
+##
+trainer:norm_train
+norm_train:main.py --validate -c configs/localization/bmn.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/localization/bmn.yaml
+-w:./test_tipc/output/BMN/BMN_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/BMN
+-p:null
+norm_export:tools/export_model.py -c configs/localization/bmn.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/BMN.pdparams
+infer_export:tools/export_model.py -c configs/localization/bmn.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/localization/bmn.yaml
+--use_gpu:True|False
+--enable_mkldnn:False
+--cpu_threads:1|6
+--batch_size:1
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example_feat.list
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================train_benchmark_params==========================
+batch_size:8
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_conv_workspace_size_limit=800
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[400, 100]}]
diff --git a/test_tipc/configs/PP-TSM/infer_cpp.txt b/test_tipc/configs/PP-TSM/infer_cpp.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ef65b19e5ecfafe0fa0a8d9a7908262c3cdc1421
--- /dev/null
+++ b/test_tipc/configs/PP-TSM/infer_cpp.txt
@@ -0,0 +1,18 @@
+===========================cpp_infer_params===========================
+model_name:PP-TSM
+use_opencv:True
+infer_model:./inference/ppTSM
+infer_quant:False
+inference:./deploy/cpp_infer/build/ppvideo rec
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--rec_batch_num:1
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--rec_model_dir:
+--video_dir:./deploy/cpp_infer/example_video_dir
+--inference_model_name:ppTSM
+--benchmark:True
+--char_list_file:data/k400/Kinetics-400_label_list.txt
+--num_seg:8
diff --git a/test_tipc/configs/PP-TSM/train_amp_infer_python.txt b/test_tipc/configs/PP-TSM/train_amp_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d8a8c2f4944b4b143d43a39d8f7fe42095493b56
--- /dev/null
+++ b/test_tipc/configs/PP-TSM/train_amp_infer_python.txt
@@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:PP-TSM
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:2
+-o MODEL.backbone.pretrained:'data/ResNet50_vd_ssld_v2_pretrained.pdparams'
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/k400/train_small_frames.list' -o DATASET.valid.file_path='data/k400/val_small_frames.list' -o DATASET.test.file_path='data/k400/val_small_frames.list'
+##
+trainer:amp_train
+norm_train:main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+amp_train:main.py --amp --amp_level='O2' --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml --seed 1234
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml
+-w:./test_tipc/output/ppTSM/ppTSM_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/ppTSM
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/ppTSM_k400_uniform.pdparams
+infer_export:tools/export_model.py -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.avi
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[8, 3, 224, 224]}]
diff --git a/test_tipc/configs/PP-TSM/train_infer_python.txt b/test_tipc/configs/PP-TSM/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..45691f69abf41a1d39e71d24fe0c3989836eece0
--- /dev/null
+++ b/test_tipc/configs/PP-TSM/train_infer_python.txt
@@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:PP-TSM
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:2
+-o MODEL.backbone.pretrained:'data/ResNet50_vd_ssld_v2_pretrained.pdparams'
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/k400/train_small_frames.list' -o DATASET.valid.file_path='data/k400/val_small_frames.list' -o DATASET.test.file_path='data/k400/val_small_frames.list'
+##
+trainer:norm_train
+norm_train:main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml
+-w:./test_tipc/output/ppTSM/ppTSM_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/ppTSM
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/ppTSM_k400_uniform.pdparams
+infer_export:tools/export_model.py -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.avi
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[8, 3, 224, 224]}]
diff --git a/test_tipc/configs/PP-TSN/infer_cpp.txt b/test_tipc/configs/PP-TSN/infer_cpp.txt
new file mode 100644
index 0000000000000000000000000000000000000000..32da79cde4cdc49ab9094d339e2840e61fcc76e1
--- /dev/null
+++ b/test_tipc/configs/PP-TSN/infer_cpp.txt
@@ -0,0 +1,18 @@
+===========================cpp_infer_params===========================
+model_name:PP-TSN
+use_opencv:True
+infer_model:./inference/ppTSN
+infer_quant:False
+inference:./deploy/cpp_infer/build/ppvideo rec
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--rec_batch_num:1
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--rec_model_dir:
+--video_dir:./deploy/cpp_infer/example_video_dir
+--inference_model_name:ppTSN
+--benchmark:True
+--char_list_file:data/k400/Kinetics-400_label_list.txt
+--num_seg:25
diff --git a/test_tipc/configs/PP-TSN/train_amp_infer_python.txt b/test_tipc/configs/PP-TSN/train_amp_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..46a9f4ac49ac34797d76ffd6d415f0cf7f4febce
--- /dev/null
+++ b/test_tipc/configs/PP-TSN/train_amp_infer_python.txt
@@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:PP-TSN
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:2
+-o MODEL.backbone.pretrained:'data/ResNet50_vd_ssld_v2_pretrained.pdparams'
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/k400/train_small_videos.list' -o DATASET.valid.file_path='data/k400/val_small_videos.list' -o DATASET.test.file_path='data/k400/val_small_videos.list'
+##
+trainer:amp_train
+norm_train:main.py --validate -c configs/recognition/pptsn/pptsn_k400_videos.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+amp_train:main.py --amp --amp_level='O2' --validate -c configs/recognition/pptsn/pptsn_k400_videos.yaml --seed 1234
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/pptsn/pptsn_k400_videos.yaml
+-w:./test_tipc/output/ppTSN_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/ppTSN
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_videos.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/ppTSN_k400.pdparams
+infer_export:tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_videos.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/pptsn/pptsn_k400_videos.yaml
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.avi
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[250, 3, 224, 224]}]
diff --git a/test_tipc/configs/PP-TSN/train_infer_python.txt b/test_tipc/configs/PP-TSN/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..711e8d3e0dbd74503535ca1b0a8530709f29902d
--- /dev/null
+++ b/test_tipc/configs/PP-TSN/train_infer_python.txt
@@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:PP-TSN
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:2
+-o MODEL.backbone.pretrained:'data/ResNet50_vd_ssld_v2_pretrained.pdparams'
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/k400/train_small_videos.list' -o DATASET.valid.file_path='data/k400/val_small_videos.list' -o DATASET.test.file_path='data/k400/val_small_videos.list'
+##
+trainer:norm_train
+norm_train:main.py --validate -c configs/recognition/pptsn/pptsn_k400_videos.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/pptsn/pptsn_k400_videos.yaml
+-w:./test_tipc/output/ppTSN_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/ppTSN
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_videos.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/ppTSN_k400.pdparams
+infer_export:tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_videos.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/pptsn/pptsn_k400_videos.yaml
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.avi
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[250, 3, 224, 224]}]
diff --git a/test_tipc/configs/STGCN/train_amp_infer_python.txt b/test_tipc/configs/STGCN/train_amp_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1af7312b6dad0d1d11d81060046b54b93842aeee
--- /dev/null
+++ b/test_tipc/configs/STGCN/train_amp_infer_python.txt
@@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:STGCN
+python:python3.7
+gpu_list:0
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:null
+null:null
+train_model_name:null
+train_infer_video_dir:null
+null:null
+##
+trainer:amp_train
+norm_train:main.py -c configs/recognition/stgcn/stgcn_fsd.yaml --seed 1234 -o DATASET.train.file_path="data/fsd10/FSD_train_data.npy" -o DATASET.train.label_path="data/fsd10/FSD_train_label.npy" -o DATASET.test.file_path="data/fsd10/FSD_train_data.npy"
+pact_train:null
+fpgm_train:null
+distill_train:null
+amp_train:main.py --amp --amp_level='O2' -c configs/recognition/stgcn/stgcn_fsd.yaml --seed 1234 -o DATASET.train.file_path="data/fsd10/FSD_train_data.npy" -o DATASET.train.label_path="data/fsd10/FSD_train_label.npy" -o DATASET.test.file_path="data/fsd10/FSD_train_data.npy"
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/stgcn/stgcn_fsd.yaml -o DATASET.train.file_path="data/fsd10/FSD_train_data.npy" -o DATASET.train.label_path="data/fsd10/FSD_train_label.npy" -o DATASET.test.file_path="data/fsd10/FSD_train_data.npy"
+-w:./test_tipc/output/STGCN/STGCN_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/STGCN
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/stgcn/stgcn_fsd.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/STGCN_fsd.pdparams
+infer_export:tools/export_model.py -c configs/recognition/stgcn/stgcn_fsd.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/stgcn/stgcn_fsd.yaml
+--use_gpu:True|False
+--enable_mkldnn:False|True
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/fsd10/example_skeleton.npy
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[2, 350, 25, 1]}]
diff --git a/test_tipc/configs/STGCN/train_infer_python.txt b/test_tipc/configs/STGCN/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f63a591bea492443199e3db37d0b7e1fd409975f
--- /dev/null
+++ b/test_tipc/configs/STGCN/train_infer_python.txt
@@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:STGCN
+python:python3.7
+gpu_list:0
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:null
+null:null
+train_model_name:null
+train_infer_video_dir:null
+null:null
+##
+trainer:norm_train
+norm_train:main.py -c configs/recognition/stgcn/stgcn_fsd.yaml --seed 1234 -o DATASET.train.file_path="data/fsd10/FSD_train_data.npy" -o DATASET.train.label_path="data/fsd10/FSD_train_label.npy" -o DATASET.test.file_path="data/fsd10/FSD_train_data.npy"
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/stgcn/stgcn_fsd.yaml -o DATASET.train.file_path="data/fsd10/FSD_train_data.npy" -o DATASET.train.label_path="data/fsd10/FSD_train_label.npy" -o DATASET.test.file_path="data/fsd10/FSD_train_data.npy"
+-w:./test_tipc/output/STGCN/STGCN_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/STGCN
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/stgcn/stgcn_fsd.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/STGCN_fsd.pdparams
+infer_export:tools/export_model.py -c configs/recognition/stgcn/stgcn_fsd.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/stgcn/stgcn_fsd.yaml
+--use_gpu:True|False
+--enable_mkldnn:False|True
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/fsd10/example_skeleton.npy
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[2, 350, 25, 1]}]
diff --git a/test_tipc/configs/SlowFast/train_amp_infer_python.txt b/test_tipc/configs/SlowFast/train_amp_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3134357960f1fc706fb3d68176a54105a6e9eaff
--- /dev/null
+++ b/test_tipc/configs/SlowFast/train_amp_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:SlowFast
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:null
+-o MODEL.backbone.pretrained:null
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/k400/train_small_videos.list' -o DATASET.valid.file_path='data/k400/val_small_videos.list' -o DATASET.test.file_path='data/k400/val_small_videos.list'
+##
+trainer:amp_train
+norm_train:main.py --validate -c configs/recognition/slowfast/slowfast.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+amp_train:main.py --amp --amp_level='O2' --validate -c configs/recognition/slowfast/slowfast.yaml --seed 1234
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/slowfast/slowfast.yaml
+-w:./test_tipc/output/SlowFast/SlowFast_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/SlowFast
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/slowfast/slowfast.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/slowfast_4x16.pdparams
+infer_export:tools/export_model.py -c configs/recognition/slowfast/slowfast.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/slowfast/slowfast.yaml
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.avi
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================train_benchmark_params==========================
+batch_size:8
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_conv_workspace_size_limit=800
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[3, 4, 256, 256]},{float32,[3, 32, 256, 256]}]
diff --git a/test_tipc/configs/SlowFast/train_infer_python.txt b/test_tipc/configs/SlowFast/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6c060a00b8dfe14765df5c873f0e70bedd3ff781
--- /dev/null
+++ b/test_tipc/configs/SlowFast/train_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:SlowFast
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:null
+-o MODEL.backbone.pretrained:null
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/k400/train_small_videos.list' -o DATASET.valid.file_path='data/k400/val_small_videos.list' -o DATASET.test.file_path='data/k400/val_small_videos.list'
+##
+trainer:norm_train
+norm_train:main.py --validate -c configs/recognition/slowfast/slowfast.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/slowfast/slowfast.yaml
+-w:./test_tipc/output/SlowFast/SlowFast_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/SlowFast
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/slowfast/slowfast.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/slowfast_4x16.pdparams
+infer_export:tools/export_model.py -c configs/recognition/slowfast/slowfast.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/slowfast/slowfast.yaml
+--use_gpu:True|False
+--enable_mkldnn:False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.avi
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================train_benchmark_params==========================
+batch_size:8
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_conv_workspace_size_limit=800
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[3, 4, 256, 256]},{float32,[3, 32, 256, 256]}]
diff --git a/test_tipc/configs/TSM/train_amp_infer_python.txt b/test_tipc/configs/TSM/train_amp_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2f8d1c8f204f9b4390677e121e4f84e4ed9786d9
--- /dev/null
+++ b/test_tipc/configs/TSM/train_amp_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:TSM
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:2
+-o MODEL.backbone.pretrained:'data/ResNet50_pretrain.pdparams'
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/k400/train_small_frames.list' -o DATASET.valid.file_path='data/k400/val_small_frames.list' -o DATASET.test.file_path='data/k400/val_small_frames.list'
+##
+trainer:amp_train
+norm_train:main.py --validate -c configs/recognition/tsm/tsm_k400_frames.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+amp_train:main.py --amp --amp_level='O2' --validate -c configs/recognition/tsm/tsm_k400_frames.yaml --seed 1234
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/tsm/tsm_k400_frames.yaml
+-w:./test_tipc/output/TSM/TSM_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/TSM
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/tsm/tsm_k400_frames.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/TSM_k400.pdparams
+infer_export:tools/export_model.py -c configs/recognition/tsm/tsm_k400_frames.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/tsm/tsm_k400_frames.yaml
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.avi
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================train_benchmark_params==========================
+batch_size:30
+fp_items:fp32|fp16
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_conv_workspace_size_limit=800
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[8, 3, 224, 224]}]
diff --git a/test_tipc/configs/TSM/train_infer_python.txt b/test_tipc/configs/TSM/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d33b10ae4b289a7ba2c0c228d45ba041531cf435
--- /dev/null
+++ b/test_tipc/configs/TSM/train_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:TSM
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:2
+-o MODEL.backbone.pretrained:'data/ResNet50_pretrain.pdparams'
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/k400/train_small_frames.list' -o DATASET.valid.file_path='data/k400/val_small_frames.list' -o DATASET.test.file_path='data/k400/val_small_frames.list'
+##
+trainer:norm_train
+norm_train:main.py --validate -c configs/recognition/tsm/tsm_k400_frames.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/tsm/tsm_k400_frames.yaml
+-w:./test_tipc/output/TSM/TSM_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/TSM
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/tsm/tsm_k400_frames.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/TSM_k400.pdparams
+infer_export:tools/export_model.py -c configs/recognition/tsm/tsm_k400_frames.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/tsm/tsm_k400_frames.yaml
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.avi
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================train_benchmark_params==========================
+batch_size:30
+fp_items:fp32|fp16
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_conv_workspace_size_limit=800
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[8, 3, 224, 224]}]
diff --git a/test_tipc/configs/TSN/train_amp_infer_python.txt b/test_tipc/configs/TSN/train_amp_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b1b96b997d3b4c4584efbb320df275985e391a82
--- /dev/null
+++ b/test_tipc/configs/TSN/train_amp_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:TSN
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:2
+-o MODEL.backbone.pretrained:'data/ResNet50_pretrain.pdparams'
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/k400/train_small_frames.list' -o DATASET.valid.file_path='data/k400/val_small_frames.list' -o DATASET.test.file_path='data/k400/val_small_frames.list'
+##
+trainer:amp_train
+norm_train:main.py --validate -c configs/recognition/tsn/tsn_k400_frames.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+amp_train:main.py --amp --amp_level='O2' --validate -c configs/recognition/tsn/tsn_k400_frames.yaml --seed 1234
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/tsn/tsn_k400_frames.yaml
+-w:./test_tipc/output/TSN/TSN_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/TSN
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/tsn/tsn_k400_frames.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/TSN_k400.pdparams
+infer_export:tools/export_model.py -c configs/recognition/tsn/tsn_k400_frames.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/tsn/tsn_k400_frames.yaml
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.avi
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================train_benchmark_params==========================
+batch_size:32
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_conv_workspace_size_limit=800
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[250, 3, 224, 224]}]
diff --git a/test_tipc/configs/TSN/train_infer_python.txt b/test_tipc/configs/TSN/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5b7ca90450ac615ca2460c358076abeb7c5823ec
--- /dev/null
+++ b/test_tipc/configs/TSN/train_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:TSN
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:2
+-o MODEL.backbone.pretrained:'data/ResNet50_pretrain.pdparams'
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/k400/train_small_frames.list' -o DATASET.valid.file_path='data/k400/val_small_frames.list' -o DATASET.test.file_path='data/k400/val_small_frames.list'
+##
+trainer:norm_train
+norm_train:main.py --validate -c configs/recognition/tsn/tsn_k400_frames.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/tsn/tsn_k400_frames.yaml
+-w:./test_tipc/output/TSN/TSN_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/TSN
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/tsn/tsn_k400_frames.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/TSN_k400.pdparams
+infer_export:tools/export_model.py -c configs/recognition/tsn/tsn_k400_frames.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/tsn/tsn_k400_frames.yaml
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.avi
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================train_benchmark_params==========================
+batch_size:32
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_conv_workspace_size_limit=800
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[250, 3, 224, 224]}]
diff --git a/test_tipc/configs/TimeSformer/train_amp_infer_python.txt b/test_tipc/configs/TimeSformer/train_amp_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..84d5e45028f477a89afd02d0275665e53537a959
--- /dev/null
+++ b/test_tipc/configs/TimeSformer/train_amp_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:TimeSformer
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:null
+-o MODEL.backbone.pretrained:'data/ViT_base_patch16_224_pretrained.pdparams'
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/k400/train_small_videos.list' -o DATASET.valid.file_path='data/k400/val_small_videos.list' -o DATASET.test.file_path='data/k400/val_small_videos.list'
+##
+trainer:amp_train
+norm_train:main.py --validate -c configs/recognition/timesformer/timesformer_k400_videos.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+amp_train:main.py --amp --amp_level='O2' --validate -c configs/recognition/timesformer/timesformer_k400_videos.yaml --seed 1234
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/timesformer/timesformer_k400_videos.yaml
+-w:./test_tipc/output/TimeSformer/TimeSformer_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/TimeSformer
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/timesformer/timesformer_k400_videos.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/TimeSformer_k400.pdparams
+infer_export:tools/export_model.py -c configs/recognition/timesformer/timesformer_k400_videos.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/timesformer/timesformer_k400_videos.yaml
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.avi
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================train_benchmark_params==========================
+batch_size:1|14
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_conv_workspace_size_limit=800
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[3, 24, 224, 224]}]
diff --git a/test_tipc/configs/TimeSformer/train_infer_python.txt b/test_tipc/configs/TimeSformer/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c9dffd925649dc5b5e3ed5081faff58a1baeac53
--- /dev/null
+++ b/test_tipc/configs/TimeSformer/train_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:TimeSformer
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:null|null
+Global.auto_cast:null
+-o epochs:2
+-o output_dir:null
+-o DATASET.batch_size:null
+-o MODEL.backbone.pretrained:'data/ViT_base_patch16_224_pretrained.pdparams'
+train_model_name:null
+train_infer_video_dir:null
+-o DATASET.train.file_path:'data/k400/train_small_videos.list' -o DATASET.valid.file_path='data/k400/val_small_videos.list' -o DATASET.test.file_path='data/k400/val_small_videos.list'
+##
+trainer:norm_train
+norm_train:main.py --validate -c configs/recognition/timesformer/timesformer_k400_videos.yaml --seed 1234
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:main.py --test -c configs/recognition/timesformer/timesformer_k400_videos.yaml
+-w:./test_tipc/output/TimeSformer/TimeSformer_epoch_00001.pdparams
+##
+===========================infer_params===========================
+-o:inference/TimeSformer
+-p:null
+norm_export:tools/export_model.py -c configs/recognition/timesformer/timesformer_k400_videos.yaml --save_name inference
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:null
+infer_model:./data/TimeSformer_k400.pdparams
+infer_export:tools/export_model.py -c configs/recognition/timesformer/timesformer_k400_videos.yaml
+infer_quant:False
+inference:tools/predict.py --config configs/recognition/timesformer/timesformer_k400_videos.yaml
+--use_gpu:True|False
+--enable_mkldnn:False
+--cpu_threads:1|6
+--batch_size:1|2
+--use_tensorrt:False
+--precision:fp32|fp16
+--model_file:inference.pdmodel
+--input_file:./data/example.avi
+null:null
+--enable_benchmark:True
+--params_file:inference.pdiparams
+===========================train_benchmark_params==========================
+batch_size:1|14
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_conv_workspace_size_limit=800
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[3, 24, 224, 224]}]
diff --git a/test_tipc/docs/Video_TIPC.png b/test_tipc/docs/Video_TIPC.png
new file mode 100644
index 0000000000000000000000000000000000000000..5031baee8b77de418a3433dbfa07c4b9e021e3e9
Binary files /dev/null and b/test_tipc/docs/Video_TIPC.png differ
diff --git a/test_tipc/docs/benchmark_train.md b/test_tipc/docs/benchmark_train.md
new file mode 100644
index 0000000000000000000000000000000000000000..debbcdb2de018815d350f0ffdfb682c0ca16946f
--- /dev/null
+++ b/test_tipc/docs/benchmark_train.md
@@ -0,0 +1,82 @@
+
+# TIPC Benchmark Test on Linux
+
+This document describes the benchmark test. The main program of the benchmark test is `benchmark_train.sh`, which is used to verify and monitor model training performance.
+
+# 1. Test Workflow
+## 1.1 Prepare Data and Install the Environment
+Run `test_tipc/prepare.sh` to prepare the training data and install the environment (taking the TSM model as an example).
+
+```shell
+# Usage: bash test_tipc/prepare.sh train_benchmark.txt mode
+bash test_tipc/prepare.sh test_tipc/configs/TSM/train_infer_python.txt benchmark_train
+```
+
+## 1.2 Function Test
+Run `test_tipc/benchmark_train.sh` to train the model and parse the logs (taking the TSM model as an example).
+
+```shell
+# Usage: bash test_tipc/benchmark_train.sh train_benchmark.txt mode
+bash test_tipc/benchmark_train.sh test_tipc/configs/TSM/train_infer_python.txt benchmark_train
+
+```
+
+`test_tipc/benchmark_train.sh` also supports running a single training configuration, selected by passing a third argument, as follows (taking the TSM model as an example):
+```shell
+# Usage: bash test_tipc/benchmark_train.sh train_benchmark.txt mode config_param
+## dygraph, batch_size=30, fp32, data-parallel mode, single-machine single-GPU training configuration
+bash test_tipc/benchmark_train.sh test_tipc/configs/TSM/train_infer_python.txt benchmark_train dynamic_bs30_fp32_DP_N1C1
+## dygraph, batch_size=30, fp16, data-parallel mode, single-machine 4-GPU training configuration
+bash test_tipc/benchmark_train.sh test_tipc/configs/TSM/train_infer_python.txt benchmark_train dynamic_bs30_fp16_DP_N1C4
+```
+The third argument passed to benchmark_train.sh (e.g. dynamic_bs30_fp16_DP_N1C4) has the following format:
+
+`${modeltype}_${batch_size}_${fp_item}_${run_mode}_${device_num}`
+
+It encodes the model type, the batch size, the training precision (fp32, fp16, etc.), the distributed run mode, and the machine configuration used for distributed training, such as single-machine single-GPU (N1C1).
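+
+For reference, here is a minimal Python sketch (a hypothetical helper, not part of the repository scripts) that splits such a tag back into its fields:
+
+```python
+# Hypothetical helper: decompose a benchmark tag such as "dynamic_bs30_fp16_DP_N1C4".
+def parse_benchmark_tag(tag):
+    modeltype, batch_size, fp_item, run_mode, device_num = tag.split("_")
+    return {
+        "modeltype": modeltype,                       # e.g. dynamic
+        "batch_size": int(batch_size.lstrip("bs")),   # e.g. 30
+        "fp_item": fp_item,                           # e.g. fp16
+        "run_mode": run_mode,                         # e.g. DP
+        "device_num": device_num,                     # e.g. N1C4
+    }
+
+print(parse_benchmark_tag("dynamic_bs30_fp16_DP_N1C4"))
+```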
+
+
+## 2. Log Output
+
+After running, the modified configuration file is temporarily saved to `test_tipc/benchmark_train.txt`. This temporary file is then used for training and analysis, and the model's training log and parsed log are saved.
+
+For example, the parsed training log of one TSM configuration looks like this:
+
+```json
+{
+ "model_branch": "tipc_benchmark",
+ "model_commit": "c8f93c7fd9908391371bcccf36a4db4398c49777",
+ "model_name": "TSM_bs1_fp16_MultiP_DP",
+ "batch_size": 1,
+ "fp_item": "fp16",
+ "run_process_type": "MultiP",
+ "run_mode": "DP",
+ "convergence_value": 0,
+ "convergence_key": "loss:",
+ "ips": 40.237,
+ "speed_unit": "instance/sec",
+ "device_num": "N1C4",
+ "model_run_time": "28",
+ "frame_commit": "828f87aecd8a47d19f19f0a83155f8dd340eeaa9",
+ "frame_version": "0.0.0"
+}
+```
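+
+A minimal Python sketch for consuming such a parsed result (assuming it has been saved as a JSON file, for example as one of the files under the `index` directory described below):
+
+```python
+import json
+
+# Assumption: the parsed benchmark result shown above is stored as JSON in this file.
+with open("index/PaddleVideo_TSM_bs1_fp16_MultiP_DP_N1C4_speed") as f:
+    result = json.load(f)
+
+print(result["model_name"], result["ips"], result["speed_unit"])
+```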
+
+The training logs and parsed results are saved in four directories, organized as follows (taking the TSM model as an example):
+```
+PaddleVideo
+├── train_log
+│ ├── PaddleVideo_TSM_bs1_fp16_MultiP_DP_N1C4_log
+│ ├── PaddleVideo_TSM_bs1_fp32_MultiP_DP_N1C4_log
+│
+├── index
+│ ├── PaddleVideo_TSM_bs1_fp16_MultiP_DP_N1C4_speed
+│ ├── PaddleVideo_TSM_bs1_fp32_MultiP_DP_N1C4_speed
+│
+├── profiling_log
+│ ├── PaddleVideo_TSM_bs1_fp32_SingleP_DP_N1C1_profiling
+│ ├── PaddleVideo_TSM_bs1_fp32_SingleP_DP_N1C1_profiling
+│
+├── benchmark_log
+ └── results.log
+```
diff --git a/test_tipc/docs/guide.png b/test_tipc/docs/guide.png
new file mode 100644
index 0000000000000000000000000000000000000000..319ac819daff38ed77e84cdff2b122e8bc4a8e5f
Binary files /dev/null and b/test_tipc/docs/guide.png differ
diff --git a/test_tipc/docs/install.md b/test_tipc/docs/install.md
new file mode 100644
index 0000000000000000000000000000000000000000..e9c32d7f5e12a5bb901a5f630bf7945b374eca89
--- /dev/null
+++ b/test_tipc/docs/install.md
@@ -0,0 +1,127 @@
+## 1. Environment Preparation
+
+This tutorial covers setting up the runtime environment for the basic function tests under the test_tipc directory.
+
+Recommended environment:
+- CUDA 10.2
+- CUDNN 7.6.5
+- TensorRT 7.0.0.11
+
+You can set up the environment either with a Docker image or with a local Python environment. Installing from a Docker image is recommended to avoid unnecessary environment configuration.
+
+## 2. Install with a Docker Image
+
+Installing from a Docker image is recommended. Create the container with the following command; the current directory is mapped to the `/paddle` directory inside the container.
+```bash
+nvidia-docker run --name paddle -it -v $PWD:/paddle paddlepaddle/paddle:2.0.0rc0-gpu-cuda10.2-cudnn7 /bin/bash
+cd /paddle
+
+# Install Paddle with TensorRT support
+python3.7 -m pip install https://paddle-wheel.bj.bcebos.com/with-trt/2.1.1-gpu-cuda10.1-cudnn7-mkl-gcc8.2/paddlepaddle_gpu-2.1.1.post101-cp37-cp37m-linux_x86_64.whl
+```
+
+## 3. Build a Python Environment
+
+Recommended configuration:
+- CUDA10.2 + CUDNN7.6 + TensorRT 7
+
+The following walks through the environment setup, using the CUDA 10.2 + cuDNN 7.6 + TensorRT 7 configuration as an example.
+
+### 3.1 Install cuDNN
+
+If your current environment already meets the cuDNN version requirement, you can skip this step.
+
+Taking cuDNN 7.6.5 as an example, the installation steps are as follows. First download cuDNN: get version 7.6.5 from the [Nvidia website](https://developer.nvidia.com/rdp/cudnn-archive) and download the three deb files that match your system version:
+- cuDNN Runtime Library, e.g. [libcudnn7_7.6.5.32-1+cuda10.2_amd64.deb](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/7.6.5.32/Production/10.2_20191118/Ubuntu16_04-x64/libcudnn7_7.6.5.32-1%2Bcuda10.2_amd64.deb)
+- cuDNN Developer Library, e.g. [libcudnn7-dev_7.6.5.32-1+cuda10.2_amd64.deb](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/7.6.5.32/Production/10.2_20191118/Ubuntu16_04-x64/libcudnn7-dev_7.6.5.32-1%2Bcuda10.2_amd64.deb)
+- cuDNN Code Samples, e.g. [libcudnn7-doc_7.6.5.32-1+cuda10.2_amd64.deb](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/7.6.5.32/Production/10.2_20191118/Ubuntu16_04-x64/libcudnn7-doc_7.6.5.32-1%2Bcuda10.2_amd64.deb)
+
+For deb installation you can refer to the [official documentation](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-deb); install as follows:
+```bash
+# 7.x.x.x is the downloaded version number
+# $HOME is the working directory
+sudo dpkg -i libcudnn7_7.x.x.x-1+cudax.x_amd64.deb
+sudo dpkg -i libcudnn7-dev_7.x.x.x-1+cudax.x_amd64.deb
+sudo dpkg -i libcudnn7-doc_7.x.x.x-1+cudax.x_amd64.deb
+
+# Verify the installation
+cp -r /usr/src/cudnn_samples_v7/ $HOME
+cd $HOME/cudnn_samples_v7/mnistCUDNN
+
+# Compile
+make clean && make
+./mnistCUDNN
+```
+If mnistCUDNN reports success after running, the installation succeeded. If a freeimage-related error appears, install the freeimage library as prompted:
+```bash
+sudo apt-get install libfreeimage-dev
+sudo apt-get install libfreeimage
+```
+
+### 3.2 Install TensorRT
+
+First, download TensorRT from the [Nvidia TensorRT page](https://developer.nvidia.com/tensorrt-getting-started). Version 7.0.0.11 is used here. Make sure to choose the TensorRT build that matches your system and CUDA version, and it is recommended to **download the TAR package**.
+
+Taking Ubuntu 16.04 + CUDA 10.2 as an example, after downloading and extracting, you can follow the installation steps of the [official documentation](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-713/install-guide/index.html#installing-tar) and install as follows:
+```bash
+# In the commands below, '${version}' is the downloaded TensorRT version, e.g. 7.0.0.11
+# Set the environment variable to the lib directory of the extracted TensorRT
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:
+
+# Install TensorRT
+cd TensorRT-${version}/python
+python3.7 -m pip install tensorrt-*-cp37-none-linux_x86_64.whl
+
+# Install graphsurgeon
+cd TensorRT-${version}/graphsurgeon
+python3.7 -m pip install graphsurgeon-0.4.1-py2.py3-none-any.whl
+```
+
+
+### 3.3 Install PaddlePaddle
+
+Download a Paddle package built with TensorRT support from this [link](https://paddleinference.paddlepaddle.org.cn/user_guides/download_lib.html#python).
+Choose the linux-cuda10.1-trt6-gcc8.2 Python 3.7 build of Paddle:
+
+```bash
+# As the download link shows, this is the paddle 2.1.1 build for CUDA 10.1 with cuDNN 7
+wget https://paddle-wheel.bj.bcebos.com/with-trt/2.1.1-gpu-cuda10.1-cudnn7-mkl-gcc8.2/paddlepaddle_gpu-2.1.1.post101-cp37-cp37m-linux_x86_64.whl
+python3.7 -m pip install -U paddlepaddle_gpu-2.1.1.post101-cp37-cp37m-linux_x86_64.whl
+```
+
+## 4. Install PaddleVideo Dependencies
+```bash
+# Install AutoLog
+git clone https://github.com/LDOUBLEV/AutoLog
+cd AutoLog
+python3.7 -m pip install -r requirements.txt
+python3.7 setup.py bdist_wheel
+python3.7 -m pip install ./dist/auto_log-1.0.0-py3-none-any.whl
+
+# Clone the PaddleVideo repository
+cd ../
+git clone https://github.com/PaddlePaddle/PaddleVideo.git
+
+```
+
+Install the PaddleVideo dependencies:
+```bash
+cd PaddleVideo
+python3.7 -m pip install -r requirements.txt
+```
+
+## FAQ:
+Q. You are using Paddle compiled with TensorRT, but TensorRT dynamic library is not found. Ignore this if TensorRT is not needed.
+
+A. This usually means that the installed Paddle build was compiled with TensorRT, but the TensorRT inference library cannot be found in the local environment. You need to download the TensorRT library and, after extracting it, set the LD_LIBRARY_PATH environment variable.
+For example:
+
+```bash
+export PATH=$PATH:/usr/local/cuda-10.2/bin
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-10.2/lib64
+export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda-10.2/lib64
+source /etc/profile
+export LD_LIBRARY_PATH=/xx/xx/TensorRT-7.0.0.11/lib:$LD_LIBRARY_PATH
+```
+Alternatively, the downloaded TensorRT version may not match the TensorRT version that Paddle was compiled with; in that case, download a matching TensorRT version and install it again.
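+
+As a quick sanity check (an illustrative sketch, not part of the repository), you can verify from Python 3.7 that the TensorRT runtime library is visible to the dynamic loader:
+
+```python
+import ctypes
+
+# If LD_LIBRARY_PATH points at the TensorRT lib directory, this load succeeds
+# and the "TensorRT dynamic library is not found" warning should disappear.
+try:
+    ctypes.CDLL("libnvinfer.so")
+    print("TensorRT runtime library found")
+except OSError as err:
+    print("libnvinfer.so not found, check LD_LIBRARY_PATH:", err)
+```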
+
diff --git a/test_tipc/docs/test_inference_cpp.md b/test_tipc/docs/test_inference_cpp.md
new file mode 100644
index 0000000000000000000000000000000000000000..0a060e6ecbc93381b50b4f13253d62504bef120d
--- /dev/null
+++ b/test_tipc/docs/test_inference_cpp.md
@@ -0,0 +1,95 @@
+# C++ Inference Function Test
+
+The main program of the C++ inference function test is `test_inference_cpp.sh`, which tests model inference based on the C++ inference library.
+
+## 1. Summary of Test Conclusions
+
+Depending on whether quantization is used during training, the tested models are divided into `normal models` and `quantized models` (TODO). The C++ inference functions supported by these two types of models are summarized as follows:
+
+| model type | device | batchsize | tensorrt | mkldnn | cpu multi-threading |
+| ---- | ---- | ---- | :----: | :----: | :----: |
+| normal model | GPU | 1/6 | fp32/fp16 | - | - |
+| normal model | CPU | 1/6 | - | fp32 | supported |
+
+## 2. Test Workflow
+Please refer to the [documentation](./install.md) to configure the TIPC runtime environment.
+
+### 2.1 Function Test
+First run `prepare.sh` to prepare the data and model, then run `test_inference_cpp.sh` for testing. Log files with the `cpp_infer_*.log` suffix are finally generated under the `test_tipc/output` directory.
+
+```bash
+bash test_tipc/prepare.sh test_tipc/configs/PP-TSM/PP-TSM_infer_cpp.txt 'cpp_infer'
+```
+```bash
+# Usage 1:
+bash test_tipc/test_inference_cpp.sh test_tipc/configs/PP-TSM/PP-TSM_infer_cpp.txt
+# Usage 2: run inference on a specified GPU card; the third argument is the GPU card id
+bash test_tipc/test_inference_cpp.sh test_tipc/configs/PP-TSM/PP-TSM_infer_cpp.txt 1
+```
+
+After running the inference commands, the run logs are automatically saved under the `test_tipc/output` folder, including the following files:
+
+```shell
+test_tipc/output/PP-TSM/
+ ├── results_cpp.log # log of the run status of each command
+ ├── cpp_infer_cpu_usemkldnn_False_threads_1_precision_fp32_batchsize_1.log # inference log on CPU with MKLDNN disabled, 1 thread, batch_size=1
+ ├── cpp_infer_cpu_usemkldnn_False_threads_6_precision_fp32_batchsize_1.log # inference log on CPU with MKLDNN disabled, 6 threads, batch_size=1
+ ├── cpp_infer_gpu_usetrt_False_precision_fp32_batchsize_1.log # fp32 inference log on GPU with TensorRT disabled, batch_size=1
+ ├── cpp_infer_gpu_usetrt_True_precision_fp16_batchsize_1.log # fp16 inference log on GPU with TensorRT enabled, batch_size=1
+......
+```
+Among them, results_cpp.log contains the run status of each command. If a command runs successfully, it prints:
+
+```
+Run successfully with command - ./deploy/cpp_infer/build/ppvideo rec --use_gpu=True --use_tensorrt=False --precision=fp32 --rec_model_dir=./inference/ppTSM --rec_batch_num=1 --video_dir=./deploy/cpp_infer/example_video_dir --benchmark=True --inference_model_name=ppTSM --char_list_file=data/k400/Kinetics-400_label_list.txt --num_seg=8 > ./test_tipc/output/PP-TSM/cpp_infer_gpu_usetrt_False_precision_fp32_batchsize_1.log 2>&1
+......
+```
+If a command fails, it prints:
+```
+Run failed with command - ./deploy/cpp_infer/build/ppvideo rec --use_gpu=False --enable_mkldnn=False --cpu_threads=1 --rec_model_dir=./inference/ppTSM --rec_batch_num=1 --video_dir=./deploy/cpp_infer/example_video_dir --benchmark=True --inference_model_name=ppTSM --char_list_file=data/k400/Kinetics-400_label_list.txt --num_seg=8 > ./test_tipc/output/PP-TSM/cpp_infer_cpu_usemkldnn_False_threads_1_precision_fp32_batchsize_1.log 2>&1
+......
+```
+Based on the contents of results_cpp.log, it is easy to determine which command failed.
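+
+For example, a small Python snippet (illustrative only, assuming the output directory shown above) that lists every failed command:
+
+```python
+# List the failing commands recorded in results_cpp.log.
+with open("test_tipc/output/PP-TSM/results_cpp.log") as f:
+    for line in f:
+        if line.startswith("Run failed"):
+            print(line.strip())
+```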
+
+
+### 2.2 Accuracy Test
+
+The compare_results.py script checks whether the model predictions match the expected results. The main steps are:
+- extract the results from the predicted output text
+- extract the locally saved reference output text
+- compare the two results against the accuracy expectation; an error is raised when the difference exceeds the configured thresholds.
+
+#### Usage
+Run the command:
+```shell
+python3.7 test_tipc/compare_results.py --gt_file "test_tipc/results/PP-TSM_CPP/cpp_ppvideo_PP-TSM_results_fp*.txt" --log_file "test_tipc/output/PP-TSM/cpp_infer_*.log" --atol=1e-3 --rtol=1e-3
+```
+
+Parameter description:
+- gt_file: path to the pre-saved ground-truth results; *.txt files are supported and indexed automatically, and by default they are stored under the test_tipc/results/ folder
+- log_file: path to the inference logs saved by running test_tipc/test_inference_cpp.sh in infer mode; the logs contain the predicted results, such as the predicted class and score; the cpp_infer_*.log pattern is also supported
+- atol: absolute tolerance
+- rtol: relative tolerance
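+
+The comparison criterion is effectively that of `numpy.allclose`. The sketch below illustrates the check, under the assumption that the extracted values are compared element-wise with the given atol and rtol (the actual parsing and comparison logic lives in `compare_results.py`):
+
+```python
+import numpy as np
+
+# A value passes when |pred - gt| <= atol + rtol * |gt|.
+def check_close(pred, gt, atol=1e-3, rtol=1e-3):
+    if not np.allclose(pred, gt, atol=atol, rtol=rtol):
+        raise ValueError("The results are inconsistent!")
+
+# Example with the fp16/fp32 PP-TSM scores stored under test_tipc/results/PP-TSM_CPP/
+check_close([0.988793], [0.988699])  # passes: the difference is ~9.4e-5
+```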
+
+#### Output
+
+Example output of a successful run:
+```bash
+Assert allclose passed! The results of cpp_infer_cpu_usemkldnn_True_threads_1_precision_fp32_batchsize_1.log and test_tipc/results/PP-TSN_CPP/cpp_ppvideo_PP-TSN_results_fp32.txt are consistent!
+Assert allclose passed! The results of cpp_infer_cpu_usemkldnn_False_threads_1_precision_fp32_batchsize_1.log and test_tipc/results/PP-TSN_CPP/cpp_ppvideo_PP-TSN_results_fp32.txt are consistent!
+Assert allclose passed! The results of cpp_infer_gpu_usetrt_True_precision_fp16_batchsize_1.log and test_tipc/results/PP-TSN_CPP/cpp_ppvideo_PP-TSN_results_fp16.txt are consistent!
+Assert allclose passed! The results of cpp_infer_gpu_usetrt_False_precision_fp32_batchsize_1.log and test_tipc/results/PP-TSN_CPP/cpp_ppvideo_PP-TSN_results_fp32.txt are consistent!
+Assert allclose passed! The results of cpp_infer_cpu_usemkldnn_True_threads_6_precision_fp32_batchsize_1.log and test_tipc/results/PP-TSN_CPP/cpp_ppvideo_PP-TSN_results_fp32.txt are consistent!
+Assert allclose passed! The results of cpp_infer_cpu_usemkldnn_False_threads_6_precision_fp32_batchsize_1.log and test_tipc/results/PP-TSN_CPP/cpp_ppvideo_PP-TSN_results_fp32.txt are consistent!
+Assert allclose passed! The results of cpp_infer_gpu_usetrt_True_precision_fp32_batchsize_1.log and test_tipc/results/PP-TSN_CPP/cpp_ppvideo_PP-TSN_results_fp32.txt are consistent!
+```
+
+Example output when the results are inconsistent:
+```bash
+ValueError: The results of cpp_infer_cpu_usemkldnn_True_threads_1_precision_fp32_batchsize_1.log and the results of test_tipc/results/PP-TSM_CPP/cpp_ppvideo_PP-TSM_results_fp32.txt are inconsistent!
+```
+
+
+## 3. More Tutorials
+
+This document is for function testing only. For a more detailed tutorial on C++ inference, please refer to: [Server-side C++ inference](../../deploy/cpp_infer/readme.md)
diff --git a/test_tipc/docs/test_train_amp_inference_python.md b/test_tipc/docs/test_train_amp_inference_python.md
new file mode 100644
index 0000000000000000000000000000000000000000..efc167e1c5128f533cb2c98c91e4b7a9b1844f44
--- /dev/null
+++ b/test_tipc/docs/test_train_amp_inference_python.md
@@ -0,0 +1,124 @@
+# Linux GPU/CPU Mixed-Precision Training and Inference Test
+
+The main program of the Linux GPU/CPU mixed-precision training and inference test is `test_train_inference_python.sh`, which tests basic Python-based functions such as model training, evaluation, and inference.
+
+## 1. Summary of Test Conclusions
+
+- Training:
+
+  | algorithm | model | single-machine single-GPU | single-machine multi-GPU |
+  | :---- | :---- | :---- | :---- |
+  | PP-TSM | pptsm_k400_frames_uniform | AMP training | AMP training |
+  | PP-TSN | pptsn_k400_videos | AMP training | AMP training |
+  | AGCN | agcn_fsd | AMP training | - |
+  | STGCN | stgcn_fsd | AMP training | - |
+  | TimeSformer | timesformer_k400_videos | AMP training | AMP training |
+  | SlowFast | slowfast | AMP training | AMP training |
+  | TSM | tsm_k400_frames | AMP training | AMP training |
+  | TSN | tsn_k400_frames | AMP training | AMP training |
+  | AttentionLSTM | attention_lstm_youtube8m | AMP training | AMP training |
+  | BMN | bmn | AMP training | AMP training |
+
+
+- Inference:
+
+  | algorithm | model | device_CPU | device_GPU | batchsize |
+  | :---- | :---- | :---- | :---- | :---- |
+  | PP-TSM | pptsm_k400_frames_uniform | supported | supported | 1/2 |
+  | PP-TSN | pptsn_k400_videos | supported | supported | 1/2 |
+  | AGCN | agcn_fsd | supported | supported | 1/2 |
+  | STGCN | stgcn_fsd | supported | supported | 1/2 |
+  | TimeSformer | timesformer_k400_videos | supported | supported | 1/2 |
+  | SlowFast | slowfast | supported | supported | 1/2 |
+  | TSM | tsm_k400_frames | supported | supported | 1/2 |
+  | TSN | tsn_k400_frames | supported | supported | 1/2 |
+  | AttentionLSTM | attention_lstm_youtube8m | supported | supported | 1/2 |
+  | BMN | bmn | supported | supported | 1 |
+## 2. Test Workflow
+
+### 2.1 Prepare the Environment
+
+
+- Install PaddlePaddle: if you have already installed paddlepaddle 2.2 or later, there is no need to run the commands below to install it.
+  ```
+  # Paddle 2.2 or later is required
+  # Install the GPU version of Paddle
+  pip install paddlepaddle-gpu==2.2.0
+  # Install the CPU version of Paddle
+ pip install paddlepaddle==2.2.0
+ ```
+
+- Install dependencies
+  ```
+  pip install -r requirements.txt
+  ```
+- Install AutoLog (a tool for standardized log output)
+ ```
+ pip install https://paddleocr.bj.bcebos.com/libs/auto_log-1.2.0-py3-none-any.whl
+ ```
+
+### 2.2 Function Test
+
+
+The test method is shown below. To test a different model, simply switch to the corresponding parameter configuration file.
+
+```bash
+bash test_tipc/test_train_inference_python.sh ${your_params_file_path} lite_train_lite_infer
+```
+
+Taking the `Linux GPU/CPU mixed-precision training and inference test` (default optimization level O2) of `PP-TSM` as an example, the commands are as follows.
+
+```bash
+bash test_tipc/prepare.sh test_tipc/configs/PP-TSM/train_amp_infer_python.txt lite_train_lite_infer
+```
+
+```bash
+bash test_tipc/test_train_inference_python.sh test_tipc/configs/PP-TSM/train_amp_infer_python.txt lite_train_lite_infer
+```
+
+Output like the following indicates that the commands ran successfully.
+
+```bash
+Run successfully with command - python3.7 main.py --amp --amp_level='O1' --validate -c configs/recognition/tsm/tsm_k400_frames.yaml --seed 1234 --max_iters=30 -o output_dir=./test_tipc/output/TSM/amp_train_gpus_0_autocast_null -o epochs=2 -o MODEL.backbone.pretrained='data/ResNet50_pretrain.pdparams' -o DATASET.batch_size=2 -o DATASET.train.file_path='data/k400/train_small_frames.list' -o DATASET.valid.file_path='data/k400/val_small_frames.list' -o DATASET.test.file_path='data/k400/val_small_frames.list' !
+
+........
+
+Run successfully with command - python3.7 tools/predict.py --config configs/recognition/tsm/tsm_k400_frames.yaml --use_gpu=False --enable_mkldnn=False --cpu_threads=6 --model_file=./test_tipc/output/TSM/amp_train_gpus_0,1_autocast_null/inference.pdmodel --batch_size=2 --input_file=./data/example.avi --enable_benchmark=False --precision=fp32 --params_file=./test_tipc/output/TSM/amp_train_gpus_0,1_autocast_null/inference.pdiparams > ./test_tipc/output/TSM/python_infer_cpu_usemkldnn_False_threads_6_precision_fp32_batchsize_2.log 2>&1 !
+
+```
+
+When the benchmark option is enabled, detailed test data is collected, including runtime environment information (OS version, CUDA version, CUDNN version, driver version), Paddle version information, configuration information (run device, number of threads, whether memory optimization is enabled, and so on), model information (model name, precision), data information (batch size, whether the input shape is dynamic, and so on), and performance information (CPU/GPU usage, total run time, preprocessing time, inference time, postprocessing time), as shown below:
+
+```log
+[2022/03/18 12:01:21] root INFO: ---------------------- Env info ----------------------
+[2022/03/18 12:01:21] root INFO: OS_version: Ubuntu 16.04
+[2022/03/18 12:01:21] root INFO: CUDA_version: 10.2.89
+[2022/03/18 12:01:21] root INFO: CUDNN_version: 7.6.5
+[2022/03/18 12:01:21] root INFO: drivier_version: 440.64.00
+[2022/03/18 12:01:21] root INFO: ---------------------- Paddle info ----------------------
+[2022/03/18 12:01:21] root INFO: paddle_version: 0.0.0
+[2022/03/18 12:01:21] root INFO: paddle_commit: 6849d33b62cacccb27797375a212e37a47ca9484
+[2022/03/18 12:01:21] root INFO: log_api_version: 1.0
+[2022/03/18 12:01:21] root INFO: ----------------------- Conf info -----------------------
+[2022/03/18 12:01:21] root INFO: runtime_device: gpu
+[2022/03/18 12:01:21] root INFO: ir_optim: True
+[2022/03/18 12:01:21] root INFO: enable_memory_optim: True
+[2022/03/18 12:01:21] root INFO: enable_tensorrt: False
+[2022/03/18 12:01:21] root INFO: enable_mkldnn: False
+[2022/03/18 12:01:21] root INFO: cpu_math_library_num_threads: 1
+[2022/03/18 12:01:21] root INFO: ----------------------- Model info ----------------------
+[2022/03/18 12:01:21] root INFO: model_name: ppTSM
+[2022/03/18 12:01:21] root INFO: precision: fp32
+[2022/03/18 12:01:21] root INFO: ----------------------- Data info -----------------------
+[2022/03/18 12:01:21] root INFO: batch_size: 2
+[2022/03/18 12:01:21] root INFO: input_shape: dynamic
+[2022/03/18 12:01:21] root INFO: data_num: 30
+[2022/03/18 12:01:21] root INFO: ----------------------- Perf info -----------------------
+[2022/03/18 12:01:21] root INFO: cpu_rss(MB): 2062.625, gpu_rss(MB): 2111.0, gpu_util: 100.0%
+[2022/03/18 12:01:21] root INFO: total time spent(s): 5.5024
+[2022/03/18 12:01:21] root INFO: preprocess_time(ms): 247.8535, inference_time(ms): 26.6164, postprocess_time(ms): 0.6504
+```
+
+This information can be viewed in the run log. Taking `PP-TSM` as an example, the complete log shown above is located at `./test_tipc/output/PP-TSM/python_infer_gpu_usetrt_False_precision_fp32_batchsize_2.log`.
+
+If a run fails, the failure log and the corresponding command are also printed in the terminal, and the command can be used to analyze the cause of the failure.
diff --git a/test_tipc/docs/test_train_inference_python.md b/test_tipc/docs/test_train_inference_python.md
new file mode 100644
index 0000000000000000000000000000000000000000..f1d220961b50d62b74cb532770dc8813ea8a02b9
--- /dev/null
+++ b/test_tipc/docs/test_train_inference_python.md
@@ -0,0 +1,131 @@
+# Basic Training and Inference Function Test on Linux
+
+The main program of the basic Linux training and inference function test is `test_train_inference_python.sh`, which tests basic Python-based functions such as model training, evaluation, and inference, including pruning (TODO), quantization (TODO), and distillation.
+
+- For the basic training and inference function test on Mac, refer to [TODO]()
+- For the basic training and inference function test on Windows, refer to [TODO]()
+
+## 1. Summary of Test Conclusions
+
+- Training:
+
+  | algorithm | model | single-machine single-GPU | single-machine multi-GPU | multi-machine multi-GPU | model compression (single-machine multi-GPU) |
+  | :---- | :---- | :---- | :---- | :---- | :---- |
+  | PP-TSM | pptsm_k400_frames_uniform | normal training | normal training | - | - |
+  | PP-TSN | pptsn_k400_videos | normal training | normal training | - | - |
+  | AGCN | agcn_fsd | normal training | - | - | - |
+  | STGCN | stgcn_fsd | normal training | - | - | - |
+  | TimeSformer | timesformer_k400_videos | normal training | normal training | - | - |
+  | SlowFast | slowfast | normal training | normal training | - | - |
+  | TSM | tsm_k400_frames | normal training | normal training | - | - |
+  | TSN | tsn_k400_frames | normal training | normal training | - | - |
+  | AttentionLSTM | attention_lstm_youtube8m | normal training | normal training | - | - |
+  | BMN | bmn | normal training | normal training | - | - |
+
+
+- Inference: depending on whether quantization is used during training, the produced models are divided into `normal models` and `quantized models (TODO)`. The inference functions supported by these two types of models are summarized as follows.
+
+  | model type | device | batchsize | tensorrt | mkldnn | cpu multi-threading |
+  | ---- | ---- | ---- | :----: | :----: | :----: |
+  | normal model | GPU | 1/2 | fp32/fp16 | - | 1/6 |
+  | normal model | CPU | 1/2 | - | fp32/fp16 | 1/6 |
+
+
+## 2. Test Workflow
+
+Please refer to the [documentation](./install.md) to configure the TIPC runtime environment.
+
+### 2.1 Install Dependencies
+- Install PaddlePaddle (>=2.0) for your software and hardware environment
+
+- Install PaddleVideo dependencies
+  ```
+  # run this inside the PaddleVideo directory
+  python3.7 -m pip install -r requirements.txt
+  ```
+- Install autolog (a tool for standardized log output)
+ ```
+ git clone https://github.com/LDOUBLEV/AutoLog
+ cd AutoLog
+ python3.7 -m pip install -r requirements.txt
+ python3 setup.py bdist_wheel
+ python3.7 -m pip install ./dist/auto_log-1.0.0-py3-none-any.whl
+ cd ../
+ ```
+- Install PaddleSlim (optional)
+  ```
+  # To test quantization, pruning, and similar functions, install PaddleSlim with the following command
+ python3.7 -m pip install paddleslim
+ ```
+
+
+### 2.2 Basic Function Test
+1. First run `prepare.sh` to prepare the data and pretrained model parameters for the given model name
+2. Then run `test_train_inference_python.sh` to run the corresponding tests for the given model name
+3. Log files in the `python_infer_*.log` format are generated under the `test_tipc/output` directory
+
+Specifically, taking the PP-TSM test chain as an example, the details are as follows:
+
+`test_train_inference_python.sh` supports the following run modes; each mode uses different data and is used to test speed and accuracy:
+
+- Mode 1: **lite_train_lite_infer**, train with a small amount of data, used to quickly verify that the training-to-inference pipeline runs end to end, without checking accuracy or speed;
+ ```shell
+ bash test_tipc/prepare.sh test_tipc/configs/PP-TSM/train_infer_python.txt 'lite_train_lite_infer'
+ bash test_tipc/test_train_inference_python.sh test_tipc/configs/PP-TSM/train_infer_python.txt 'lite_train_lite_infer'
+ ```
+
+- Mode 2: **lite_train_whole_infer**, train with a small amount of data and run inference on a moderate amount of data, used to verify that the trained model can run inference and that the inference speed is reasonable;
+ ```shell
+ bash test_tipc/prepare.sh test_tipc/configs/PP-TSM/train_infer_python.txt 'lite_train_whole_infer'
+ bash test_tipc/test_train_inference_python.sh test_tipc/configs/PP-TSM/train_infer_python.txt 'lite_train_whole_infer'
+ ```
+
+- Mode 3: **whole_infer**, no training, run inference on the full dataset; walks through evaluation and dynamic-to-static export of the released model, and checks the inference time and accuracy of the inference model;
+  ```shell
+  bash test_tipc/prepare.sh test_tipc/configs/PP-TSM/train_infer_python.txt 'whole_infer'
+  # Usage 1:
+  bash test_tipc/test_train_inference_python.sh test_tipc/configs/PP-TSM/train_infer_python.txt 'whole_infer'
+  # Usage 2: run inference on a specified GPU card; the third argument is the GPU card id
+ bash test_tipc/test_train_inference_python.sh test_tipc/configs/PP-TSM/train_infer_python.txt 'whole_infer' '1'
+ ```
+
+- Mode 4: **whole_train_whole_infer**, train on the full dataset and run inference on the full dataset, verifying training accuracy, inference accuracy, and inference speed;
+ ```shell
+ bash test_tipc/prepare.sh test_tipc/configs/PP-TSM/train_infer_python.txt 'whole_train_whole_infer'
+ bash test_tipc/test_train_inference_python.sh test_tipc/configs/PP-TSM/train_infer_python.txt 'whole_train_whole_infer'
+ ```
+
+
+Finally, log files with the .log suffix are generated under the `test_tipc/output/model_name` directory.
+
+
+### 2.3 Accuracy Test
+
+The compare_results.py script checks whether the model predictions match the expected results. The main steps are:
+- extract the predicted results, including class and probability, from the `*.log` logs
+- extract the saved ground-truth results from local files;
+- compare the two results against the accuracy expectation; an error is raised when the difference exceeds the configured thresholds.
+
+#### Usage
+Run the command:
+```shell
+python3.7 test_tipc/compare_results.py --gt_file="test_tipc/results/python_*.txt" --log_file="test_tipc/output/python_*.log" --atol=1e-3 --rtol=1e-3
+```
+
+Parameter description:
+- gt_file: path to the pre-saved ground-truth results; *.txt files are supported and indexed automatically, and by default they are stored under the test_tipc/results/ folder
+- log_file: path to the inference logs saved by running test_tipc/test_train_inference_python.sh in infer mode; the logs contain the predicted results, such as the predicted class and score; the python_infer_*.log pattern is also supported
+- atol: absolute tolerance
+- rtol: relative tolerance
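+
+A minimal sketch of how the class and probability could be pulled out of such a log (the regular expressions here are illustrative assumptions; the real parsing is done inside `compare_results.py`):
+
+```python
+import re
+
+# Log fragment in the format written by tools/predict.py
+log_text = """Current video file: ./data/example.avi
+        top-1 class: 5
+        top-1 score: 0.990738570690155"""
+
+top1_class = int(re.search(r"top-1 class:\s*(\d+)", log_text).group(1))
+top1_score = float(re.search(r"top-1 score:\s*([\d.]+)", log_text).group(1))
+print(top1_class, top1_score)
+```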
+
+#### Output
+
+Example output of a successful run:
+```bash
+Assert allclose passed! The results of python_infer_cpu_usemkldnn_False_threads_6_precision_fp32_batchsize_16.log and ./test_tipc/results/PP-TSM/python_ppvideo_PP-TSM_results_fp32.txt are consistent!
+```
+
+Example output when the results are inconsistent:
+```bash
+ValueError: The results of python_infer_gpu_usetrt_False_precision_fp32_batchsize_8.log and the results of ./test_tipc/results/PP-TSM/python_ppvideo_PP-TSM_results_fp32.txt are inconsistent!
+```
diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d29e6ee32259f2089ca2d083f2fccf9612b3ceb4
--- /dev/null
+++ b/test_tipc/prepare.sh
@@ -0,0 +1,469 @@
+#!/bin/bash
+source test_tipc/common_func.sh
+
+FILENAME=$1
+
+# set -xe
+
+    ls pkl_frame/train*.pkl > train_small.list # write the paths of train*.pkl into train_small.list
+    ls pkl_frame/validate*.pkl > val_small.list # write the paths of validate*.pkl into val_small.list
+
+    ${python} split_yt8m.py train_small.list # split each train*.pkl into multiple train*_split*.pkl files
+    ${python} split_yt8m.py val_small.list # split each validate*.pkl into multiple validate*_split*.pkl files
+
+    ls pkl_frame/train*_split*.pkl > train_small.list # rewrite train_small.list with the paths of train*_split*.pkl
+    ls pkl_frame/validate*_split*.pkl > val_small.list # rewrite val_small.list with the paths of validate*_split*.pkl
+ popd
+ elif [ ${model_name} == "SlowFast" ]; then
+ # pretrain lite train data
+ pushd ./data/k400
+ wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar
+ tar -xf k400_videos_small.tar
+ popd
+ elif [ ${model_name} == "BMN" ]; then
+ # pretrain lite train data
+ pushd ./data
+ mkdir bmn_data
+ cd bmn_data
+ wget -nc https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz
+ tar -xf bmn_feat.tar.gz
+ wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json
+ wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json
+ popd
+ else
+ echo "Not added into TIPC yet."
+ fi
+
+elif [ ${MODE} = "whole_train_whole_infer" ];then
+ if [ ${model_name} == "PP-TSM" ]; then
+ # pretrain whole train data
+ pushd ./data/k400
+ wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list
+ wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list
+ bash download_k400_data.sh train_link.list
+ bash download_k400_data.sh val_link.list
+ ${python} extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 # extract frames from video file
+ # download annotations
+ wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list
+ wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list
+ popd
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate
+ elif [ ${model_name} == "PP-TSN" ]; then
+ # pretrain whole train data
+ pushd ./data/k400
+ wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list
+ wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list
+ bash download_k400_data.sh train_link.list
+ bash download_k400_data.sh val_link.list
+ # download annotations
+ wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list
+ wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list
+ popd
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate
+ elif [ ${model_name} == "AGCN" ]; then
+ # pretrain whole train data
+ pushd data/fsd10
+ wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy
+ wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy
+ popd
+ elif [ ${model_name} == "STGCN" ]; then
+ # pretrain whole train data
+ pushd data/fsd10
+ wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy
+ wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy
+ popd
+ elif [ ${model_name} == "TSM" ]; then
+ # pretrain whole train data
+ pushd ./data/k400
+ wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list
+ wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list
+ bash download_k400_data.sh train_link.list
+ bash download_k400_data.sh val_link.list
+ ${python} extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 # extract frames from video file
+ # download annotations
+ wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list
+ wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list
+ popd
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate
+ elif [ ${model_name} == "TSN" ]; then
+ # pretrain whole train data
+ pushd ./data/k400
+ wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list
+ wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list
+ bash download_k400_data.sh train_link.list
+ bash download_k400_data.sh val_link.list
+ ${python} extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 # extract frames from video file
+ # download annotations
+ wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list
+ wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list
+ popd
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate
+ elif [ ${model_name} == "TimeSformer" ]; then
+ # pretrain whole train data
+ pushd ./data/k400
+ wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list
+ wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list
+ bash download_k400_data.sh train_link.list
+ bash download_k400_data.sh val_link.list
+ # download annotations
+ wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list
+ wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list
+ popd
+ # download pretrained weights
+ wget -nc -P ./data https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate
+ elif [ ${model_name} == "AttentionLSTM" ]; then
+ # pretrain whole train data
+ pushd data/yt8m
+ mkdir frame
+ cd frame
+ ## download & decompression training data
+ curl data.yt8m.org/download.py | partition=2/frame/train mirror=asia python
+ curl data.yt8m.org/download.py | partition=2/frame/validate mirror=asia python
+ ${python} -m pip install tensorflow-gpu==1.14.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
+ cd ..
+ ${python} tf2pkl.py ./frame ./pkl_frame/
+        ls pkl_frame/train*.pkl > train.list # write the paths of train*.pkl into train.list
+        ls pkl_frame/validate*.pkl > val.list # write the paths of validate*.pkl into val.list
+
+        ${python} split_yt8m.py train.list # split each train*.pkl into multiple train*_split*.pkl files
+        ${python} split_yt8m.py val.list # split each validate*.pkl into multiple validate*_split*.pkl files
+
+        ls pkl_frame/train*_split*.pkl > train.list # rewrite train.list with the paths of train*_split*.pkl
+        ls pkl_frame/validate*_split*.pkl > val.list # rewrite val.list with the paths of validate*_split*.pkl
+ popd
+ elif [ ${model_name} == "SlowFast" ]; then
+ # pretrain whole train data
+ pushd ./data/k400
+ wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list
+ wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list
+ bash download_k400_data.sh train_link.list
+ bash download_k400_data.sh val_link.list
+ # download annotations
+ wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list
+ wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list
+ popd
+ elif [ ${model_name} == "BMN" ]; then
+ # pretrain whole train data
+ pushd ./data
+ mkdir bmn_data
+ cd bmn_data
+ wget -nc https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz
+ tar -xf bmn_feat.tar.gz
+ wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json
+ wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json
+ popd
+ else
+ echo "Not added into TIPC yet."
+ fi
+elif [ ${MODE} = "lite_train_whole_infer" ];then
+ if [ ${model_name} == "PP-TSM" ]; then
+ # pretrain lite train data
+ pushd ./data/k400
+ wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar
+ tar -xf k400_rawframes_small.tar
+ popd
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate
+ elif [ ${model_name} == "PP-TSN" ]; then
+ # pretrain lite train data
+ pushd ./data/k400
+ wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar
+ tar -xf k400_videos_small.tar
+ popd
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate
+ elif [ ${model_name} == "AGCN" ]; then
+ # pretrain lite train data
+ pushd data/fsd10
+ wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy
+ wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy
+ popd
+ elif [ ${model_name} == "STGCN" ]; then
+ # pretrain lite train data
+ pushd data/fsd10
+ wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy
+ wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy
+ popd
+ elif [ ${model_name} == "TSM" ]; then
+ # pretrain lite train data
+ pushd ./data/k400
+ wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar
+ tar -xf k400_rawframes_small.tar
+ popd
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate
+ elif [ ${model_name} == "TSN" ]; then
+ # pretrain lite train data
+ pushd ./data/k400
+ wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar
+ tar -xf k400_rawframes_small.tar
+ popd
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate
+ elif [ ${model_name} == "TimeSformer" ]; then
+ # pretrain lite train data
+ pushd ./data/k400
+ wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar
+ tar -xf k400_videos_small.tar
+ popd
+ # download pretrained weights
+ wget -nc -P ./data https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate
+ elif [ ${model_name} == "AttentionLSTM" ]; then
+ # pretrain lite train data
+ pushd data/yt8m
+ ## download & decompression training data
+ wget -nc https://videotag.bj.bcebos.com/Data/yt8m_rawframe_small.tar
+ tar -xf yt8m_rawframe_small.tar
+ ${python} -m pip install tensorflow-gpu==1.14.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
+ ${python} tf2pkl.py ./frame ./pkl_frame/
+        ls pkl_frame/train*.pkl > train_small.list # write the paths of train*.pkl into train_small.list
+        ls pkl_frame/validate*.pkl > val_small.list # write the paths of validate*.pkl into val_small.list
+
+        ${python} split_yt8m.py train_small.list # split each train*.pkl into multiple train*_split*.pkl files
+        ${python} split_yt8m.py val_small.list # split each validate*.pkl into multiple validate*_split*.pkl files
+
+        ls pkl_frame/train*_split*.pkl > train_small.list # rewrite train_small.list with the paths of train*_split*.pkl
+        ls pkl_frame/validate*_split*.pkl > val_small.list # rewrite val_small.list with the paths of validate*_split*.pkl
+ popd
+ elif [ ${model_name} == "SlowFast" ]; then
+ # pretrain lite train data
+ pushd ./data/k400
+ wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar
+ tar -xf k400_videos_small.tar
+ popd
+ elif [ ${model_name} == "BMN" ]; then
+ # pretrain lite train data
+ pushd ./data
+ mkdir bmn_data
+ cd bmn_data
+ wget -nc https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz
+ tar -xf bmn_feat.tar.gz
+ wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json
+ wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json
+ popd
+ else
+ echo "Not added into TIPC yet."
+ fi
+elif [ ${MODE} = "whole_infer" ];then
+ if [ ${model_name} = "PP-TSM" ]; then
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams --no-check-certificate
+ elif [ ${model_name} = "PP-TSN" ]; then
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams --no-check-certificate
+ elif [ ${model_name} == "AGCN" ]; then
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_fsd.pdparams --no-check-certificate
+ elif [ ${model_name} == "STGCN" ]; then
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_fsd.pdparams --no-check-certificate
+ elif [ ${model_name} == "TSM" ]; then
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams --no-check-certificate
+ elif [ ${model_name} == "TSN" ]; then
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400.pdparams --no-check-certificate
+ elif [ ${model_name} == "TimeSformer" ]; then
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TimeSformer_k400.pdparams --no-check-certificate
+ elif [ ${model_name} == "AttentionLSTM" ]; then
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AttentionLSTM_yt8.pdparams --no-check-certificate
+ elif [ ${model_name} == "SlowFast" ]; then
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams --no-check-certificate
+ elif [ ${model_name} == "BMN" ]; then
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/BMN/BMN.pdparams --no-check-certificate
+ else
+ echo "Not added into TIPC yet."
+ fi
+fi
+
+if [ ${MODE} = "benchmark_train" ];then
+ ${python} -m pip install -r requirements.txt
+ if [ ${model_name} == "PP-TSM" ]; then
+ echo "Not added into TIPC yet."
+ elif [ ${model_name} == "PP-TSN" ]; then
+ echo "Not added into TIPC yet."
+ elif [ ${model_name} == "AGCN" ]; then
+ echo "Not added into TIPC yet."
+ elif [ ${model_name} == "STGCN" ]; then
+ echo "Not added into TIPC yet."
+ elif [ ${model_name} == "TSM" ]; then
+ # pretrain lite train data
+ pushd ./data/k400
+ wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar
+ tar -xf k400_rawframes_small.tar
+ popd
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate
+ elif [ ${model_name} == "TSN" ]; then
+ # pretrain lite train data
+ pushd ./data/k400
+ wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar
+ tar -xf k400_rawframes_small.tar
+ popd
+ # download pretrained weights
+ wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate
+ elif [ ${model_name} == "TimeSformer" ]; then
+ # pretrain lite train data
+ pushd ./data/k400
+ wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar
+ tar -xf k400_videos_small.tar
+ popd
+ # download pretrained weights
+ wget -nc -P ./data https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate
+ elif [ ${model_name} == "AttentionLSTM" ]; then
+ echo "Not added into TIPC yet."
+ elif [ ${model_name} == "SlowFast" ]; then
+ # pretrain lite train data
+ pushd ./data/k400
+ wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar
+ tar -xf k400_videos_small.tar
+ popd
+ elif [ ${model_name} == "BMN" ]; then
+ # pretrain lite train data
+ pushd ./data
+ mkdir bmn_data
+ cd bmn_data
+ wget -nc https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz
+ tar -xf bmn_feat.tar.gz
+ wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json
+ wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json
+ popd
+ else
+ echo "Not added into TIPC yet."
+ fi
+fi
+
+if [ ${MODE} = "klquant_whole_infer" ]; then
+ echo "Not added into TIPC now."
+fi
+
+if [ ${MODE} = "cpp_infer" ];then
+ # install required packages
+ apt-get update
+ apt install libavformat-dev
+ apt install libavcodec-dev
+ apt install libswresample-dev
+ apt install libswscale-dev
+ apt install libavutil-dev
+ apt install libsdl1.2-dev
+ apt-get install ffmpeg
+
+ if [ ${model_name} = "PP-TSM" ]; then
+ # download pretrained weights
+ wget -nc -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams --no-check-certificate
+ # export inference model
+ ${python} tools/export_model.py -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml -p data/ppTSM_k400_uniform.pdparams -o ./inference/ppTSM
+ elif [ ${model_name} = "PP-TSN" ]; then
+ # download pretrained weights
+ wget -nc -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams --no-check-certificate
+ # export inference model
+ ${python} tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_videos.yaml -p data/ppTSN_k400.pdparams -o ./inference/ppTSN
+ else
+ echo "Not added into TIPC now."
+ fi
+fi
+
+if [ ${MODE} = "serving_infer" ];then
+ echo "Not added into TIPC now."
+fi
+
+if [ ${MODE} = "paddle2onnx_infer" ];then
+ echo "Not added into TIPC now."
+fi
diff --git a/test_tipc/results/AGCN/python_ppvideo_AGCN_results_fp16.txt b/test_tipc/results/AGCN/python_ppvideo_AGCN_results_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..aaae6a042d442678c2ce844e93ef5c76da653fa5
--- /dev/null
+++ b/test_tipc/results/AGCN/python_ppvideo_AGCN_results_fp16.txt
@@ -0,0 +1,3 @@
+Current video file: data/fsd10/example_skeleton.npy
+ top-1 class: 27
+ top-1 score: 0.8965644240379333
diff --git a/test_tipc/results/AGCN/python_ppvideo_AGCN_results_fp32.txt b/test_tipc/results/AGCN/python_ppvideo_AGCN_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..aaae6a042d442678c2ce844e93ef5c76da653fa5
--- /dev/null
+++ b/test_tipc/results/AGCN/python_ppvideo_AGCN_results_fp32.txt
@@ -0,0 +1,3 @@
+Current video file: data/fsd10/example_skeleton.npy
+ top-1 class: 27
+ top-1 score: 0.8965644240379333
diff --git a/test_tipc/results/AttentionLSTM/python_ppvideo_AttentionLSTM_results_fp16.txt b/test_tipc/results/AttentionLSTM/python_ppvideo_AttentionLSTM_results_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5ecc6a423cac1029be093598a3b1ce0f35f1ede1
--- /dev/null
+++ b/test_tipc/results/AttentionLSTM/python_ppvideo_AttentionLSTM_results_fp16.txt
@@ -0,0 +1,3 @@
+Current video file: data/example.pkl
+ top-1 class: 11
+ top-1 score: 0.9840923547744751
diff --git a/test_tipc/results/AttentionLSTM/python_ppvideo_AttentionLSTM_results_fp32.txt b/test_tipc/results/AttentionLSTM/python_ppvideo_AttentionLSTM_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95457c0c960e8c908cf2999c1b8efc083774730c
--- /dev/null
+++ b/test_tipc/results/AttentionLSTM/python_ppvideo_AttentionLSTM_results_fp32.txt
@@ -0,0 +1,3 @@
+Current video file: data/example.pkl
+ top-1 class: 11
+ top-1 score: 0.9841002225875854
diff --git a/test_tipc/results/BMN/python_ppvideo_BMN_results_fp16.txt b/test_tipc/results/BMN/python_ppvideo_BMN_results_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..66c756eea572ded6e7001df9f6046240399b7875
--- /dev/null
+++ b/test_tipc/results/BMN/python_ppvideo_BMN_results_fp16.txt
@@ -0,0 +1,6 @@
+Current video file: data/example_feat.npy :
+{'score': 0.7967117428779602, 'segment': [0.0, 122.9877]}
+{'score': 0.49079903960227966, 'segment': [12.423000000000002, 124.23]}
+{'score': 0.21400144696235657, 'segment': [39.7536, 122.9877]}
+{'score': 0.210616335272789, 'segment': [0.0, 109.3224]}
+{'score': 0.06873712688684464, 'segment': [23.6037, 114.2916]}
diff --git a/test_tipc/results/BMN/python_ppvideo_BMN_results_fp32.txt b/test_tipc/results/BMN/python_ppvideo_BMN_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ae9a11121ce14258de63e7b96c8468a492c2b00b
--- /dev/null
+++ b/test_tipc/results/BMN/python_ppvideo_BMN_results_fp32.txt
@@ -0,0 +1,6 @@
+Current video file: data/example_feat.npy :
+{'score': 0.7968077063560486, 'segment': [0.0, 122.9877]}
+{'score': 0.49097612500190735, 'segment': [12.423000000000002, 124.23]}
+{'score': 0.21395836770534515, 'segment': [39.7536, 122.9877]}
+{'score': 0.2106524258852005, 'segment': [0.0, 109.3224]}
+{'score': 0.06876271963119507, 'segment': [23.6037, 114.2916]}
diff --git a/test_tipc/results/PP-TSM/python_ppvideo_PP-TSM_results_fp16.txt b/test_tipc/results/PP-TSM/python_ppvideo_PP-TSM_results_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..941f5225a901e3fe5ee4e626f500bfaf12bbc0fa
--- /dev/null
+++ b/test_tipc/results/PP-TSM/python_ppvideo_PP-TSM_results_fp16.txt
@@ -0,0 +1,3 @@
+Current video file: ./data/example.avi
+ top-1 class: 5
+ top-1 score: 0.990794837474823
diff --git a/test_tipc/results/PP-TSM/python_ppvideo_PP-TSM_results_fp32.txt b/test_tipc/results/PP-TSM/python_ppvideo_PP-TSM_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6035dc3c511c3e19bd8e29ea09bbf5ba2460a221
--- /dev/null
+++ b/test_tipc/results/PP-TSM/python_ppvideo_PP-TSM_results_fp32.txt
@@ -0,0 +1,3 @@
+Current video file: ./data/example.avi
+ top-1 class: 5
+ top-1 score: 0.990738570690155
diff --git a/test_tipc/results/PP-TSM_CPP/cpp_ppvideo_PP-TSM_results_fp16.txt b/test_tipc/results/PP-TSM_CPP/cpp_ppvideo_PP-TSM_results_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..14a02fc3f7b7fe9ea41360f67107244c16527e90
--- /dev/null
+++ b/test_tipc/results/PP-TSM_CPP/cpp_ppvideo_PP-TSM_results_fp16.txt
@@ -0,0 +1 @@
+./deploy/cpp_infer/example_video_dir/example01.avi class: 5 archery score: 0.988793
diff --git a/test_tipc/results/PP-TSM_CPP/cpp_ppvideo_PP-TSM_results_fp32.txt b/test_tipc/results/PP-TSM_CPP/cpp_ppvideo_PP-TSM_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a43e1527a8e5afcd5b72326f02c0129bff93c756
--- /dev/null
+++ b/test_tipc/results/PP-TSM_CPP/cpp_ppvideo_PP-TSM_results_fp32.txt
@@ -0,0 +1 @@
+./deploy/cpp_infer/example_video_dir/example01.avi class: 5 archery score: 0.988699
diff --git a/test_tipc/results/PP-TSN/python_ppvideo_PP-TSN_results_fp16.txt b/test_tipc/results/PP-TSN/python_ppvideo_PP-TSN_results_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..57d820f8771e320303d36eb0a574cfa842791444
--- /dev/null
+++ b/test_tipc/results/PP-TSN/python_ppvideo_PP-TSN_results_fp16.txt
@@ -0,0 +1,3 @@
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9998562335968018
diff --git a/test_tipc/results/PP-TSN/python_ppvideo_PP-TSN_results_fp32.txt b/test_tipc/results/PP-TSN/python_ppvideo_PP-TSN_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f3751bd514c184092f7cd8a7cb7127bca237ec9f
--- /dev/null
+++ b/test_tipc/results/PP-TSN/python_ppvideo_PP-TSN_results_fp32.txt
@@ -0,0 +1,3 @@
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9998553991317749
diff --git a/test_tipc/results/PP-TSN_CPP/cpp_ppvideo_PP-TSN_results_fp16.txt b/test_tipc/results/PP-TSN_CPP/cpp_ppvideo_PP-TSN_results_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a72619cf1778bfbb8206b7469cbf26d670eccfd9
--- /dev/null
+++ b/test_tipc/results/PP-TSN_CPP/cpp_ppvideo_PP-TSN_results_fp16.txt
@@ -0,0 +1 @@
+./deploy/cpp_infer/example_video_dir/example01.avi class: 5 archery score: 0.999323
diff --git a/test_tipc/results/PP-TSN_CPP/cpp_ppvideo_PP-TSN_results_fp32.txt b/test_tipc/results/PP-TSN_CPP/cpp_ppvideo_PP-TSN_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1e22968f1193c15ff769f05c85e297b7f5caa6e3
--- /dev/null
+++ b/test_tipc/results/PP-TSN_CPP/cpp_ppvideo_PP-TSN_results_fp32.txt
@@ -0,0 +1 @@
+./deploy/cpp_infer/example_video_dir/example01.avi class: 5 archery score: 0.999315
diff --git a/test_tipc/results/STGCN.txt/python_ppvideo_STGCN_results_fp16.txt b/test_tipc/results/STGCN.txt/python_ppvideo_STGCN_results_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..aa05e5d889714dfeacf2cc5a4e068f99ad182185
--- /dev/null
+++ b/test_tipc/results/STGCN.txt/python_ppvideo_STGCN_results_fp16.txt
@@ -0,0 +1,3 @@
+Current video file: data/fsd10/example_skeleton.npy
+ top-1 class: 27
+ top-1 score: 0.9912770986557007
diff --git a/test_tipc/results/STGCN.txt/python_ppvideo_STGCN_results_fp32.txt b/test_tipc/results/STGCN.txt/python_ppvideo_STGCN_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..aa05e5d889714dfeacf2cc5a4e068f99ad182185
--- /dev/null
+++ b/test_tipc/results/STGCN.txt/python_ppvideo_STGCN_results_fp32.txt
@@ -0,0 +1,3 @@
+Current video file: data/fsd10/example_skeleton.npy
+ top-1 class: 27
+ top-1 score: 0.9912770986557007
diff --git a/test_tipc/results/SlowFast/python_ppvideo_SlowFast_results_fp16.txt b/test_tipc/results/SlowFast/python_ppvideo_SlowFast_results_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9baf852478fc53eb665055d2766531e1132ba1ee
--- /dev/null
+++ b/test_tipc/results/SlowFast/python_ppvideo_SlowFast_results_fp16.txt
@@ -0,0 +1,3 @@
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 1.0
diff --git a/test_tipc/results/SlowFast/python_ppvideo_SlowFast_results_fp32.txt b/test_tipc/results/SlowFast/python_ppvideo_SlowFast_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e419ed6e9b6837883155350dd12a56e2323bab95
--- /dev/null
+++ b/test_tipc/results/SlowFast/python_ppvideo_SlowFast_results_fp32.txt
@@ -0,0 +1,3 @@
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9999998807907104
diff --git a/test_tipc/results/TSM/python_ppvideo_TSM_results_fp16.txt b/test_tipc/results/TSM/python_ppvideo_TSM_results_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..df56a1554a95942ae5e43fe536565b2a4502cf4e
--- /dev/null
+++ b/test_tipc/results/TSM/python_ppvideo_TSM_results_fp16.txt
@@ -0,0 +1,3 @@
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9999222755432129
diff --git a/test_tipc/results/TSM/python_ppvideo_TSM_results_fp32.txt b/test_tipc/results/TSM/python_ppvideo_TSM_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8e2a425a549bb551f50ce32bf5ccfdad770e97e7
--- /dev/null
+++ b/test_tipc/results/TSM/python_ppvideo_TSM_results_fp32.txt
@@ -0,0 +1,3 @@
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9999209642410278
diff --git a/test_tipc/results/TSN/python_ppvideo_TSN_results_fp16.txt b/test_tipc/results/TSN/python_ppvideo_TSN_results_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1f1a8187f6a8bc02f1ea67c605a3f31c120388a5
--- /dev/null
+++ b/test_tipc/results/TSN/python_ppvideo_TSN_results_fp16.txt
@@ -0,0 +1,3 @@
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.99908447265625
diff --git a/test_tipc/results/TSN/python_ppvideo_TSN_results_fp32.txt b/test_tipc/results/TSN/python_ppvideo_TSN_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f964d3c35d02e88b8f41b9fa8e3b0663e3766479
--- /dev/null
+++ b/test_tipc/results/TSN/python_ppvideo_TSN_results_fp32.txt
@@ -0,0 +1,3 @@
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9990712404251099
diff --git a/test_tipc/results/TimeSformer/python_ppvideo_TimeSformer_results_fp32.txt b/test_tipc/results/TimeSformer/python_ppvideo_TimeSformer_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..281e789c9007a729bb0d08b93c8424e1086a5757
--- /dev/null
+++ b/test_tipc/results/TimeSformer/python_ppvideo_TimeSformer_results_fp32.txt
@@ -0,0 +1,3 @@
+Current video file: data/example.avi
+ top-1 class: 5
+ top-1 score: 0.9997474551200867
diff --git a/test_tipc/test_inference_cpp.sh b/test_tipc/test_inference_cpp.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5a83fd3452fb43958aa27fa94ebb5deb36a3e180
--- /dev/null
+++ b/test_tipc/test_inference_cpp.sh
@@ -0,0 +1,227 @@
+#!/bin/bash
+source test_tipc/common_func.sh
+
+FILENAME=$1
+dataline=$(awk 'NR==1, NR==18{print}' $FILENAME)
+
+# parser params
+IFS=$'\n'
+lines=(${dataline})
+
+# parser cpp inference model
+model_name=$(func_parser_value "${lines[1]}")
+use_opencv=$(func_parser_value "${lines[2]}")
+cpp_infer_model_dir_list=$(func_parser_value "${lines[3]}")
+cpp_infer_is_quant=$(func_parser_value "${lines[4]}")
+# parser cpp inference
+inference_cmd=$(func_parser_value "${lines[5]}")
+cpp_use_gpu_key=$(func_parser_key "${lines[6]}")
+cpp_use_gpu_list=$(func_parser_value "${lines[6]}")
+cpp_use_mkldnn_key=$(func_parser_key "${lines[7]}")
+cpp_use_mkldnn_list=$(func_parser_value "${lines[7]}")
+cpp_cpu_threads_key=$(func_parser_key "${lines[8]}")
+cpp_cpu_threads_list=$(func_parser_value "${lines[8]}")
+cpp_batch_size_key=$(func_parser_key "${lines[9]}")
+cpp_batch_size_list=$(func_parser_value "${lines[9]}")
+cpp_use_trt_key=$(func_parser_key "${lines[10]}")
+cpp_use_trt_list=$(func_parser_value "${lines[10]}")
+cpp_precision_key=$(func_parser_key "${lines[11]}")
+cpp_precision_list=$(func_parser_value "${lines[11]}")
+cpp_infer_model_key=$(func_parser_key "${lines[12]}")
+cpp_image_dir_key=$(func_parser_key "${lines[13]}")
+cpp_infer_img_dir=$(func_parser_value "${lines[13]}")
+cpp_infer_key1=$(func_parser_key "${lines[14]}")
+cpp_infer_value1=$(func_parser_value "${lines[14]}")
+cpp_benchmark_key=$(func_parser_key "${lines[15]}")
+cpp_benchmark_value=$(func_parser_value "${lines[15]}")
+cpp_infer_key2=$(func_parser_key "${lines[16]}")
+cpp_infer_value2=$(func_parser_value "${lines[16]}")
+cpp_infer_key3=$(func_parser_key "${lines[17]}")
+cpp_infer_value3=$(func_parser_value "${lines[17]}")
+
+LOG_PATH="./test_tipc/output/${model_name}"
+mkdir -p ${LOG_PATH}
+status_log="${LOG_PATH}/results_cpp.log"
+
+
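+# Run the compiled C++ inference binary for one exported model directory, sweeping the
+# CPU (mkldnn / threads / batch_size) and GPU (TensorRT / precision / batch_size)
+# combinations parsed above; every command's log and exit status are recorded.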
+function func_cpp_inference(){
+ IFS='|'
+ _script=$1
+ _model_dir=$2
+ _log_path=$3
+ _img_dir=$4
+ _flag_quant=$5
+ # inference
+ for use_gpu in ${cpp_use_gpu_list[*]}; do
+ if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then
+ for use_mkldnn in ${cpp_use_mkldnn_list[*]}; do
+ if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then
+ continue
+ fi
+ for threads in ${cpp_cpu_threads_list[*]}; do
+ for batch_size in ${cpp_batch_size_list[*]}; do
+ precision="fp32"
+ if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then
+                            precision="int8"
+ fi
+ _save_log_path="${_log_path}/cpp_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log"
+ set_infer_data=$(func_set_params "${cpp_image_dir_key}" "${_img_dir}")
+ set_benchmark=$(func_set_params "${cpp_benchmark_key}" "${cpp_benchmark_value}")
+ set_batchsize=$(func_set_params "${cpp_batch_size_key}" "${batch_size}")
+ set_cpu_threads=$(func_set_params "${cpp_cpu_threads_key}" "${threads}")
+ set_model_dir=$(func_set_params "${cpp_infer_model_key}" "${_model_dir}")
+ set_infer_params1=$(func_set_params "${cpp_infer_key1}" "${cpp_infer_value1}")
+ set_infer_params2=$(func_set_params "${cpp_infer_key2}" "${cpp_infer_value2}")
+ set_infer_params3=$(func_set_params "${cpp_infer_key3}" "${cpp_infer_value3}")
+ command="${_script} ${cpp_use_gpu_key}=${use_gpu} ${cpp_use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} ${set_infer_params2} ${set_infer_params3} > ${_save_log_path} 2>&1 "
+ eval $command
+ last_status=${PIPESTATUS[0]}
+ eval "cat ${_save_log_path}"
+ status_check $last_status "${command}" "${status_log}"
+ done
+ done
+ done
+ elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then
+ for use_trt in ${cpp_use_trt_list[*]}; do
+ for precision in ${cpp_precision_list[*]}; do
+ if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then
+ continue
+ fi
+ if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then
+ continue
+ fi
+ if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then
+ continue
+ fi
+ for batch_size in ${cpp_batch_size_list[*]}; do
+ _save_log_path="${_log_path}/cpp_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log"
+ set_infer_data=$(func_set_params "${cpp_image_dir_key}" "${_img_dir}")
+ set_benchmark=$(func_set_params "${cpp_benchmark_key}" "${cpp_benchmark_value}")
+ set_batchsize=$(func_set_params "${cpp_batch_size_key}" "${batch_size}")
+ set_tensorrt=$(func_set_params "${cpp_use_trt_key}" "${use_trt}")
+ set_precision=$(func_set_params "${cpp_precision_key}" "${precision}")
+ set_model_dir=$(func_set_params "${cpp_infer_model_key}" "${_model_dir}")
+ set_infer_params1=$(func_set_params "${cpp_infer_key1}" "${cpp_infer_value1}")
+ set_infer_params2=$(func_set_params "${cpp_infer_key2}" "${cpp_infer_value2}")
+ set_infer_params3=$(func_set_params "${cpp_infer_key3}" "${cpp_infer_value3}")
+ command="${_script} ${cpp_use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} ${set_infer_params2} ${set_infer_params3} > ${_save_log_path} 2>&1 "
+ eval $command
+ last_status=${PIPESTATUS[0]}
+ eval "cat ${_save_log_path}"
+ status_check $last_status "${command}" "${status_log}"
+
+ done
+ done
+ done
+ else
+ echo "Does not support hardware other than CPU and GPU Currently!"
+ fi
+ done
+}
+
+
+cd deploy/cpp_infer
+if [ ${use_opencv} = "True" ]; then
+ if [ -d "opencv-3.4.7/opencv3/" ] && [ $(md5sum opencv-3.4.7.tar.gz | awk -F ' ' '{print $1}') = "faa2b5950f8bee3f03118e600c74746a" ];then
+ echo "################### build opencv skipped ###################"
+ else
+ echo "################### building opencv ###################"
+ rm -rf opencv-3.4.7.tar.gz opencv-3.4.7/
+ wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/opencv-3.4.7.tar.gz
+ tar -xf opencv-3.4.7.tar.gz
+
+ cd opencv-3.4.7/
+ install_path=$(pwd)/opencv3
+
+ rm -rf build
+ mkdir build
+ cd build
+
+ cmake .. \
+ -DCMAKE_INSTALL_PREFIX=${install_path} \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DBUILD_SHARED_LIBS=OFF \
+ -DWITH_IPP=OFF \
+ -DBUILD_IPP_IW=OFF \
+ -DWITH_LAPACK=OFF \
+ -DWITH_EIGEN=OFF \
+ -DCMAKE_INSTALL_LIBDIR=lib64 \
+ -DWITH_ZLIB=ON \
+ -DBUILD_ZLIB=ON \
+ -DWITH_JPEG=ON \
+ -DBUILD_JPEG=ON \
+ -DWITH_PNG=ON \
+ -DBUILD_PNG=ON \
+ -DWITH_TIFF=ON \
+ -DBUILD_TIFF=ON \
+ -DWITH_FFMPEG=ON
+
+ make -j
+ make install
+ cd ../
+ echo "################### building opencv finished ###################"
+ fi
+fi
+
+
+if [ !-d "paddle_inference" ]; then
+ echo "################### download inference lib skipped ###################"
+else
+ echo "################### downloading inference lib ###################"
+ wget -nc https://paddle-inference-lib.bj.bcebos.com/2.1.1-gpu-cuda10.1-cudnn7-mkl-gcc8.2/paddle_inference.tgz
+ tar -xf paddle_inference.tgz
+ echo "################### downloading inference lib finished ###################"
+fi
+
+echo "################### building PaddleVideo demo ####################"
+if [ ${use_opencv} = "True" ]; then
+ OPENCV_DIR=$(pwd)/opencv-3.4.7/opencv3
+else
+ OPENCV_DIR=''
+fi
+
+LIB_DIR=$(pwd)/paddle_inference
+CUDA_LIB_DIR=$(dirname `find /usr -name libcudart.so`)
+CUDNN_LIB_DIR=$(dirname `find /usr -name libcudnn.so`)
+
+BUILD_DIR=build
+rm -rf ${BUILD_DIR}
+mkdir ${BUILD_DIR}
+cd ${BUILD_DIR}
+cmake .. \
+ -DPADDLE_LIB=${LIB_DIR} \
+ -DWITH_MKL=ON \
+ -DWITH_GPU=OFF \
+ -DWITH_STATIC_LIB=OFF \
+ -DWITH_TENSORRT=OFF \
+ -DOPENCV_DIR=${OPENCV_DIR} \
+ -DCUDNN_LIB=${CUDNN_LIB_DIR} \
+ -DCUDA_LIB=${CUDA_LIB_DIR} \
+ -DTENSORRT_DIR=${TENSORRT_DIR} \
+
+make -j
+cd ../../../
+echo "################### building PaddleVideo demo finished ###################"
+
+
+# set cuda device
+GPUID=$2
+if [ ${#GPUID} -le 0 ];then
+ env=" "
+else
+ env="export CUDA_VISIBLE_DEVICES=${GPUID}"
+fi
+set CUDA_VISIBLE_DEVICES
+eval $env
+
+
+echo "################### running test ###################"
+export Count=0
+IFS="|"
+infer_quant_flag=(${cpp_infer_is_quant})
+for infer_model in ${cpp_infer_model_dir_list[*]}; do
+ #run inference
+ is_quant=${infer_quant_flag[Count]}
+ func_cpp_inference "${inference_cmd}" "${infer_model}" "${LOG_PATH}" "${cpp_infer_img_dir}" ${is_quant}
+ Count=$(($Count + 1))
+done
diff --git a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh
new file mode 100644
index 0000000000000000000000000000000000000000..beb8c0666a018dc90294731b69e316d043ce3f9b
--- /dev/null
+++ b/test_tipc/test_train_inference_python.sh
@@ -0,0 +1,413 @@
+#!/bin/bash
+source test_tipc/common_func.sh
+
+FILENAME=$1
+# MODE must be one of ['lite_train_lite_infer', 'lite_train_whole_infer', 'whole_train_whole_infer', 'whole_infer', 'klquant_whole_infer']
+MODE=$2
+
+dataline=$(awk 'NR==1, NR==51{print}' $FILENAME)
+
+# parser params
+IFS=$'\n'
+lines=(${dataline})
+
+# The training params
+model_name=$(func_parser_value "${lines[1]}")
+python=$(func_parser_value "${lines[2]}")
+gpu_list=$(func_parser_value "${lines[3]}")
+train_use_gpu_key=$(func_parser_key "${lines[4]}")
+train_use_gpu_value=$(func_parser_value "${lines[4]}")
+autocast_list=$(func_parser_value "${lines[5]}")
+autocast_key=$(func_parser_key "${lines[5]}")
+epoch_key=$(func_parser_key "${lines[6]}")
+epoch_num=$(func_parser_value "${lines[6]}")
+save_model_key=$(func_parser_key "${lines[7]}")
+train_batch_key=$(func_parser_key "${lines[8]}")
+train_batch_value=$(func_parser_value "${lines[8]}")
+pretrain_model_key=$(func_parser_key "${lines[9]}")
+pretrain_model_value=$(func_parser_value "${lines[9]}")
+train_model_name=$(func_parser_value "${lines[10]}")
+train_param_key1=$(func_parser_key "${lines[12]}")
+train_param_value1=$(func_parser_value "${lines[12]}")
+train_param_key2=$(func_parser_key "${lines[11]}")
+train_param_value2=$(func_parser_value "${lines[11]}")
+
+trainer_list=$(func_parser_value "${lines[14]}")
+trainer_norm=$(func_parser_key "${lines[15]}")
+norm_trainer=$(func_parser_value "${lines[15]}")
+pact_key=$(func_parser_key "${lines[16]}")
+pact_trainer=$(func_parser_value "${lines[16]}")
+fpgm_key=$(func_parser_key "${lines[17]}")
+fpgm_trainer=$(func_parser_value "${lines[17]}")
+distill_key=$(func_parser_key "${lines[18]}")
+distill_trainer=$(func_parser_value "${lines[18]}")
+amp_key=$(func_parser_key "${lines[19]}")
+amp_trainer=$(func_parser_value "${lines[19]}")
+trainer_key2=$(func_parser_key "${lines[20]}")
+trainer_value2=$(func_parser_value "${lines[20]}")
+
+eval_py=$(func_parser_value "${lines[23]}")
+eval_key1=$(func_parser_key "${lines[24]}")
+eval_value1=$(func_parser_value "${lines[24]}")
+
+save_infer_key=$(func_parser_key "${lines[27]}")
+save_infer_value=$(func_parser_value "${lines[27]}")
+
+export_weight=$(func_parser_key "${lines[28]}")
+norm_export=$(func_parser_value "${lines[29]}")
+pact_export=$(func_parser_value "${lines[30]}")
+fpgm_export=$(func_parser_value "${lines[31]}")
+distill_export=$(func_parser_value "${lines[32]}")
+export_key1=$(func_parser_key "${lines[33]}")
+export_value1=$(func_parser_value "${lines[33]}")
+export_key2=$(func_parser_key "${lines[34]}")
+export_value2=$(func_parser_value "${lines[34]}")
+inference_dir=$(func_parser_value "${lines[35]}")
+
+# parser inference model
+infer_model_dir_list=$(func_parser_value "${lines[36]}")
+infer_export_list=$(func_parser_value "${lines[37]}")
+infer_is_quant=$(func_parser_value "${lines[38]}")
+# parser inference
+inference_py=$(func_parser_value "${lines[39]}")
+use_gpu_key=$(func_parser_key "${lines[40]}")
+use_gpu_list=$(func_parser_value "${lines[40]}")
+use_mkldnn_key=$(func_parser_key "${lines[41]}")
+use_mkldnn_list=$(func_parser_value "${lines[41]}")
+cpu_threads_key=$(func_parser_key "${lines[42]}")
+cpu_threads_list=$(func_parser_value "${lines[42]}")
+batch_size_key=$(func_parser_key "${lines[43]}")
+batch_size_list=$(func_parser_value "${lines[43]}")
+use_trt_key=$(func_parser_key "${lines[44]}")
+use_trt_list=$(func_parser_value "${lines[44]}")
+precision_key=$(func_parser_key "${lines[45]}")
+precision_list=$(func_parser_value "${lines[45]}")
+infer_model_key=$(func_parser_key "${lines[46]}")
+infer_model_value=$(func_parser_value "${lines[46]}")
+
+video_dir_key=$(func_parser_key "${lines[47]}")
+infer_video_dir=$(func_parser_value "${lines[47]}")
+save_log_key=$(func_parser_key "${lines[48]}")
+benchmark_key=$(func_parser_key "${lines[49]}")
+benchmark_value=$(func_parser_value "${lines[49]}")
+
+infer_key1=$(func_parser_key "${lines[50]}")
+infer_value1=$(func_parser_value "${lines[50]}")
+
+# parser klquant_infer
+if [ ${MODE} = "klquant_whole_infer" ]; then
+    dataline=$(awk 'NR==1, NR==17{print}' $FILENAME)
+ lines=(${dataline})
+ model_name=$(func_parser_value "${lines[1]}")
+ python=$(func_parser_value "${lines[2]}")
+ # parser inference model
+ infer_model_dir_list=$(func_parser_value "${lines[3]}")
+ infer_export_list=$(func_parser_value "${lines[4]}")
+ infer_is_quant=$(func_parser_value "${lines[5]}")
+ # parser inference
+ inference_py=$(func_parser_value "${lines[6]}")
+ use_gpu_key=$(func_parser_key "${lines[7]}")
+ use_gpu_list=$(func_parser_value "${lines[7]}")
+ use_mkldnn_key=$(func_parser_key "${lines[8]}")
+ use_mkldnn_list=$(func_parser_value "${lines[8]}")
+ cpu_threads_key=$(func_parser_key "${lines[9]}")
+ cpu_threads_list=$(func_parser_value "${lines[9]}")
+ batch_size_key=$(func_parser_key "${lines[10]}")
+ batch_size_list=$(func_parser_value "${lines[10]}")
+ use_trt_key=$(func_parser_key "${lines[11]}")
+ use_trt_list=$(func_parser_value "${lines[11]}")
+ precision_key=$(func_parser_key "${lines[12]}")
+ precision_list=$(func_parser_value "${lines[12]}")
+ infer_model_key=$(func_parser_key "${lines[13]}")
+ video_dir_key=$(func_parser_key "${lines[14]}")
+ infer_video_dir=$(func_parser_value "${lines[14]}")
+ save_log_key=$(func_parser_key "${lines[15]}")
+ benchmark_key=$(func_parser_key "${lines[16]}")
+ benchmark_value=$(func_parser_value "${lines[16]}")
+ infer_key1=$(func_parser_key "${lines[17]}")
+ infer_value1=$(func_parser_value "${lines[17]}")
+fi
+
+LOG_PATH="./test_tipc/output/${model_name}"
+mkdir -p ${LOG_PATH}
+status_log="${LOG_PATH}/results_python.log"
+
+
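+# Run Python inference for one exported model directory, sweeping the CPU
+# (mkldnn / threads / precision / batch_size) and GPU (TensorRT / precision / batch_size)
+# combinations parsed above; quantized models are only run with int8 precision.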
+function func_inference(){
+ IFS='|'
+ _python=$1
+ _script=$2
+ _model_dir=$3
+ _log_path=$4
+ _video_dir=$5
+ _flag_quant=$6
+ # inference
+ for use_gpu in ${use_gpu_list[*]}; do
+ if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then
+ for use_mkldnn in ${use_mkldnn_list[*]}; do
+ if [[ ${use_mkldnn} = "False" ]] && [[ ${_flag_quant} = "True" ]]; then
+ continue
+ fi
+ for threads in ${cpu_threads_list[*]}; do
+ for batch_size in ${batch_size_list[*]}; do
+ for precision in ${precision_list[*]}; do
+ if [[ ${use_mkldnn} = "False" ]] && [[ ${precision} = "fp16" ]]; then
+ continue
+                            fi # skip fp16 when mkldnn is disabled
+ if [[ ${_flag_quant} = "True" ]] && [[ ${precision} != "int8" ]]; then
+ continue
+                            fi # skip non-int8 precision when the model is quantized
+ set_precision=$(func_set_params "${precision_key}" "${precision}")
+
+ _save_log_path="${_log_path}/python_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log"
+ mkdir -p ${_log_path}
+ set_infer_data=$(func_set_params "${video_dir_key}" "${infer_video_dir}")
+ set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}")
+ set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}")
+ set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}")
+ set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}/${infer_model_value}")
+ set_infer_params1=$(func_set_params "${infer_key1}" "${_model_dir}/${infer_value1}")
+ command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_precision} ${set_infer_params1} > ${_save_log_path} 2>&1 "
+ eval $command
+ last_status=${PIPESTATUS[0]}
+ eval "cat ${_save_log_path}"
+ status_check $last_status "${command}" "${status_log}"
+ done
+ done
+ done
+ done
+ elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then
+ for use_trt in ${use_trt_list[*]}; do
+ for precision in ${precision_list[*]}; do
+ if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then
+ continue
+ fi
+ if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [[ ${use_trt} = "False" ]]; then
+ continue
+ fi
+ if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [[ ${_flag_quant} = "True" ]]; then
+ continue
+ fi
+ for batch_size in ${batch_size_list[*]}; do
+ _save_log_path="${_log_path}/python_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log"
+ set_infer_data=$(func_set_params "${video_dir_key}" "${infer_video_dir}")
+
+ set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}")
+ set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}")
+ set_tensorrt=$(func_set_params "${use_trt_key}" "${use_trt}")
+ set_precision=$(func_set_params "${precision_key}" "${precision}")
+ set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}/${infer_model_value}")
+ set_infer_params1=$(func_set_params "${infer_key1}" "${_model_dir}/${infer_value1}")
+ command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > ${_save_log_path} 2>&1 "
+
+ eval $command
+
+ last_status=${PIPESTATUS[0]}
+ eval "cat ${_save_log_path}"
+ status_check $last_status "${command}" "${status_log}"
+
+ done
+ done
+ done
+ else
+ echo "Does not support hardware other than CPU and GPU Currently!"
+ fi
+ done
+}
+
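+# whole_infer / klquant_whole_infer: export the listed models (if an export script is given)
+# and run inference only; all other modes run the full train -> eval -> export -> infer loop.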
+if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
+ GPUID=$3
+ if [ ${#GPUID} -le 0 ];then
+ env=" "
+ else
+ env="export CUDA_VISIBLE_DEVICES=${GPUID}"
+ fi
+ set CUDA_VISIBLE_DEVICES
+ eval $env
+ export Count=0
+ IFS="|"
+ infer_run_exports=(${infer_export_list})
+ infer_quant_flag=(${infer_is_quant})
+ for infer_model in ${infer_model_dir_list[*]}; do
+ # run export
+ if [ ${infer_run_exports[Count]} != "null" ];then
+ save_infer_dir=$(dirname $infer_model)
+ set_export_weight=$(func_set_params "${export_weight}" "${infer_model}")
+ set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}")
+ export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key}"
+ echo ${infer_run_exports[Count]}
+ eval $export_cmd
+ echo $export_cmd
+ status_export=$?
+ status_check $status_export "${export_cmd}" "${status_log}"
+ else
+ save_infer_dir=${infer_model}
+ fi
+ #run inference
+ is_quant=${infer_quant_flag[Count]}
+ if [ ${MODE} = "klquant_infer" ]; then
+ is_quant="True"
+ fi
+ func_inference "${python}" "${inference_py}" "${save_infer_dir}" "${LOG_PATH}" "${infer_video_dir}" ${is_quant}
+ Count=$(($Count + 1))
+ done
+else
+ IFS="|"
+ export Count=0
+ USE_GPU_KEY=(${train_use_gpu_value})
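+    # gpu_list entry formats: "-1" = CPU only, "0" = single GPU, "0,1" = multi-GPU on one
+    # machine, "ip1,ip2;0,1" = multi-machine (ips and gpu ids separated by ';')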
+ for gpu in ${gpu_list[*]}; do
+ train_use_gpu=${USE_GPU_KEY[Count]}
+ Count=$(($Count + 1))
+ ips=""
+ if [ ${gpu} = "-1" ];then
+ env=""
+ elif [ ${#gpu} -le 1 ];then
+ env="export CUDA_VISIBLE_DEVICES=${gpu}"
+ eval ${env}
+ elif [ ${#gpu} -le 15 ];then
+ IFS=","
+ array=(${gpu})
+ env="export CUDA_VISIBLE_DEVICES=${array[0]}"
+ IFS="|"
+ else
+ IFS=";"
+ array=(${gpu})
+ ips=${array[0]}
+ gpu=${array[1]}
+ IFS="|"
+ env=" "
+ fi
+ for autocast in ${autocast_list[*]}; do
+ if [ ${autocast} = "fp16" ]; then
+ set_amp_config="--amp"
+ else
+ set_amp_config=" "
+ fi
+ for trainer in ${trainer_list[*]}; do
+ flag_quant=False
+ if [ ${trainer} = ${pact_key} ]; then
+ run_train=${pact_trainer}
+ run_export=${pact_export}
+ flag_quant=True
+ elif [ ${trainer} = "${fpgm_key}" ]; then
+ run_train=${fpgm_trainer}
+ run_export=${fpgm_export}
+ elif [ ${trainer} = "${distill_key}" ]; then
+ run_train=${distill_trainer}
+ run_export=${distill_export}
+ elif [ ${trainer} = ${amp_key} ]; then
+ run_train=${amp_trainer}
+ run_export=${norm_export}
+ elif [[ ${trainer} = ${trainer_key2} ]]; then
+ run_train=${trainer_value2}
+ run_export=${export_value2}
+ else
+ run_train=${norm_trainer}
+ run_export=${norm_export}
+ fi
+
+ if [ ${run_train} = "null" ]; then
+ continue
+ fi
+ if [[ ${MODE} != "benchmark_train" ]] && [[ ! ${MODE} =~ "whole_train" ]]; then
+                    # append --max_iters=30 to the end of the training params so the short run executes and produces enough log output
+ run_train=${run_train}" --max_iters=30"
+ fi
+ set_autocast=$(func_set_params "${autocast_key}" "${autocast}")
+ set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}")
+
+ if [[ $MODE =~ "whole_train" ]]; then
+ set_epoch=""
+ fi
+
+ set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}")
+ if [[ $MODE =~ "whole_train" ]]; then
+ train_batch_key=""
+ train_batch_value=""
+ fi
+ set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}")
+ if [[ $MODE =~ "whole_train" ]]; then
+ train_param_key1=""
+ train_param_value1=""
+ fi
+ set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}")
+ if [[ $MODE =~ "whole_train" ]]; then
+ train_param_key2=""
+ train_param_value2=""
+ fi
+ set_train_params2=$(func_set_params "${train_param_key2}" "${train_param_value2}")
+ set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu}")
+ if [ ${#ips} -le 26 ];then
+ save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}"
+ nodes=1
+ else
+ IFS=","
+ ips_array=(${ips})
+ IFS="|"
+ nodes=${#ips_array[@]}
+ save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}"
+ fi
+
+ # load pretrain from norm training if current trainer is pact or fpgm trainer
+ if ([ ${trainer} = ${pact_key} ] || [ ${trainer} = ${fpgm_key} ]) && [ ${nodes} -le 1 ]; then
+ set_pretrain="${load_norm_train_model}"
+ fi
+
+ set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
+ if [ ${#gpu} -le 2 ];then # train with cpu or single gpu
+ cmd="${python} ${run_train} ${set_amp_config} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_train_params1} ${set_train_params2} "
+ elif [ ${#ips} -le 26 ];then # train with multi-gpu
+ cmd="${python} -B -m paddle.distributed.launch --gpus=\"${gpu}\" ${run_train} ${set_amp_config} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_train_params1} ${set_train_params2} "
+ else # train with multi-machine
+ cmd="${python} -B -m paddle.distributed.launch --ips=${ips} --gpus=\"${gpu}\" ${run_train} ${set_amp_config} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_batchsize} ${set_train_params1} ${set_train_params2} "
+ fi
+
+ # run train
+ eval "unset CUDA_VISIBLE_DEVICES"
+ eval $cmd
+ status_check $? "${cmd}" "${status_log}"
+
+ # set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${train_model_name}")
+ # save norm trained models to set pretrain for pact training and fpgm training
+            if [ ${trainer} = ${trainer_norm} ] && [ ${nodes} -le 1 ]; then
+ load_norm_train_model=${set_eval_pretrain}
+ fi
+ # run test
+ if [ ${eval_py} != "null" ]; then
+ real_model_name=${model_name/PP-/pp}
+ set_eval_params1=$(func_set_params "${eval_key1}" "${save_log}/${real_model_name}_epoch_00001.pdparams")
+ if [[ $MODE =~ "lite_infer" ]] && [[ ${train_param_key1} != "null" ]]; then
+ eval_cmd="${python} ${eval_py} ${set_use_gpu} ${set_eval_params1} ${train_param_key1}=${train_param_value1}"
+ else
+ eval_cmd="${python} ${eval_py} ${set_use_gpu} ${set_eval_params1}"
+ fi
+ eval $eval_cmd
+ status_check $? "${eval_cmd}" "${status_log}"
+ fi
+ # run export model
+ if [ ${run_export} != "null" ]; then
+ save_infer_path="${save_log}"
+ real_model_name=${model_name/PP-/pp}
+ set_export_weight=$(func_set_params "${export_weight}" "${save_log}/${real_model_name}_epoch_00001.pdparams")
+
+ set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_log}")
+ export_cmd="${python} ${run_export} ${set_export_weight} ${set_save_infer_key}"
+ eval $export_cmd
+ status_check $? "${export_cmd}" "${status_log}"
+
+ #run inference
+ eval $env
+ save_infer_path="${save_log}"
+ if [ ${inference_dir} != "null" ] && [ ${inference_dir} != '##' ]; then
+ infer_model_dir=${save_infer_path}
+ else
+ infer_model_dir=${save_infer_path}
+ fi
+ func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${flag_quant}"
+
+ eval "unset CUDA_VISIBLE_DEVICES"
+ fi
+ done # done with: for trainer in ${trainer_list[*]}; do
+ done # done with: for autocast in ${autocast_list[*]}; do
+ done # done with: for gpu in ${gpu_list[*]}; do
+fi # end if [ ${MODE} = "infer" ]; then
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6bd235ccafd4bd452a0f4ef7d92289a0461af0f
--- /dev/null
+++ b/tools/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import utils
+from .paddlevideo_clas import PaddleVideo
+from . import ava_predict
diff --git a/tools/ava_predict.py b/tools/ava_predict.py
new file mode 100644
index 0000000000000000000000000000000000000000..04e5e377395f2b75683e686a860de4aa0d711646
--- /dev/null
+++ b/tools/ava_predict.py
@@ -0,0 +1,508 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import paddle
+import os, sys
+import copy as cp
+import cv2
+import math
+try:
+ import ppdet
+except ImportError as e:
+ print(
+ f"{e}, [paddledet] package and it's dependencies is required for AVA.")
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
+
+from paddlevideo.modeling.builder import build_model
+from paddlevideo.utils import get_config
+from paddlevideo.loader.builder import build_dataloader, build_dataset, build_pipeline
+from paddlevideo.metrics.ava_utils import read_labelmap
+
+import time
+from os import path as osp
+import numpy as np
+import pickle
+
+from paddlevideo.utils import (get_logger, load, mkdir, save)
+import shutil
+
+FONTFACE = cv2.FONT_HERSHEY_DUPLEX
+FONTSCALE = 0.5
+FONTCOLOR = (255, 255, 255) # BGR, white
+MSGCOLOR = (128, 128, 128) # BGR, gray
+THICKNESS = 1
+LINETYPE = 1
+
+
+def hex2color(h):
+ """Convert the 6-digit hex string to tuple of 3 int value (RGB)"""
+ return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))
+
+
+plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'
+plate_blue = plate_blue.split('-')
+plate_blue = [hex2color(h) for h in plate_blue]
+plate_green = '004b23-006400-007200-008000-38b000-70e000'
+plate_green = plate_green.split('-')
+plate_green = [hex2color(h) for h in plate_green]
+
+
+def abbrev(name):
+ """Get the abbreviation of label name:
+ 'take (an object) from (a person)' -> 'take ... from ...'
+ """
+ while name.find('(') != -1:
+ st, ed = name.find('('), name.find(')')
+ name = name[:st] + '...' + name[ed + 1:]
+ return name
+
+
+# annotations are the predicted results
+def visualize(frames, annotations, plate=plate_blue, max_num=5):
+ """Visualize frames with predicted annotations.
+ Args:
+ frames (list[np.ndarray]): Frames for visualization, note that
+ len(frames) % len(annotations) should be 0.
+ annotations (list[list[tuple]]): The predicted results.
+        plate (list): The color plate used for visualization. Default: plate_blue.
+        max_num (int): Max number of labels to visualize for a person box.
+            Default: 5, currently must not exceed 5.
+ Returns:
+ list[np.ndarray]: Visualized frames.
+ """
+
+ assert max_num + 1 <= len(plate)
+ plate = [x[::-1] for x in plate]
+ frames_ = cp.deepcopy(frames)
+ nf, na = len(frames), len(annotations)
+ assert nf % na == 0
+ nfpa = len(frames) // len(annotations)
+ anno = None
+ h, w, _ = frames[0].shape
+    # proposals are normalized; scale them back to real pixel coordinates
+ scale_ratio = np.array([w, h, w, h])
+
+ for i in range(na):
+ anno = annotations[i]
+ if anno is None:
+ continue
+ for j in range(nfpa):
+ ind = i * nfpa + j
+ frame = frames_[ind]
+ for ann in anno:
+ box = ann[0]
+ label = ann[1]
+ if not len(label):
+ continue
+ score = ann[2]
+ box = (box * scale_ratio).astype(np.int64)
+ st, ed = tuple(box[:2]), tuple(box[2:])
+ cv2.rectangle(frame, st, ed, plate[0], 2)
+ for k, lb in enumerate(label):
+ if k >= max_num:
+ break
+ text = abbrev(lb)
+ text = ': '.join([text, str(score[k])])
+ location = (0 + st[0], 18 + k * 18 + st[1])
+ textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
+ THICKNESS)[0]
+ textwidth = textsize[0]
+ diag0 = (location[0] + textwidth, location[1] - 14)
+ diag1 = (location[0], location[1] + 2)
+ cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
+ cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
+ FONTCOLOR, THICKNESS, LINETYPE)
+
+ return frames_
+
+
+def frame_extraction(video_path, target_dir):
+ """Extract frames given video_path.
+ Args:
+ video_path (str): The video_path.
+ """
+
+ if not os.path.exists(target_dir):
+ os.makedirs(target_dir, exist_ok=True)
+
+ # Should be able to handle videos up to several hours
+ frame_tmpl = osp.join(target_dir, '{:05d}.jpg')
+ vid = cv2.VideoCapture(video_path)
+
+    FPS = int(vid.get(cv2.CAP_PROP_FPS))
+
+ frames = []
+ frame_paths = []
+
+ flag, frame = vid.read()
+ index = 1
+ while flag:
+ frames.append(frame)
+ frame_path = frame_tmpl.format(index)
+ frame_paths.append(frame_path)
+ cv2.imwrite(frame_path, frame)
+ index += 1
+ flag, frame = vid.read()
+ return frame_paths, frames, FPS
+
+
+def parse_args():
+ def str2bool(v):
+ return v.lower() in ("true", "t", "1")
+
+ # general params
+ parser = argparse.ArgumentParser("PaddleVideo Inference model script")
+ parser.add_argument('-c',
+ '--config',
+ type=str,
+ default='configs/example.yaml',
+ help='config file path')
+
+ parser.add_argument('--video_path', help='video file/url')
+
+ parser.add_argument('-o',
+ '--override',
+ action='append',
+ default=[],
+ help='config options to be overridden')
+ parser.add_argument('-w',
+ '--weights',
+ type=str,
+ help='weights for finetuning or testing')
+
+ #detection_model_name
+ parser.add_argument('--detection_model_name',
+ help='the name of detection model ')
+ # detection_model_weights
+ parser.add_argument('--detection_model_weights',
+ help='the weights path of detection model ')
+
+ # params for predict
+ parser.add_argument('--out-filename',
+ default='ava_det_demo.mp4',
+ help='output filename')
+ parser.add_argument('--predict-stepsize',
+ default=8,
+ type=int,
+ help='give out a prediction per n frames')
+ parser.add_argument(
+ '--output-stepsize',
+ default=4,
+ type=int,
+ help=('show one frame per n frames in the demo, we should have: '
+ 'predict_stepsize % output_stepsize == 0'))
+ parser.add_argument('--output-fps',
+ default=6,
+ type=int,
+ help='the fps of demo video output')
+
+ return parser.parse_args()
+
+
+# Results for a single frame, sorted by probability in descending order
+def pack_result(human_detection, result):
+ """Short summary.
+ Args:
+ human_detection (np.ndarray): Human detection result.
+ result (type): The predicted label of each human proposal.
+ Returns:
+ tuple: Tuple of human proposal, label name and label score.
+ """
+ results = []
+ if result is None:
+ return None
+
+ for prop, res in zip(human_detection, result):
+ res.sort(key=lambda x: -x[1])
+
+ results.append((prop, [x[0] for x in res], [x[1] for x in res]))
+
+ return results
+
+
+# Build the results dict required by the data processing pipeline
+def get_timestep_result(frame_dir, timestamp, clip_len, frame_interval, FPS):
+ result = {}
+
+ result["frame_dir"] = frame_dir
+
+ frame_num = len(os.listdir(frame_dir))
+
+ dir_name = frame_dir.split("/")[-1]
+ result["video_id"] = dir_name
+
+ result['timestamp'] = timestamp
+
+ timestamp_str = '{:04d}'.format(timestamp)
+ img_key = dir_name + "," + timestamp_str
+ result['img_key'] = img_key
+
+ result['shot_info'] = (1, frame_num)
+ result['fps'] = FPS
+
+ result['suffix'] = '{:05}.jpg'
+
+ result['timestamp_start'] = 1
+ result['timestamp_end'] = int(frame_num / result['fps'])
+
+ return result
+
+
+def detection_inference(frame_paths, output_dir, model_name, weights_path):
+ """Detect human boxes given frame paths.
+ Args:
+ frame_paths (list[str]): The paths of frames to do detection inference.
+ Returns:
+ list[np.ndarray]: The human detection results.
+ """
+
+ detection_cfg = ppdet.model_zoo.get_config_file(model_name)
+ detection_cfg = ppdet.core.workspace.load_config(detection_cfg)
+ detection_trainer = ppdet.engine.Trainer(detection_cfg, mode='test')
+ detection_trainer.load_weights(weights_path)
+
+ print('Performing Human Detection for each frame')
+
+ detection_trainer.predict(frame_paths, output_dir=output_dir, save_txt=True)
+
+ print("finish object detection")
+
+ results = []
+
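+    # predict() above was called with save_txt=True, so one txt file per frame
+    # (rows like "person <score> <x> <y> <w> <h>") is expected in output_dir;
+    # collect those paths here for get_detection_result to parse later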
+ for frame_path in frame_paths:
+ (file_dir, file_name) = os.path.split(frame_path)
+ (file_path, ext) = os.path.splitext(frame_path)
+
+ txt_file_name = file_name.replace(ext, ".txt")
+ txt_path = os.path.join(output_dir, txt_file_name)
+ results.append(txt_path)
+
+ return results
+
+
+def get_detection_result(txt_file_path, img_h, img_w, person_det_score_thr):
+ """
+    Parse a detection result file to get person bounding boxes (proposals) and confidences (scores).
+    txt_file_path: path of the detection result file
+    img_h: image height
+    img_w: image width
+ """
+
+ proposals = []
+ scores = []
+
+ with open(txt_file_path, 'r') as detection_file:
+ lines = detection_file.readlines()
+ for line in lines: # person 0.9842637181282043 0.0 469.1407470703125 944.7770385742188 831.806396484375
+ items = line.split(" ")
+            if items[0] != 'person':  # keep only person detections
+ continue
+
+ score = items[1]
+
+            if float(score) < person_det_score_thr:
+                continue
+
+            x1 = float(items[2]) / img_w
+            y1 = float(items[3]) / img_h
+            box_w = float(items[4])
+            box_h = float(items[5])
+
+ x2 = (float(items[2]) + box_w) / img_w
+ y2 = (float(items[3]) + box_h) / img_h
+
+ scores.append(score)
+
+ proposals.append([x1, y1, x2, y2])
+
+ return np.array(proposals), np.array(scores)
+
+
+@paddle.no_grad()
+def main(args):
+ config = get_config(args.config, show=False) #parse config file
+
+ # extract frames from video
+ video_path = args.video_path
+ frame_dir = 'tmp_frames'
+ frame_paths, frames, FPS = frame_extraction(video_path, frame_dir)
+
+    num_frame = len(frame_paths)  # = video duration in seconds * FPS
+ assert num_frame != 0
+ print("Frame Number:", num_frame)
+
+    # height and width of the frames
+ h, w, _ = frames[0].shape
+
+ # Get clip_len, frame_interval and calculate center index of each clip
+    data_process_pipeline = build_pipeline(config.PIPELINE.test)  # test-time data processing pipeline config
+
+ clip_len = config.PIPELINE.test.sample['clip_len']
+ assert clip_len % 2 == 0, 'We would like to have an even clip_len'
+ frame_interval = config.PIPELINE.test.sample['frame_interval']
+
+    # one key frame is sampled per second here
+ window_size = clip_len * frame_interval
+ timestamps = np.arange(window_size // 2, (num_frame + 1 - window_size // 2),
+ args.predict_stepsize)
+ print("timetamps number:", len(timestamps))
+
+ # get selected frame list according to timestamps
+ selected_frame_list = []
+ for timestamp in timestamps:
+ selected_frame_list.append(frame_paths[timestamp - 1])
+
+ # Load label_map
+ label_map_path = config.DATASET.test['label_file']
+ categories, class_whitelist = read_labelmap(open(label_map_path))
+ label_map = {}
+ for item in categories:
+ id = item['id']
+ name = item['name']
+ label_map[id] = name
+
+ # Construct model.
+ if config.MODEL.backbone.get('pretrained'):
+ config.MODEL.backbone.pretrained = '' # disable pretrain model init
+ model = build_model(config.MODEL)
+
+ model.eval()
+ state_dicts = load(args.weights)
+ model.set_state_dict(state_dicts)
+
+ detection_result_dir = 'tmp_detection'
+ detection_model_name = args.detection_model_name
+ detection_model_weights = args.detection_model_weights
+ detection_txt_list = detection_inference(selected_frame_list,
+ detection_result_dir,
+ detection_model_name,
+ detection_model_weights)
+ assert len(detection_txt_list) == len(timestamps)
+
+ print('Performing SpatioTemporal Action Detection for each clip')
+ human_detections = []
+ predictions = []
+
+ index = 0
+ for timestamp, detection_txt_path in zip(timestamps, detection_txt_list):
+ proposals, scores = get_detection_result(
+ detection_txt_path, h, w,
+ (float)(config.DATASET.test['person_det_score_thr']))
+ if proposals.shape[0] == 0:
+ predictions.append(None)
+ human_detections.append(None)
+ continue
+
+ human_detections.append(proposals)
+
+ result = get_timestep_result(frame_dir,
+ timestamp,
+ clip_len,
+ frame_interval,
+ FPS=FPS)
+ result["proposals"] = proposals
+ result["scores"] = scores
+
+ new_result = data_process_pipeline(result)
+ proposals = new_result['proposals']
+
+ img_slow = new_result['imgs'][0]
+ img_slow = img_slow[np.newaxis, :]
+ img_fast = new_result['imgs'][1]
+ img_fast = img_fast[np.newaxis, :]
+
+ proposals = proposals[np.newaxis, :]
+
+ scores = scores[np.newaxis, :]
+
+ img_shape = np.asarray(new_result['img_shape'])
+ img_shape = img_shape[np.newaxis, :]
+
+ data = [
+ paddle.to_tensor(img_slow, dtype='float32'),
+ paddle.to_tensor(img_fast, dtype='float32'),
+ paddle.to_tensor(proposals, dtype='float32'), scores,
+ paddle.to_tensor(img_shape, dtype='int32')
+ ]
+
+ with paddle.no_grad():
+ result = model(data, mode='infer')
+
+ result = result[0]
+ prediction = []
+
+ person_num = proposals.shape[1]
+ # N proposals
+ for i in range(person_num):
+ prediction.append([])
+
+ # Perform action score thr
+ for i in range(len(result)):
+ if i + 1 not in class_whitelist:
+ continue
+ for j in range(person_num):
+ if result[i][j, 4] > config.MODEL.head['action_thr']:
+ prediction[j].append((label_map[i + 1], result[i][j,
+ 4]))
+ predictions.append(prediction)
+
+ index = index + 1
+ if index % 10 == 0:
+ print(index, "/", len(timestamps))
+
+ results = []
+ for human_detection, prediction in zip(human_detections, predictions):
+ results.append(pack_result(human_detection, prediction))
+
+ def dense_timestamps(timestamps, n):
+ """Make it nx frames."""
+ old_frame_interval = (timestamps[1] - timestamps[0])
+ start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
+ new_frame_inds = np.arange(
+ len(timestamps) * n) * old_frame_interval / n + start
+        return new_frame_inds.astype(np.int64)
+
+ dense_n = int(args.predict_stepsize / args.output_stepsize) #30
+ frames = [
+ cv2.imread(frame_paths[i - 1])
+ for i in dense_timestamps(timestamps, dense_n)
+ ]
+
+ vis_frames = visualize(frames, results)
+
+ try:
+ import moviepy.editor as mpy
+ except ImportError:
+ raise ImportError('Please install moviepy to enable output file')
+
+ vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
+ fps=args.output_fps)
+ vid.write_videofile(args.out_filename)
+ print("finish write !")
+
+ # delete tmp files and dirs
+ shutil.rmtree(frame_dir)
+ shutil.rmtree(detection_result_dir)
+
+
+if __name__ == '__main__':
+    args = parse_args()  # parse arguments
+ main(args)
diff --git a/tools/export_model.py b/tools/export_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cdbaaa00f78747f67d8d8300463a87be4021c1f
--- /dev/null
+++ b/tools/export_model.py
@@ -0,0 +1,222 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import os.path as osp
+import sys
+
+import paddle
+from paddle.jit import to_static
+from paddle.static import InputSpec
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
+
+from paddlevideo.modeling.builder import build_model
+from paddlevideo.utils import get_config
+
+
+def parse_args():
+ parser = argparse.ArgumentParser("PaddleVideo export model script")
+ parser.add_argument('-c',
+ '--config',
+ type=str,
+ default='configs/example.yaml',
+ help='config file path')
+ parser.add_argument("-p",
+ "--pretrained_params",
+ default='./best.pdparams',
+ type=str,
+ help='params path')
+ parser.add_argument("-o",
+ "--output_path",
+ type=str,
+ default="./inference",
+ help='output path')
+
+ parser.add_argument('--save_name',
+ type=str,
+ default=None,
+ help='specify the exported inference \
+ files(pdiparams and pdmodel) name,\
+ only used in TIPC')
+
+ return parser.parse_args()
+
+
+def trim_config(cfg):
+ """
+    Reusing the training config brings in useless attributes, such as backbone.pretrained,
+    and some build-phase attributes should be overridden, such as backbone.num_seg.
+    Trim them here.
+ """
+ model_name = cfg.model_name
+ if cfg.MODEL.get('backbone') and cfg.MODEL.backbone.get('pretrained'):
+ cfg.MODEL.backbone.pretrained = "" # not ued when inference
+
+ return cfg, model_name
+
+
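+# Each branch below declares the static InputSpec shapes passed to paddle.jit.to_static
+# for the corresponding model family; None marks a dimension (usually the batch) that is
+# kept dynamic in the exported inference model.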
+def get_input_spec(cfg, model_name):
+ if model_name in ['ppTSM', 'TSM', 'MoViNet']:
+ input_spec = [[
+ InputSpec(
+ shape=[None, cfg.num_seg, 3, cfg.target_size, cfg.target_size],
+ dtype='float32'),
+ ]]
+ elif model_name in ['TSN', 'ppTSN']:
+ input_spec = [[
+ InputSpec(shape=[
+ None, cfg.num_seg * 10, 3, cfg.target_size, cfg.target_size
+ ],
+ dtype='float32'),
+ ]]
+ elif model_name in ['BMN']:
+ input_spec = [[
+ InputSpec(shape=[None, cfg.feat_dim, cfg.tscale],
+ dtype='float32',
+ name='feat_input'),
+ ]]
+ elif model_name in ['TimeSformer', 'ppTimeSformer']:
+ input_spec = [[
+ InputSpec(shape=[
+ None, 3, cfg.num_seg * 3, cfg.target_size, cfg.target_size
+ ],
+ dtype='float32'),
+ ]]
+ elif model_name in ['VideoSwin']:
+ input_spec = [[
+ InputSpec(shape=[
+ None, 3, cfg.num_seg * cfg.seg_len * 1, cfg.target_size,
+ cfg.target_size
+ ],
+ dtype='float32'),
+ ]]
+ elif model_name in ['VideoSwin_TableTennis']:
+ input_spec = [[
+ InputSpec(shape=[
+ None, 3, cfg.num_seg * cfg.seg_len * 3, cfg.target_size,
+ cfg.target_size
+ ],
+ dtype='float32'),
+ ]]
+ elif model_name in ['AttentionLSTM']:
+ input_spec = [[
+ InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]],
+ dtype='float32'), # for rgb_data
+ InputSpec(shape=[
+ None,
+ ], dtype='int64'), # for rgb_len
+ InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]],
+ dtype='float32'), # for rgb_mask
+ InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]],
+ dtype='float32'), # for audio_data
+ InputSpec(shape=[
+ None,
+ ], dtype='int64'), # for audio_len
+ InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]],
+ dtype='float32'), # for audio_mask
+ ]]
+ elif model_name in ['SlowFast']:
+ input_spec = [[
+ InputSpec(shape=[
+ None, 3, cfg.num_frames // cfg.alpha, cfg.target_size,
+ cfg.target_size
+ ],
+ dtype='float32',
+ name='slow_input'),
+ InputSpec(shape=[
+ None, 3, cfg.num_frames, cfg.target_size, cfg.target_size
+ ],
+ dtype='float32',
+ name='fast_input'),
+ ]]
+ elif model_name in ['STGCN', 'AGCN', 'CTRGCN']:
+ input_spec = [[
+ InputSpec(shape=[
+ None, cfg.num_channels, cfg.window_size, cfg.vertex_nums,
+ cfg.person_nums
+ ],
+ dtype='float32'),
+ ]]
+ elif model_name in ['TransNetV2']:
+ input_spec = [[
+ InputSpec(shape=[
+ None,
+ cfg.num_frames,
+ cfg.height,
+ cfg.width,
+ cfg.num_channels,
+ ],
+ dtype='float32'),
+ ]]
+ elif model_name in ['MSTCN', 'ASRF']:
+ input_spec = [[
+ InputSpec(shape=[None, cfg.num_channels, None], dtype='float32'),
+ ]]
+ elif model_name in ['ADDS']:
+ input_spec = [[
+ InputSpec(shape=[None, cfg.num_channels, cfg.height, cfg.width],
+ dtype='float32'),
+ ]]
+ elif model_name in ['AVA_SlowFast_FastRcnn']:
+ input_spec = [[
+ InputSpec(shape=[
+ None, 3, cfg.num_frames // cfg.alpha, cfg.target_size,
+ cfg.target_size
+ ],
+ dtype='float32',
+ name='slow_input'),
+ InputSpec(shape=[
+ None, 3, cfg.num_frames, cfg.target_size, cfg.target_size
+ ],
+ dtype='float32',
+ name='fast_input'),
+ InputSpec(shape=[None, None, 4], dtype='float32', name='proposals'),
+ InputSpec(shape=[None, 2], dtype='float32', name='img_shape')
+ ]]
+ return input_spec
+
+
+def main():
+ args = parse_args()
+ cfg, model_name = trim_config(get_config(args.config, show=False))
+ print(f"Building model({model_name})...")
+ model = build_model(cfg.MODEL)
+ assert osp.isfile(
+ args.pretrained_params
+ ), f"pretrained params ({args.pretrained_params} is not a file path.)"
+
+ if not os.path.isdir(args.output_path):
+ os.makedirs(args.output_path)
+
+ print(f"Loading params from ({args.pretrained_params})...")
+ params = paddle.load(args.pretrained_params)
+ model.set_dict(params)
+
+ model.eval()
+
+ input_spec = get_input_spec(cfg.INFERENCE, model_name)
+ model = to_static(model, input_spec=input_spec)
+ paddle.jit.save(
+ model,
+ osp.join(args.output_path,
+ model_name if args.save_name is None else args.save_name))
+ print(
+ f"model ({model_name}) has been already saved in ({args.output_path}).")
+
+
+if __name__ == "__main__":
+ main()
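+
+# Example usage (illustrative; the same command is used by the TIPC scripts above):
+#   python tools/export_model.py \
+#       -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \
+#       -p data/ppTSM_k400_uniform.pdparams \
+#       -o ./inference/ppTSM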
diff --git a/tools/paddlevideo_clas.py b/tools/paddlevideo_clas.py
new file mode 100644
index 0000000000000000000000000000000000000000..4843e62f8b8e31ad6d6678cb9b1bb24c2e23e718
--- /dev/null
+++ b/tools/paddlevideo_clas.py
@@ -0,0 +1,333 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+
+__dir__ = os.path.dirname(__file__)
+sys.path.append(os.path.join(__dir__, ''))
+
+
+import numpy as np
+import tarfile
+import requests
+from tqdm import tqdm
+from tools import utils
+import shutil
+
+from paddle.inference import Config
+from paddle.inference import create_predictor
+
+__all__ = ['PaddleVideo']
+BASE_DIR = os.path.expanduser("~/.paddlevideo_inference/")
+BASE_INFERENCE_MODEL_DIR = os.path.join(BASE_DIR, 'inference_model')
+BASE_VIDEOS_DIR = os.path.join(BASE_DIR, 'videos')
+
+model_names = {'ppTSM','TSM','TSN'}
+
+
+def create_paddle_predictor(args):
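+    """Build a paddle.inference predictor from the parsed arguments.
+
+    A minimal usage sketch (file paths are illustrative):
+        args = parse_args(mMain=False, add_help=False)
+        args.model_file = 'inference.pdmodel'
+        args.params_file = 'inference.pdiparams'
+        predictor = create_paddle_predictor(args)
+    """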
+ config = Config(args.model_file, args.params_file)
+
+ if args.use_gpu:
+ config.enable_use_gpu(args.gpu_mem, 0)
+ else:
+ config.disable_gpu()
+ if args.enable_mkldnn:
+ # cache 10 different shapes for mkldnn to avoid memory leak
+ config.set_mkldnn_cache_capacity(10)
+ config.enable_mkldnn()
+
+ config.disable_glog_info()
+ config.switch_ir_optim(args.ir_optim) # default true
+ if args.use_tensorrt:
+ config.enable_tensorrt_engine(
+ precision_mode=Config.Precision.Half
+ if args.use_fp16 else Config.Precision.Float32,
+ max_batch_size=args.batch_size)
+
+ config.enable_memory_optim()
+ # use zero copy
+ config.switch_use_feed_fetch_ops(False)
+ predictor = create_predictor(config)
+
+ return predictor
+
+def download_with_progressbar(url, save_path):
+ response = requests.get(url, stream=True)
+ total_size_in_bytes = int(response.headers.get('content-length', 0))
+ block_size = 1024 # 1 Kibibyte
+ progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
+ with open(save_path, 'wb') as file:
+ for data in response.iter_content(block_size):
+ progress_bar.update(len(data))
+ file.write(data)
+ progress_bar.close()
+ if total_size_in_bytes == 0 or progress_bar.n != total_size_in_bytes:
+ raise Exception("Something went wrong while downloading models")
+
+def maybe_download(model_storage_directory, url):
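+    """Download and unpack an inference model tar into model_storage_directory
+    if inference.pdmodel / inference.pdiparams are not already cached there.
+    Only the inference.pdmodel / inference.pdiparams(.info) members are extracted,
+    and the temporary tar file is removed afterwards.
+    """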
+ # using custom model
+ tar_file_name_list = [
+ 'inference.pdiparams', 'inference.pdiparams.info', 'inference.pdmodel'
+ ]
+ if not os.path.exists(
+ os.path.join(model_storage_directory, 'inference.pdiparams')
+ ) or not os.path.exists(
+ os.path.join(model_storage_directory, 'inference.pdmodel')):
+ tmp_path = os.path.join(model_storage_directory, url.split('/')[-1])
+ print('download {} to {}'.format(url, tmp_path))
+ os.makedirs(model_storage_directory, exist_ok=True)
+ download_with_progressbar(url, tmp_path) #download
+
+ #save to directory
+ with tarfile.open(tmp_path, 'r') as tarObj:
+ for member in tarObj.getmembers():
+ filename = None
+ for tar_file_name in tar_file_name_list:
+ if tar_file_name in member.name:
+ filename = tar_file_name
+ if filename is None:
+ continue
+ file = tarObj.extractfile(member)
+ with open(
+ os.path.join(model_storage_directory, filename),
+ 'wb') as f:
+ f.write(file.read())
+ os.remove(tmp_path)
+
+def load_label_name_dict(path):
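+    """Load a {class_id: label_name} dict from a label list file.
+
+    Each line is expected to look like '<id> <name>', e.g. '0 abseiling' in the
+    Kinetics-400 label list; an empty dict is returned if the path is missing or
+    a line is malformed.
+    """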
+ result = {}
+ if not os.path.exists(path):
+ print(
+            'Warning: if you want to use your own label_dict, please provide a valid path!\nOtherwise label_names will be empty!'
+ )
+ else:
+ for line in open(path, 'r'):
+ partition = line.split('\n')[0].partition(' ')
+ try:
+ result[int(partition[0])] = str(partition[-1])
+            except ValueError:
+ result = {}
+ break
+ return result
+
+def parse_args(mMain=True, add_help=True):
+ import argparse
+
+ def str2bool(v):
+ return v.lower() in ("true", "t", "1")
+
+    if mMain:
+
+ # general params
+ parser = argparse.ArgumentParser(add_help=add_help)
+ parser.add_argument("--model_name", type=str,default='')
+ parser.add_argument("-v", "--video_file", type=str,default='')
+ parser.add_argument("--use_gpu", type=str2bool, default=True)
+
+ # params for decode and sample
+ parser.add_argument("--num_seg", type=int, default=8)
+ parser.add_argument("--seg_len", type=int, default=1)
+
+ # params for preprocess
+ parser.add_argument("--short_size", type=int, default=256)
+ parser.add_argument("--target_size", type=int, default=224)
+ parser.add_argument("--normalize", type=str2bool, default=True)
+
+ # params for predict
+ parser.add_argument("--model_file", type=str,default='')
+ parser.add_argument("--params_file", type=str)
+ parser.add_argument("-b", "--batch_size", type=int, default=1)
+ parser.add_argument("--use_fp16", type=str2bool, default=False)
+ parser.add_argument("--ir_optim", type=str2bool, default=True)
+ parser.add_argument("--use_tensorrt", type=str2bool, default=False)
+ parser.add_argument("--gpu_mem", type=int, default=8000)
+ parser.add_argument("--top_k", type=int, default=1)
+ parser.add_argument("--enable_mkldnn", type=bool, default=False)
+ parser.add_argument("--label_name_path",type=str,default='')
+
+ return parser.parse_args()
+
+ else:
+ return argparse.Namespace(
+ model_name='',
+ video_file='',
+ use_gpu=False,
+ num_seg=8,
+ seg_len=1,
+ short_size=256,
+ target_size=224,
+ normalize=True,
+ model_file='',
+ params_file='',
+ batch_size=1,
+ use_fp16=False,
+ ir_optim=True,
+ use_tensorrt=False,
+ gpu_mem=8000,
+ top_k=1,
+ enable_mkldnn=False,
+ label_name_path='')
+
+def get_video_list(video_file):
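+    """Resolve video_file into a list of .mp4/.avi paths.
+
+    A single video file becomes a one-element list; a directory is scanned
+    (non-recursively) for videos, and an exception is raised if none are found.
+    """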
+ videos_lists = []
+ if video_file is None or not os.path.exists(video_file):
+ raise Exception("not found any video file in {}".format(video_file))
+
+ video_end = ['mp4','avi']
+ if os.path.isfile(video_file) and video_file.split('.')[-1] in video_end:
+ videos_lists.append(video_file)
+ elif os.path.isdir(video_file):
+ for single_file in os.listdir(video_file):
+ if single_file.split('.')[-1] in video_end:
+ videos_lists.append(os.path.join(video_file, single_file))
+ if len(videos_lists) == 0:
+ raise Exception("not found any video file in {}".format(video_file))
+ return videos_lists
+
+class PaddleVideo(object):
+ print('Inference models that Paddle provides are listed as follows:\n\n{}'.
+ format(model_names), '\n')
+
+ def __init__(self, **kwargs):
+ process_params = parse_args(mMain=False,add_help=False)
+ process_params.__dict__.update(**kwargs)
+
+ if not os.path.exists(process_params.model_file):
+            if not process_params.model_name:
+ raise Exception(
+ 'Please input model name that you want to use!')
+ if process_params.model_name in model_names:
+ url = 'https://videotag.bj.bcebos.com/PaddleVideo/InferenceModel/{}_infer.tar'.format(process_params.model_name)
+ if not os.path.exists(
+ os.path.join(BASE_INFERENCE_MODEL_DIR,
+ process_params.model_name)):
+ os.makedirs(
+ os.path.join(BASE_INFERENCE_MODEL_DIR,
+ process_params.model_name))
+ #create pretrained model download_path
+ download_path = os.path.join(BASE_INFERENCE_MODEL_DIR,
+ process_params.model_name)
+ maybe_download(model_storage_directory=download_path, url=url)
+ process_params.model_file = os.path.join(download_path,
+ 'inference.pdmodel')
+ process_params.params_file = os.path.join(
+ download_path, 'inference.pdiparams')
+ process_params.label_name_path = os.path.join(
+ __dir__, '../data/k400/Kinetics-400_label_list.txt')
+ else:
+ raise Exception(
+                    'If you want to use your own model, please pass its path via model_file!'
+ )
+ else:
+ print('Using user-specified model and params!')
+ print("process params are as follows: \n{}".format(process_params))
+ self.label_name_dict = load_label_name_dict(
+ process_params.label_name_path)
+
+ self.args = process_params
+ self.predictor = create_paddle_predictor(process_params)
+
+ def predict(self,video):
+ """
+ predict label of video with paddlevideo_clas
+ Args:
+ video:input video for clas, support single video , internet url, folder path containing series of videos
+ Returns:
+ list[dict:{videoname: "",class_ids: [], scores: [], label_names: []}],if label name path is None,label names will be empty
+ """
+ video_list = []
+ assert isinstance(video, (str, np.ndarray))
+
+ input_names = self.predictor.get_input_names()
+ input_tensor = self.predictor.get_input_handle(input_names[0])
+
+ output_names = self.predictor.get_output_names()
+ output_tensor = self.predictor.get_output_handle(output_names[0])
+
+ if isinstance(video, str):
+            # download video from the internet
+ if video.startswith('http'):
+ if not os.path.exists(BASE_VIDEOS_DIR):
+ os.makedirs(BASE_VIDEOS_DIR)
+ video_path = os.path.join(BASE_VIDEOS_DIR, 'tmp.mp4')
+ download_with_progressbar(video, video_path)
+ print("Current using video from Internet:{}, renamed as: {}".
+ format(video, video_path))
+ video = video_path
+ video_list = get_video_list(video)
+ else:
+ if isinstance(video, np.ndarray):
+ video_list = [video]
+ else:
+                print('Please provide a valid video input!')
+
+ total_result = []
+ for filename in video_list:
+ if isinstance(filename, str):
+ v = utils.decode(filename, self.args)
+ assert v is not None, "Error in loading video: {}".format(
+ filename)
+ inputs = utils.preprocess(v, self.args)
+                inputs = np.expand_dims(inputs, axis=0).repeat(1, axis=0).copy()
+ else:
+ inputs = filename
+
+ input_tensor.copy_from_cpu(inputs)
+
+ self.predictor.run()
+
+ outputs = output_tensor.copy_to_cpu()
+ classes, scores = utils.postprocess(outputs, self.args)
+ label_names = []
+ if len(self.label_name_dict) != 0:
+ label_names = [self.label_name_dict[c] for c in classes]
+ result = {
+ "videoname": filename if isinstance(filename, str) else 'video',
+ "class_ids": classes.tolist(),
+ "scores": scores.tolist(),
+ "label_names": label_names,
+ }
+ total_result.append(result)
+ return total_result
+
+def main():
+ # for cmd
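+    # Example usage (file paths are illustrative):
+    #   python tools/paddlevideo_clas.py --model_name='ppTSM' -v data/example.avi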
+ args = parse_args(mMain=True)
+ clas_engine = PaddleVideo(**(args.__dict__))
+ print('{}{}{}'.format('*' * 10, args.video_file, '*' * 10))
+ result = clas_engine.predict(args.video_file)
+ if result is not None:
+ print(result)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/predict.py b/tools/predict.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ab30fd025608bfe68422be34dd262cc6509aebe
--- /dev/null
+++ b/tools/predict.py
@@ -0,0 +1,275 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+from os import path as osp
+import paddle
+from paddle import inference
+from paddle.inference import Config, create_predictor
+
+from utils import build_inference_helper
+from paddlevideo.utils import get_config
+
+
+def parse_args():
+ def str2bool(v):
+ return v.lower() in ("true", "t", "1")
+
+ # general params
+ parser = argparse.ArgumentParser("PaddleVideo Inference model script")
+ parser.add_argument('-c',
+ '--config',
+ type=str,
+ default='configs/example.yaml',
+ help='config file path')
+ parser.add_argument("-i", "--input_file", type=str, help="input file path")
+ parser.add_argument("--model_file", type=str)
+ parser.add_argument("--params_file", type=str)
+
+ # params for paddle predict
+ parser.add_argument("-b", "--batch_size", type=int, default=1)
+ parser.add_argument("--use_gpu", type=str2bool, default=True)
+ parser.add_argument("--precision", type=str, default="fp32")
+ parser.add_argument("--ir_optim", type=str2bool, default=True)
+ parser.add_argument("--use_tensorrt", type=str2bool, default=False)
+ parser.add_argument("--gpu_mem", type=int, default=8000)
+ parser.add_argument("--enable_benchmark", type=str2bool, default=False)
+ parser.add_argument("--enable_mkldnn", type=str2bool, default=False)
+ parser.add_argument("--cpu_threads", type=int, default=None)
+ # parser.add_argument("--hubserving", type=str2bool, default=False) #TODO
+
+ return parser.parse_args()
+
+
+def create_paddle_predictor(args, cfg):
+ config = Config(args.model_file, args.params_file)
+ if args.use_gpu:
+ config.enable_use_gpu(args.gpu_mem, 0)
+ else:
+ config.disable_gpu()
+ if args.cpu_threads:
+ config.set_cpu_math_library_num_threads(args.cpu_threads)
+ if args.enable_mkldnn:
+ # cache 10 different shapes for mkldnn to avoid memory leak
+ config.set_mkldnn_cache_capacity(10)
+ config.enable_mkldnn()
+ if args.precision == "fp16":
+ config.enable_mkldnn_bfloat16()
+
+ # config.disable_glog_info()
+ config.switch_ir_optim(args.ir_optim) # default true
+ if args.use_tensorrt:
+ # choose precision
+ if args.precision == "fp16":
+ precision = inference.PrecisionType.Half
+ elif args.precision == "int8":
+ precision = inference.PrecisionType.Int8
+ else:
+ precision = inference.PrecisionType.Float32
+
+        # calculate the real max batch size during inference when TensorRT is enabled
+ max_batch_size = args.batch_size
+ if 'num_seg' in cfg.INFERENCE:
+ # num_seg: number of segments when extracting frames.
+ # seg_len: number of frames extracted within a segment, default to 1.
+ # num_views: the number of video frame groups obtained by cropping and flipping,
+ # uniformcrop=3, tencrop=10, centercrop=1.
+ num_seg = cfg.INFERENCE.num_seg
+ seg_len = cfg.INFERENCE.get('seg_len', 1)
+ num_views = 1
+ if 'tsm' in cfg.model_name.lower():
+ num_views = 1 # CenterCrop
+ elif 'tsn' in cfg.model_name.lower():
+ num_views = 10 # TenCrop
+ elif 'timesformer' in cfg.model_name.lower():
+ num_views = 3 # UniformCrop
+ elif 'videoswin' in cfg.model_name.lower():
+ num_views = 3 # UniformCrop
+ max_batch_size = args.batch_size * num_views * num_seg * seg_len
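+            # e.g. ppTSN with batch_size=1 and num_seg=25:
+            # 1 * 10 (TenCrop) * 25 (num_seg) * 1 (seg_len) = 250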
+ config.enable_tensorrt_engine(precision_mode=precision,
+ max_batch_size=max_batch_size)
+
+ config.enable_memory_optim()
+ # use zero copy
+ config.switch_use_feed_fetch_ops(False)
+
+ # for ST-GCN tensorRT case usage
+ # config.delete_pass("shuffle_channel_detect_pass")
+
+ predictor = create_predictor(config)
+
+ return config, predictor
+
+
+def parse_file_paths(input_path: str) -> list:
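+    """Collect the video file(s) to be processed.
+
+    A single file path is wrapped into a one-element list; a directory is scanned
+    (non-recursively) for .avi/.mp4 files, e.g. 'data/example.avi' -> ['data/example.avi'].
+    """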
+ if osp.isfile(input_path):
+ files = [
+ input_path,
+ ]
+ else:
+ files = os.listdir(input_path)
+ files = [
+ file for file in files
+ if (file.endswith(".avi") or file.endswith(".mp4"))
+ ]
+ files = [osp.join(input_path, file) for file in files]
+ return files
+
+
+def main():
+ """predict using paddle inference model
+ """
+ args = parse_args()
+ cfg = get_config(args.config, show=False)
+
+ model_name = cfg.model_name
+ print(f"Inference model({model_name})...")
+ InferenceHelper = build_inference_helper(cfg.INFERENCE)
+
+ inference_config, predictor = create_paddle_predictor(args, cfg)
+
+ # get input_tensor and output_tensor
+ input_names = predictor.get_input_names()
+ output_names = predictor.get_output_names()
+ input_tensor_list = []
+ output_tensor_list = []
+ for item in input_names:
+ input_tensor_list.append(predictor.get_input_handle(item))
+ for item in output_names:
+ output_tensor_list.append(predictor.get_output_handle(item))
+
+ # get the absolute file path(s) to be processed
+ if model_name in ["MSTCN", "ASRF"]:
+ files = InferenceHelper.get_process_file(args.input_file)
+ else:
+ files = parse_file_paths(args.input_file)
+
+ if model_name == 'TransNetV2':
+ for file in files:
+ inputs = InferenceHelper.preprocess(file)
+ outputs = []
+ for input in inputs:
+ # Run inference
+ for i in range(len(input_tensor_list)):
+ input_tensor_list[i].copy_from_cpu(input)
+ predictor.run()
+ output = []
+ for j in range(len(output_tensor_list)):
+ output.append(output_tensor_list[j].copy_to_cpu())
+ outputs.append(output)
+
+ # Post process output
+ InferenceHelper.postprocess(outputs)
+
+ elif model_name == 'AVA_SlowFast_FastRcnn':
+ for file in files: # for videos
+ inputs = InferenceHelper.preprocess(file)
+ outputs = []
+ for input in inputs:
+ # Run inference
+ input_len = len(input_tensor_list)
+
+ for i in range(input_len):
+ if type(input[i]) == paddle.Tensor:
+ input_tmp = input[i].numpy()
+ else:
+ input_tmp = input[i]
+ input_tensor_list[i].copy_from_cpu(input_tmp)
+ predictor.run()
+ output = []
+ for j in range(len(output_tensor_list)):
+ output.append(output_tensor_list[j].copy_to_cpu())
+ outputs.append(output)
+
+ # Post process output
+ InferenceHelper.postprocess(outputs)
+ else:
+ if args.enable_benchmark:
+ test_video_num = 12
+ num_warmup = 3
+
+ # instantiate auto log
+ try:
+ import auto_log
+ except ImportError as e:
+ print(f"{e}, [git+https://github.com/LDOUBLEV/AutoLog] "
+ f"package and it's dependencies is required for "
+ f"python-inference when enable_benchmark=True.")
+ pid = os.getpid()
+ autolog = auto_log.AutoLogger(model_name=cfg.model_name,
+ model_precision=args.precision,
+ batch_size=args.batch_size,
+ data_shape="dynamic",
+ save_path="./output/auto_log.lpg",
+ inference_config=inference_config,
+ pids=pid,
+ process_name=None,
+ gpu_ids=0 if args.use_gpu else None,
+ time_keys=[
+ 'preprocess_time',
+ 'inference_time',
+ 'postprocess_time'
+ ],
+ warmup=num_warmup)
+ files = [
+ args.input_file for _ in range(test_video_num + num_warmup)
+ ]
+
+ # Inferencing process
+ batch_num = args.batch_size
+ for st_idx in range(0, len(files), batch_num):
+ ed_idx = min(st_idx + batch_num, len(files))
+
+ # auto log start
+ if args.enable_benchmark:
+ autolog.times.start()
+
+ # Pre process batched input
+ batched_inputs = InferenceHelper.preprocess_batch(
+ files[st_idx:ed_idx])
+
+ # get pre process time cost
+ if args.enable_benchmark:
+ autolog.times.stamp()
+
+ # run inference
+ for i in range(len(input_tensor_list)):
+ input_tensor_list[i].copy_from_cpu(batched_inputs[i])
+ predictor.run()
+
+ batched_outputs = []
+ for j in range(len(output_tensor_list)):
+ batched_outputs.append(output_tensor_list[j].copy_to_cpu())
+
+ # get inference process time cost
+ if args.enable_benchmark:
+ autolog.times.stamp()
+
+ InferenceHelper.postprocess(batched_outputs,
+ not args.enable_benchmark)
+
+ # get post process time cost
+ if args.enable_benchmark:
+ autolog.times.end(stamp=True)
+
+ # time.sleep(0.01) # sleep for T4 GPU
+
+ # report benchmark log if enabled
+ if args.enable_benchmark:
+ autolog.report()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/summary.py b/tools/summary.py
new file mode 100644
index 0000000000000000000000000000000000000000..28bd6f7ebb945acfdb9be3c8c5a9ce62b34156c9
--- /dev/null
+++ b/tools/summary.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import sys
+import os.path as osp
+
+import paddle
+import paddle.nn.functional as F
+from paddle.jit import to_static
+import paddleslim
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
+
+from paddlevideo.modeling.builder import build_model
+from paddlevideo.utils import get_config
+
+
+def parse_args():
+
+ parser = argparse.ArgumentParser("PaddleVideo Summary")
+ parser.add_argument('-c',
+ '--config',
+ type=str,
+ default='configs/example.yaml',
+ help='config file path')
+
+ parser.add_argument("--img_size", type=int, default=224)
+ parser.add_argument("--num_seg", type=int, default=8)
+ parser.add_argument("--FLOPs",
+ action="store_true",
+ help="whether to print FLOPs")
+
+ return parser.parse_args()
+
+
+def _trim(cfg, args):
+ """
+    Reusing the training config brings useless attributes, such as backbone.pretrained. Trim them here.
+ """
+ model_name = cfg.model_name
+ cfg = cfg.MODEL
+ cfg.backbone.pretrained = ""
+
+ if 'num_seg' in cfg.backbone:
+ cfg.backbone.num_seg = args.num_seg
+ return cfg, model_name
+
+
+def main():
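+    # Example usage (config path is illustrative):
+    #   python tools/summary.py -c configs/recognition/pptsm/pptsm.yaml --FLOPs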
+ args = parse_args()
+ cfg, model_name = _trim(get_config(args.config, show=False), args)
+ print(f"Building model({model_name})...")
+ model = build_model(cfg)
+
+ img_size = args.img_size
+ num_seg = args.num_seg
+    # NOTE: only TSM is supported now, will be refined soon
+ params_info = paddle.summary(model, (1, 1, num_seg, 3, img_size, img_size))
+ print(params_info)
+
+ if args.FLOPs:
+ flops_info = paddleslim.analysis.flops(
+ model, [1, 1, num_seg, 3, img_size, img_size])
+ print(flops_info)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/utils.py b/tools/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b59cb96a4e690258a0227388139d68b187cfd2b6
--- /dev/null
+++ b/tools/utils.py
@@ -0,0 +1,1473 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import shutil
+import sys
+from typing import List
+
+import cv2
+try:
+ import imageio
+except ImportError as e:
+ print(
+ f"{e}, [imageio] package and it's dependencies is required for VideoSwin."
+ )
+try:
+ import matplotlib as mpl
+ import matplotlib.cm as cm
+except ImportError as e:
+ print(
+ f"{e}, [matplotlib] package and it's dependencies is required for ADDS."
+ )
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+import pandas
+from PIL import Image
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
+from abc import abstractmethod
+
+from paddlevideo.loader.builder import build_pipeline
+from paddlevideo.loader.pipelines import (
+ AutoPadding, CenterCrop, DecodeSampler, FeatureDecoder, FrameDecoder,
+ GroupResize, Image2Array, ImageDecoder, JitterScale, MultiCrop,
+ Normalization, PackOutput, Sampler, SamplerPkl, Scale, SkeletonNorm,
+ TenCrop, ToArray, UniformCrop, VideoDecoder, SegmentationSampler,
+ SketeonCropSample)
+from paddlevideo.metrics.ava_utils import read_labelmap
+from paddlevideo.metrics.bmn_metric import boundary_choose, soft_nms
+from paddlevideo.utils import Registry, build, get_config
+from paddlevideo.modeling.framework.segmenters.utils import ASRFPostProcessing
+
+from ava_predict import (detection_inference, frame_extraction,
+ get_detection_result, get_timestep_result, pack_result,
+ visualize)
+
+INFERENCE = Registry('inference')
+
+
+def decode(filepath, args):
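+    """Decode a video with OpenCV and uniformly sample num_seg * seg_len frames.
+
+    One group of seg_len consecutive frames is taken from (roughly) the middle of
+    each of the num_seg equal-length segments; a list of RGB PIL Images is returned.
+    """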
+ num_seg = args.num_seg
+ seg_len = args.seg_len
+
+ cap = cv2.VideoCapture(filepath)
+ videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+ sampledFrames = []
+ for i in range(videolen):
+ ret, frame = cap.read()
+ # maybe first frame is empty
+        if not ret:
+ continue
+ img = frame[:, :, ::-1]
+ sampledFrames.append(img)
+ average_dur = int(len(sampledFrames) / num_seg)
+ imgs = []
+ for i in range(num_seg):
+ idx = 0
+ if average_dur >= seg_len:
+ idx = (average_dur - 1) // 2
+ idx += i * average_dur
+ elif average_dur >= 1:
+ idx += i * average_dur
+ else:
+ idx = i
+
+ for jj in range(idx, idx + seg_len):
+ imgbuf = sampledFrames[int(jj % len(sampledFrames))]
+ img = Image.fromarray(imgbuf, mode='RGB')
+ imgs.append(img)
+
+ return imgs
+
+
+def preprocess(img, args):
+ img = {"imgs": img}
+ resize_op = Scale(short_size=args.short_size)
+ img = resize_op(img)
+ ccrop_op = CenterCrop(target_size=args.target_size)
+ img = ccrop_op(img)
+ to_array = Image2Array()
+ img = to_array(img)
+ if args.normalize:
+ img_mean = [0.485, 0.456, 0.406]
+ img_std = [0.229, 0.224, 0.225]
+ normalize_op = Normalization(mean=img_mean, std=img_std)
+ img = normalize_op(img)
+ return img['imgs']
+
+
+def postprocess(output, args):
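+    """Apply softmax to the flattened logits and return the top_k class ids and
+    their scores, both sorted by descending score.
+    e.g. with args.top_k=1 a single class id and its probability are returned.
+    """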
+ output = output.flatten()
+ output = F.softmax(paddle.to_tensor(output)).numpy()
+ classes = np.argpartition(output, -args.top_k)[-args.top_k:]
+ classes = classes[np.argsort(-output[classes])]
+ scores = output[classes]
+ return classes, scores
+
+
+def build_inference_helper(cfg):
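+    """Instantiate the inference helper registered under cfg.name.
+
+    Roughly, build() looks up cfg.name (e.g. ppTSM_Inference_helper) in the
+    INFERENCE registry and passes the remaining config fields as keyword arguments.
+    """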
+ return build(cfg, INFERENCE)
+
+
+class Base_Inference_helper():
+ def __init__(self,
+ num_seg=8,
+ seg_len=1,
+ short_size=256,
+ target_size=224,
+ top_k=1):
+ """Base_Inference_helper
+
+ Args:
+            num_seg (int, optional): number of segments the input video is sliced into. Defaults to 8.
+            seg_len (int, optional): number of frames taken from each segment. Defaults to 1.
+            short_size (int, optional): short side of the resized video. Defaults to 256.
+            target_size (int, optional): size of the cropped video. Defaults to 224.
+            top_k (int, optional): number of top results selected from the outputs. Defaults to 1.
+ """
+ self.num_seg = num_seg
+ self.seg_len = seg_len
+ self.short_size = short_size
+ self.target_size = target_size
+ self.top_k = top_k
+
+ @abstractmethod
+ def preprocess(self, input_file: str):
+ """preprocess abstractmethod
+
+ Args:
+ input_file (str): input file path.
+ """
+ pass
+
+ def preprocess_batch(self, file_list: List[str]) -> List[np.ndarray]:
+ """preprocess for file list
+
+ Args:
+            file_list (List[str]): file paths in a list, [path1, path2, ...].
+
+ Returns:
+ List[np.ndarray]: batched inputs data, [data_batch[0], data_batch[1], ...].
+ """
+ batched_inputs = []
+ for file in file_list:
+ inputs = self.preprocess(file)
+ batched_inputs.append(inputs)
+ batched_inputs = [
+ np.concatenate([item[i] for item in batched_inputs])
+ for i in range(len(batched_inputs[0]))
+ ]
+ self.input_file = file_list
+ return batched_inputs
+
+ def postprocess(self,
+ output: np.ndarray,
+ print_output: bool = True) -> None:
+ """postprocess
+
+ Args:
+ output (np.ndarray): batched output scores, shape of (batch_size, class_num).
+ print_output (bool, optional): whether to print result. Defaults to True.
+ """
+ if not isinstance(self.input_file, list):
+ self.input_file = [
+ self.input_file,
+ ]
+ output = output[0] # [B, num_cls]
+ N = len(self.input_file)
+ if output.shape[0] != N:
+ output = output.reshape([N] + [output.shape[0] // N] +
+ list(output.shape[1:])) # [N, T, C]
+ output = output.mean(axis=1) # [N, C]
+ output = F.softmax(paddle.to_tensor(output), axis=-1).numpy()
+ for i in range(N):
+ classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]
+ classes = classes[np.argsort(-output[i, classes])]
+ scores = output[i, classes]
+ if print_output:
+ print("Current video file: {0}".format(self.input_file[i]))
+ for j in range(self.top_k):
+ print("\ttop-{0} class: {1}".format(j + 1, classes[j]))
+ print("\ttop-{0} score: {1}".format(j + 1, scores[j]))
+
+
+@INFERENCE.register()
+class ppTSM_Inference_helper(Base_Inference_helper):
+ def __init__(self,
+ num_seg=8,
+ seg_len=1,
+ short_size=256,
+ target_size=224,
+ top_k=1):
+ self.num_seg = num_seg
+ self.seg_len = seg_len
+ self.short_size = short_size
+ self.target_size = target_size
+ self.top_k = top_k
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, file path
+ return: list
+ """
+        assert os.path.isfile(input_file), "{0} does not exist".format(
+            input_file)
+ results = {'filename': input_file}
+ img_mean = [0.485, 0.456, 0.406]
+ img_std = [0.229, 0.224, 0.225]
+ ops = [
+ VideoDecoder(),
+ Sampler(self.num_seg, self.seg_len, valid_mode=True),
+ Scale(self.short_size),
+ CenterCrop(self.target_size),
+ Image2Array(),
+ Normalization(img_mean, img_std)
+ ]
+ for op in ops:
+ results = op(results)
+
+ res = np.expand_dims(results['imgs'], axis=0).copy()
+ return [res]
+
+
+@INFERENCE.register()
+class ppTSN_Inference_helper(Base_Inference_helper):
+ def __init__(self,
+ num_seg=25,
+ seg_len=1,
+ short_size=256,
+ target_size=224,
+ top_k=1):
+ self.num_seg = num_seg
+ self.seg_len = seg_len
+ self.short_size = short_size
+ self.target_size = target_size
+ self.top_k = top_k
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, file path
+ return: list
+ """
+        assert os.path.isfile(input_file), "{0} does not exist".format(
+            input_file)
+ results = {'filename': input_file}
+ img_mean = [0.485, 0.456, 0.406]
+ img_std = [0.229, 0.224, 0.225]
+ ops = [
+ VideoDecoder(),
+ Sampler(self.num_seg,
+ self.seg_len,
+ valid_mode=True,
+ select_left=True),
+ Scale(self.short_size,
+ fixed_ratio=True,
+ do_round=True,
+ backend='cv2'),
+ TenCrop(self.target_size),
+ Image2Array(),
+ Normalization(img_mean, img_std)
+ ]
+ for op in ops:
+ results = op(results)
+
+ res = np.expand_dims(results['imgs'], axis=0).copy()
+ return [res]
+
+
+@INFERENCE.register()
+class BMN_Inference_helper(Base_Inference_helper):
+ def __init__(self, feat_dim, dscale, tscale, result_path):
+ self.feat_dim = feat_dim
+ self.dscale = dscale
+ self.tscale = tscale
+ self.result_path = result_path
+ if not os.path.isdir(self.result_path):
+ os.makedirs(self.result_path)
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, file path
+ return: list
+ """
+        assert os.path.isfile(input_file), "{0} does not exist".format(
+            input_file)
+ file_info = json.load(open(input_file))
+ self.feat_path = file_info['feat_path']
+ self.video_duration = file_info['duration_second']
+ feat = np.load(self.feat_path).astype('float32').T
+ res = np.expand_dims(feat, axis=0).copy()
+
+ return [res]
+
+ def postprocess(self, outputs, print_output=True):
+ """
+ output: list
+ """
+ pred_bm, pred_start, pred_end = outputs
+ self._gen_props(pred_bm, pred_start[0], pred_end[0], print_output)
+
+ def _gen_props(self, pred_bm, pred_start, pred_end, print_output):
+ snippet_xmins = [1.0 / self.tscale * i for i in range(self.tscale)]
+ snippet_xmaxs = [
+ 1.0 / self.tscale * i for i in range(1, self.tscale + 1)
+ ]
+
+ pred_bm = pred_bm[0, 0, :, :] * pred_bm[0, 1, :, :]
+ start_mask = boundary_choose(pred_start)
+ start_mask[0] = 1.
+ end_mask = boundary_choose(pred_end)
+ end_mask[-1] = 1.
+ score_vector_list = []
+ for idx in range(self.dscale):
+ for jdx in range(self.tscale):
+ start_index = jdx
+ end_index = start_index + idx
+ if end_index < self.tscale and start_mask[
+ start_index] == 1 and end_mask[end_index] == 1:
+ xmin = snippet_xmins[start_index]
+ xmax = snippet_xmaxs[end_index]
+ xmin_score = pred_start[start_index]
+ xmax_score = pred_end[end_index]
+ bm_score = pred_bm[idx, jdx]
+ conf_score = xmin_score * xmax_score * bm_score
+ score_vector_list.append([xmin, xmax, conf_score])
+
+ cols = ["xmin", "xmax", "score"]
+ score_vector_list = np.stack(score_vector_list)
+ df = pandas.DataFrame(score_vector_list, columns=cols)
+
+ result_dict = {}
+ proposal_list = []
+ df = soft_nms(df, alpha=0.4, t1=0.55, t2=0.9)
+ for idx in range(min(100, len(df))):
+ tmp_prop={"score":df.score.values[idx], \
+ "segment":[max(0,df.xmin.values[idx])*self.video_duration, \
+ min(1,df.xmax.values[idx])*self.video_duration]}
+ proposal_list.append(tmp_prop)
+
+ result_dict[self.feat_path] = proposal_list
+
+ # print top-5 predictions
+ if print_output:
+ print("Current video file: {0} :".format(self.feat_path))
+ for pred in proposal_list[:5]:
+ print(pred)
+
+ # save result
+        with open(
+                os.path.join(self.result_path, "bmn_results_inference.json"),
+                "w") as outfile:
+            json.dump(result_dict, outfile)
+
+
+@INFERENCE.register()
+class TimeSformer_Inference_helper(Base_Inference_helper):
+ def __init__(self,
+ num_seg=8,
+ seg_len=1,
+ short_size=224,
+ target_size=224,
+ top_k=1,
+ mean=[0.45, 0.45, 0.45],
+ std=[0.225, 0.225, 0.225]):
+ self.num_seg = num_seg
+ self.seg_len = seg_len
+ self.short_size = short_size
+ self.target_size = target_size
+ self.top_k = top_k
+ self.mean = mean
+ self.std = std
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, file path
+ return: list
+ """
+        assert os.path.isfile(input_file), "{0} does not exist".format(
+            input_file)
+ results = {'filename': input_file}
+ ops = [
+ VideoDecoder(backend='pyav', mode='test', num_seg=self.num_seg),
+ Sampler(self.num_seg,
+ self.seg_len,
+ valid_mode=True,
+ linspace_sample=True),
+ Normalization(self.mean, self.std, tensor_shape=[1, 1, 1, 3]),
+ Image2Array(data_format='cthw'),
+ JitterScale(self.short_size, self.short_size),
+ UniformCrop(self.target_size)
+ ]
+ for op in ops:
+ results = op(results)
+
+ # [N,C,Tx3,H,W]
+ res = np.expand_dims(results['imgs'], axis=0).copy()
+ return [res]
+
+
+@INFERENCE.register()
+class VideoSwin_Inference_helper(Base_Inference_helper):
+ def __init__(self,
+ num_seg=4,
+ seg_len=32,
+ frame_interval=2,
+ short_size=224,
+ target_size=224,
+ top_k=1,
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375]):
+
+ self.num_seg = num_seg
+ self.seg_len = seg_len
+ self.frame_interval = frame_interval
+ self.short_size = short_size
+ self.target_size = target_size
+ self.top_k = top_k
+ self.mean = mean
+ self.std = std
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, file path
+ return: list
+ """
+ self.input_file = input_file
+        assert os.path.isfile(input_file), "{0} does not exist".format(
+            input_file)
+ results = {'filename': input_file}
+ ops = [
+ VideoDecoder(backend='decord', mode='valid'),
+ Sampler(num_seg=self.num_seg,
+ frame_interval=self.frame_interval,
+ seg_len=self.seg_len,
+ valid_mode=True,
+ use_pil=False),
+ Scale(short_size=self.short_size,
+ fixed_ratio=False,
+ keep_ratio=True,
+ backend='cv2',
+ do_round=True),
+ CenterCrop(target_size=224, backend='cv2'),
+ Normalization(mean=self.mean,
+ std=self.std,
+ tensor_shape=[3, 1, 1, 1],
+ inplace=True),
+ Image2Array(data_format='cthw')
+ ]
+ for op in ops:
+ results = op(results)
+
+ res = np.expand_dims(results['imgs'], axis=0).copy()
+ return [res]
+
+ def postprocess(self, output, print_output=True):
+ """
+ output: list
+ """
+ if not isinstance(self.input_file, list):
+ self.input_file = [
+ self.input_file,
+ ]
+ output = output[0] # [B, num_cls]
+ N = len(self.input_file)
+ if output.shape[0] != N:
+ output = output.reshape([N] + [output.shape[0] // N] +
+ list(output.shape[1:])) # [N, T, C]
+ output = output.mean(axis=1) # [N, C]
+ for i in range(N):
+ classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]
+ classes = classes[np.argsort(-output[i, classes])]
+ scores = output[i, classes]
+ if print_output:
+ print("Current video file: {0}".format(self.input_file[i]))
+ for j in range(self.top_k):
+ print("\ttop-{0} class: {1}".format(j + 1, classes[j]))
+ print("\ttop-{0} score: {1}".format(j + 1, scores[j]))
+
+
+@INFERENCE.register()
+class VideoSwin_TableTennis_Inference_helper(Base_Inference_helper):
+ def __init__(self,
+ num_seg=1,
+ seg_len=32,
+ short_size=256,
+ target_size=224,
+ top_k=1):
+ self.num_seg = num_seg
+ self.seg_len = seg_len
+ self.short_size = short_size
+ self.target_size = target_size
+ self.top_k = top_k
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, file path
+ return: list
+ """
+        assert os.path.isfile(input_file), "{0} does not exist".format(
+            input_file)
+ results = {'frame_dir': input_file, 'suffix': 'img_{:05}.jpg'}
+ img_mean = [123.675, 116.28, 103.53]
+ img_std = [58.395, 57.12, 57.375]
+ ops = [
+ FrameDecoder(),
+ SamplerPkl(num_seg=self.num_seg,
+ seg_len=self.seg_len,
+ backend='cv2',
+ valid_mode=True),
+ Scale(short_size=self.short_size,
+ fixed_ratio=False,
+ keep_ratio=True,
+ backend='cv2',
+ do_round=True),
+ UniformCrop(target_size=self.target_size, backend='cv2'),
+ Normalization(mean=img_mean,
+ std=img_std,
+ tensor_shape=[3, 1, 1, 1],
+ inplace=True),
+ Image2Array(data_format='cthw')
+ ]
+ for op in ops:
+ results = op(results)
+
+ res = np.expand_dims(results['imgs'], axis=0).copy()
+ return [res]
+
+ def add_text_to_video(
+ self,
+ video_path,
+ output_dir="applications/TableTennis/ActionRecognition/results",
+ text=None):
+ os.makedirs(output_dir, exist_ok=True)
+ if video_path.endswith('.pkl'):
+ try:
+ import cPickle as pickle
+ from cStringIO import StringIO
+ except ImportError:
+ import pickle
+ from io import BytesIO
+ from PIL import Image
+ data_loaded = pickle.load(open(video_path, 'rb'), encoding='bytes')
+ _, _, frames = data_loaded
+ frames_len = len(frames)
+
+ else:
+ videoCapture = cv2.VideoCapture()
+ videoCapture.open(video_path)
+
+ fps = videoCapture.get(cv2.CAP_PROP_FPS)
+ frame_width = int(videoCapture.get(cv2.CAP_PROP_FRAME_WIDTH))
+ frame_height = int(videoCapture.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+ frames_len = videoCapture.get(cv2.CAP_PROP_FRAME_COUNT)
+ print("fps=", int(fps), "frames=", int(frames_len), "scale=",
+ f"{frame_height}x{frame_width}")
+
+ frames_rgb_list = []
+ for i in range(int(frames_len)):
+ if video_path.endswith('.pkl'):
+ frame = np.array(
+ Image.open(BytesIO(frames[i])).convert("RGB").resize(
+ (240, 135)))[:, :, ::-1].astype('uint8')
+ else:
+ _, frame = videoCapture.read()
+ frame = cv2.putText(frame, text, (30, 30), cv2.FONT_HERSHEY_COMPLEX,
+ 1.0, (0, 0, 255), 2)
+ frames_rgb_list.append(frame[:, :, ::-1]) # bgr to rgb
+ if not video_path.endswith('.pkl'):
+ videoCapture.release()
+ cv2.destroyAllWindows()
+ output_filename = os.path.basename(video_path)
+ output_filename = output_filename.split('.')[0] + '.gif'
+ imageio.mimsave(f'{output_dir}/{output_filename}',
+ frames_rgb_list,
+ 'GIF',
+ duration=0.00085)
+
+ def postprocess(self, output, print_output=True, save_gif=True):
+ """
+ output: list
+ """
+ if not isinstance(self.input_file, list):
+ self.input_file = [
+ self.input_file,
+ ]
+ output = output[0] # [B, num_cls]
+ N = len(self.input_file)
+ if output.shape[0] != N:
+ output = output.reshape([N] + [output.shape[0] // N] +
+ list(output.shape[1:])) # [N, T, C]
+ output = output.mean(axis=1) # [N, C]
+ for i in range(N):
+ classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]
+ classes = classes[np.argsort(-output[i, classes])]
+ scores = output[i, classes]
+ if print_output:
+ print("Current video file: {0}".format(self.input_file[i]))
+ for j in range(self.top_k):
+ print("\ttop-{0} class: {1}".format(j + 1, classes[j]))
+ print("\ttop-{0} score: {1}".format(j + 1, scores[j]))
+ if save_gif:
+ self.add_text_to_video(
+ self.input_file[0],
+ text=f"{str(classes[0])} {float(scores[0]):.5f}")
+
+
+@INFERENCE.register()
+class SlowFast_Inference_helper(Base_Inference_helper):
+ def __init__(self,
+ num_frames=32,
+ sampling_rate=2,
+ target_size=256,
+ alpha=8,
+ top_k=1):
+ self.num_frames = num_frames
+ self.sampling_rate = sampling_rate
+ self.target_size = target_size
+ self.alpha = alpha
+ self.top_k = top_k
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, file path
+ return: list
+ """
+        assert os.path.isfile(input_file), "{0} does not exist".format(
+            input_file)
+ results = {
+ 'filename': input_file,
+ 'temporal_sample_index': 0,
+ 'spatial_sample_index': 0,
+ 'temporal_num_clips': 1,
+ 'spatial_num_clips': 1
+ }
+ img_mean = [0.45, 0.45, 0.45]
+ img_std = [0.225, 0.225, 0.225]
+ ops = [
+ DecodeSampler(self.num_frames, self.sampling_rate, test_mode=True),
+ JitterScale(self.target_size, self.target_size),
+ MultiCrop(self.target_size),
+ Image2Array(transpose=False),
+ Normalization(img_mean, img_std, tensor_shape=[1, 1, 1, 3]),
+ PackOutput(self.alpha),
+ ]
+ for op in ops:
+ results = op(results)
+
+ res = []
+ for item in results['imgs']:
+ res.append(np.expand_dims(item, axis=0).copy())
+ return res
+
+ def postprocess(self, output, print_output=True):
+ """
+ output: list
+ """
+ if not isinstance(self.input_file, list):
+ self.input_file = [
+ self.input_file,
+ ]
+ output = output[0] # [B, num_cls]
+
+ N = len(self.input_file)
+ if output.shape[0] != N:
+ output = output.reshape([N] + [output.shape[0] // N] +
+ list(output.shape[1:])) # [N, T, C]
+ output = output.mean(axis=1) # [N, C]
+ # output = F.softmax(paddle.to_tensor(output), axis=-1).numpy() # done in it's head
+ for i in range(N):
+ classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]
+ classes = classes[np.argsort(-output[i, classes])]
+ scores = output[i, classes]
+ if print_output:
+ print("Current video file: {0}".format(self.input_file[i]))
+ for j in range(self.top_k):
+ print("\ttop-{0} class: {1}".format(j + 1, classes[j]))
+ print("\ttop-{0} score: {1}".format(j + 1, scores[j]))
+
+
+@INFERENCE.register()
+class STGCN_Inference_helper(Base_Inference_helper):
+ def __init__(self,
+ num_channels,
+ window_size,
+ vertex_nums,
+ person_nums,
+ top_k=1):
+ self.num_channels = num_channels
+ self.window_size = window_size
+ self.vertex_nums = vertex_nums
+ self.person_nums = person_nums
+ self.top_k = top_k
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, file path
+ return: list
+ """
+        assert os.path.isfile(input_file), "{0} does not exist".format(
+            input_file)
+ data = np.load(input_file)
+ results = {'data': data}
+ ops = [AutoPadding(window_size=self.window_size), SkeletonNorm()]
+ for op in ops:
+ results = op(results)
+
+ res = np.expand_dims(results['data'], axis=0).copy()
+ return [res]
+
+
+@INFERENCE.register()
+class CTRGCN_Inference_helper(Base_Inference_helper):
+ def __init__(self,
+ num_channels=3,
+ vertex_nums=25,
+ person_nums=2,
+ window_size=64,
+ p_interval=[0.95],
+ top_k=1):
+ self.window_size = window_size
+ self.p_interval = p_interval
+ self.num_channels = num_channels
+ self.vertex_nums = vertex_nums
+ self.person_nums = person_nums
+ self.top_k = top_k
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, file path
+ return: list
+ """
+        assert os.path.isfile(input_file), "{0} does not exist".format(
+            input_file)
+ data = np.load(input_file)
+ results = {'data': data}
+ ops = [
+ SketeonCropSample(window_size=self.window_size,
+ p_interval=self.p_interval)
+ ]
+ for op in ops:
+ results = op(results)
+
+ res = np.expand_dims(results['data'], axis=0).copy()
+ return [res]
+
+
+@INFERENCE.register()
+class MSTCN_Inference_helper(Base_Inference_helper):
+ def __init__(self, num_channels, actions_map_file_path, feature_path=None):
+ self.num_channels = num_channels
+ file_ptr = open(actions_map_file_path, 'r')
+ actions = file_ptr.read().split('\n')[:-1]
+ file_ptr.close()
+ self.actions_dict = dict()
+ for a in actions:
+ self.actions_dict[a.split()[1]] = int(a.split()[0])
+
+ self.feature_path = feature_path
+ self.file_name_list = []
+
+ def get_process_file(self, input_file_txt):
+ with open(input_file_txt, 'r') as file_ptr:
+ info = file_ptr.read().split('\n')[:-1]
+
+ files = []
+ for video_name in info:
+ if self.feature_path is not None:
+ file_name = video_name.split('.')[0] + ".npy"
+ input_file = os.path.join(self.feature_path, file_name)
+ else:
+ input_file = video_name
+
+            assert os.path.isfile(
+                input_file), "{0} does not exist".format(input_file)
+ files.append(input_file)
+
+ self.file_name_list.append(input_file.split('/')[-1].split('.')[0])
+ return files
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, feature file list txt path
+ return: list
+ """
+ output_list = []
+
+ data = np.load(input_file)
+ results = {'video_feat': data, 'video_gt': None}
+ ops = []
+ for op in ops:
+ results = op(results)
+
+ res = np.expand_dims(results['video_feat'], axis=0).copy()
+ output_list.append(res)
+ return output_list
+
+ def postprocess(self, output, print_output=True):
+        result_path = os.path.join("./inference/infer_results/")
+        if not os.path.isdir(result_path):
+            os.makedirs(result_path)
+ output = [output]
+ for outputs in output:
+ output_np = outputs[0]
+ recognition = []
+ for i in range(output_np.shape[0]):
+ recognition = np.concatenate((recognition, [
+ list(self.actions_dict.keys())[list(
+ self.actions_dict.values()).index(output_np[i])]
+ ]))
+ recog_content = list(recognition)
+ recog_content = [line + "\n" for line in recog_content]
+
+ filename = self.file_name_list.pop(0)
+
+            write_path = os.path.join(result_path, filename + ".txt")
+            with open(write_path, "w") as f:
+                f.writelines(recog_content)
+            print("result written to: " + write_path)
+
+
+@INFERENCE.register()
+class ASRF_Inference_helper(Base_Inference_helper):
+ def __init__(self,
+ num_channels,
+ actions_map_file_path,
+ postprocessing_method,
+ boundary_threshold,
+ feature_path=None):
+ self.num_channels = num_channels
+ file_ptr = open(actions_map_file_path, 'r')
+ actions = file_ptr.read().split('\n')[:-1]
+ file_ptr.close()
+ self.actions_dict = dict()
+ for a in actions:
+ self.actions_dict[a.split()[1]] = int(a.split()[0])
+
+ self.postprocessing_method = postprocessing_method
+ self.boundary_threshold = boundary_threshold
+ self.feature_path = feature_path
+ self.file_name_list = []
+
+ def get_process_file(self, input_file_txt):
+ with open(input_file_txt, 'r') as file_ptr:
+ info = file_ptr.read().split('\n')[:-1]
+
+ files = []
+ for video_name in info:
+ if self.feature_path is not None:
+ file_name = video_name.split('.')[0] + ".npy"
+ input_file = os.path.join(self.feature_path, file_name)
+ else:
+ input_file = video_name
+
+            assert os.path.isfile(
+                input_file), "{0} does not exist".format(input_file)
+ files.append(input_file)
+
+ self.file_name_list.append(input_file.split('/')[-1].split('.')[0])
+ return files
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, feature file list txt path
+ return: list
+ """
+
+ output_list = []
+
+ data = np.load(input_file)
+ results = {'video_feat': data, 'video_gt': None}
+ ops = []
+ for op in ops:
+ results = op(results)
+
+ res = np.expand_dims(results['video_feat'], axis=0).copy()
+ output_list.append(res)
+ return output_list
+
+ def postprocess(self, output, print_output=True):
+        result_path = os.path.join("./inference/infer_results/")
+        if not os.path.isdir(result_path):
+            os.makedirs(result_path)
+ output = [output]
+ for outputs in output:
+ outputs_cls_np = outputs[0]
+ outputs_boundary_np = outputs[1]
+
+ output_np = ASRFPostProcessing(
+ outputs_cls_np,
+ outputs_boundary_np,
+ self.postprocessing_method,
+ boundary_threshold=self.boundary_threshold).numpy()[0, :]
+
+ recognition = []
+ for i in range(output_np.shape[0]):
+ recognition = np.concatenate((recognition, [
+ list(self.actions_dict.keys())[list(
+ self.actions_dict.values()).index(output_np[i])]
+ ]))
+ recog_content = list(recognition)
+ recog_content = [line + "\n" for line in recog_content]
+
+ filename = self.file_name_list.pop(0)
+
+            write_path = os.path.join(result_path, filename + ".txt")
+            with open(write_path, "w") as f:
+                f.writelines(recog_content)
+            print("result written to: " + write_path)
+
+
+@INFERENCE.register()
+class AttentionLSTM_Inference_helper(Base_Inference_helper):
+ def __init__(
+ self,
+ num_classes, #Optional, the number of classes to be classified.
+ feature_num,
+ feature_dims,
+ embedding_size,
+ lstm_size,
+ top_k=1):
+ self.num_classes = num_classes
+ self.feature_num = feature_num
+ self.feature_dims = feature_dims
+ self.embedding_size = embedding_size
+ self.lstm_size = lstm_size
+ self.top_k = top_k
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, file path
+ return: list
+ """
+        assert os.path.isfile(input_file), "{0} does not exist".format(
+            input_file)
+ results = {'filename': input_file}
+ ops = [FeatureDecoder(num_classes=self.num_classes, has_label=False)]
+ for op in ops:
+ results = op(results)
+
+ res = []
+ for modality in ['rgb', 'audio']:
+ res.append(
+ np.expand_dims(results[f'{modality}_data'], axis=0).copy())
+ res.append(
+ np.expand_dims(results[f'{modality}_len'], axis=0).copy())
+ res.append(
+ np.expand_dims(results[f'{modality}_mask'], axis=0).copy())
+ return res
+
+
+@INFERENCE.register()
+class TransNetV2_Inference_helper():
+ def __init__(self,
+ num_frames,
+ height,
+ width,
+ num_channels,
+ threshold=0.5,
+ output_path=None,
+ visualize=True):
+ self._input_size = (height, width, num_channels)
+ self.output_path = output_path
+ self.len_frames = 0
+ self.threshold = threshold
+ self.visualize = visualize
+
+ def input_iterator(self, frames):
+ # return windows of size 100 where the first/last 25 frames are from the previous/next batch
+ # the first and last window must be padded by copies of the first and last frame of the video
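+        # e.g. a 130-frame video is padded to 25 + 130 + 45 = 200 frames and
+        # yields 3 overlapping 100-frame windows with a stride of 50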
+ no_padded_frames_start = 25
+ no_padded_frames_end = 25 + 50 - (
+ len(frames) % 50 if len(frames) % 50 != 0 else 50) # 25 - 74
+
+ start_frame = np.expand_dims(frames[0], 0)
+ end_frame = np.expand_dims(frames[-1], 0)
+ padded_inputs = np.concatenate([start_frame] * no_padded_frames_start +
+ [frames] +
+ [end_frame] * no_padded_frames_end, 0)
+
+ ptr = 0
+ while ptr + 100 <= len(padded_inputs):
+ out = padded_inputs[ptr:ptr + 100]
+ out = out.astype(np.float32)
+ ptr += 50
+ yield out[np.newaxis]
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, file path
+ return: iterator
+ """
+ try:
+ import ffmpeg
+ except ImportError as e:
+ print(
+ f"{e}, [ffmpeg-python] package and it's dependencies is required for TransNetV2."
+ )
+        assert os.path.isfile(input_file), "{0} does not exist".format(
+            input_file)
+ self.input_file = input_file
+ self.filename = os.path.splitext(os.path.split(self.input_file)[1])[0]
+ video_stream, err = ffmpeg.input(
+ self.input_file).output("pipe:",
+ format="rawvideo",
+ pix_fmt="rgb24",
+ s="48x27").run(capture_stdout=True,
+ capture_stderr=True)
+ self.frames = np.frombuffer(video_stream,
+ np.uint8).reshape([-1, 27, 48, 3])
+ self.len_frames = len(self.frames)
+
+ return self.input_iterator(self.frames)
+
+ def predictions_to_scenes(self, predictions):
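+        """Convert per-frame transition probabilities into [start, end] scene intervals.
+
+        A rough example with threshold 0.5: predictions [0.1, 0.2, 0.9, 0.1, 0.2]
+        are binarized to [0, 0, 1, 0, 0] and converted to scenes [[0, 2], [3, 4]].
+        """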
+ predictions = (predictions > self.threshold).astype(np.uint8)
+ scenes = []
+ t, t_prev, start = -1, 0, 0
+ for i, t in enumerate(predictions):
+ if t_prev == 1 and t == 0:
+ start = i
+ if t_prev == 0 and t == 1 and i != 0:
+ scenes.append([start, i])
+ t_prev = t
+ if t == 0:
+ scenes.append([start, i])
+
+ # just fix if all predictions are 1
+ if len(scenes) == 0:
+ return np.array([[0, len(predictions) - 1]], dtype=np.int32)
+
+ return np.array(scenes, dtype=np.int32)
+
+ def visualize_predictions(self, frames, predictions):
+ from PIL import Image, ImageDraw
+
+ if isinstance(predictions, np.ndarray):
+ predictions = [predictions]
+
+ ih, iw, ic = frames.shape[1:]
+ width = 25
+
+ # pad frames so that length of the video is divisible by width
+ # pad frames also by len(predictions) pixels in width in order to show predictions
+ pad_with = width - len(frames) % width if len(
+ frames) % width != 0 else 0
+ frames = np.pad(frames, [(0, pad_with), (0, 1), (0, len(predictions)),
+ (0, 0)])
+
+ predictions = [np.pad(x, (0, pad_with)) for x in predictions]
+ height = len(frames) // width
+
+ img = frames.reshape([height, width, ih + 1, iw + len(predictions), ic])
+ img = np.concatenate(np.split(
+ np.concatenate(np.split(img, height), axis=2)[0], width),
+ axis=2)[0, :-1]
+
+ img = Image.fromarray(img)
+ draw = ImageDraw.Draw(img)
+
+ # iterate over all frames
+ for i, pred in enumerate(zip(*predictions)):
+ x, y = i % width, i // width
+ x, y = x * (iw + len(predictions)) + iw, y * (ih + 1) + ih - 1
+
+ # we can visualize multiple predictions per single frame
+ for j, p in enumerate(pred):
+ color = [0, 0, 0]
+ color[(j + 1) % 3] = 255
+
+ value = round(p * (ih - 1))
+ if value != 0:
+ draw.line((x + j, y, x + j, y - value),
+ fill=tuple(color),
+ width=1)
+ return img
+
+ def postprocess(self, outputs, print_output=True):
+ """
+ output: list
+ """
+ predictions = []
+ for output in outputs:
+ single_frame_logits, all_frames_logits = output
+ single_frame_pred = F.sigmoid(paddle.to_tensor(single_frame_logits))
+ all_frames_pred = F.sigmoid(paddle.to_tensor(all_frames_logits))
+ predictions.append((single_frame_pred.numpy()[0, 25:75, 0],
+ all_frames_pred.numpy()[0, 25:75, 0]))
+ single_frame_pred = np.concatenate(
+ [single_ for single_, all_ in predictions])
+ all_frames_pred = np.concatenate(
+ [all_ for single_, all_ in predictions])
+        single_frame_predictions = single_frame_pred[:self.len_frames]
+        all_frame_predictions = all_frames_pred[:self.len_frames]
+
+ scenes = self.predictions_to_scenes(single_frame_predictions)
+
+ if print_output:
+ print("Current video file: {0}".format(self.input_file))
+ print("\tShot Boundarys: {0}".format(scenes))
+
+ if self.output_path:
+ if not os.path.exists(self.output_path):
+ os.makedirs(self.output_path)
+ predictions = np.stack(
+ [single_frame_predictions, all_frame_predictions], 1)
+ predictions_file = os.path.join(self.output_path,
+ self.filename + "_predictions.txt")
+ np.savetxt(predictions_file, predictions, fmt="%.6f")
+ scenes_file = os.path.join(self.output_path,
+ self.filename + "_scenes.txt")
+ np.savetxt(scenes_file, scenes, fmt="%d")
+
+ if self.visualize:
+ pil_image = self.visualize_predictions(
+ self.frames,
+ predictions=(single_frame_predictions,
+ all_frame_predictions))
+ image_file = os.path.join(self.output_path,
+ self.filename + "_vis.png")
+ pil_image.save(image_file)
+
+
+@INFERENCE.register()
+class ADDS_Inference_helper(Base_Inference_helper):
+ def __init__(self,
+ frame_idxs=[0],
+ num_scales=4,
+ side_map={
+ "2": 2,
+ "3": 3,
+ "l": 2,
+ "r": 3
+ },
+ height=256,
+ width=512,
+ full_res_shape=None,
+ num_channels=None,
+ img_ext=".png",
+ K=None):
+
+ self.frame_idxs = frame_idxs
+ self.num_scales = num_scales
+ self.side_map = side_map
+ self.full_res_shape = full_res_shape
+ self.img_ext = img_ext
+ self.height = height
+ self.width = width
+ self.K = K
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, file path
+ return: list
+ """
+        assert os.path.isfile(input_file), "{0} does not exist".format(
+            input_file)
+ results = {
+ 'filename': input_file,
+ 'mode': 'infer',
+ 'day_or_night': 'day',
+ }
+ ops = [
+ ImageDecoder(
+ backend='pil',
+ dataset='kitti',
+ frame_idxs=self.frame_idxs,
+ num_scales=self.num_scales,
+ side_map=self.side_map,
+ full_res_shape=self.full_res_shape,
+ img_ext=self.img_ext,
+ ),
+ GroupResize(
+ height=self.height,
+ width=self.width,
+ K=self.K,
+ scale=1,
+ mode='infer',
+ ),
+ ToArray(),
+ ]
+ for op in ops:
+ results = op(results)
+ res = results['imgs'][('color', 0, 0)]
+ res = np.expand_dims(res, axis=0).copy()
+ return [res]
+
+ def postprocess(self, output, print_output, save_dir='data/'):
+ """
+ output: list
+ """
+ if not isinstance(self.input_file, list):
+ self.input_file = [
+ self.input_file,
+ ]
+ print(len(output))
+ N = len(self.input_file)
+ for i in range(N):
+ pred_depth = output[i] # [H, W]
+ if print_output:
+ print("Current input image: {0}".format(self.input_file[i]))
+ file_name = os.path.basename(self.input_file[i]).split('.')[0]
+ save_path = os.path.join(save_dir,
+ file_name + "_depth" + ".png")
+ pred_depth_color = self._convertPNG(pred_depth)
+ pred_depth_color.save(save_path)
+ print(f"pred depth image saved to: {save_path}")
+
+ def _convertPNG(self, image_numpy):
+ disp_resized = cv2.resize(image_numpy, (1280, 640))
+ disp_resized_np = disp_resized
+ vmax = np.percentile(disp_resized_np, 95)
+ normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
+ mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
+ colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
+ 255).astype(np.uint8)
+ im = Image.fromarray(colormapped_im)
+ return im
+
+
+@INFERENCE.register()
+class AVA_SlowFast_FastRCNN_Inference_helper(Base_Inference_helper):
+ def __init__(self,
+ detection_model_name,
+ detection_model_weights,
+ config_file_path,
+ predict_stepsize=8,
+ output_stepsize=4,
+ output_fps=6,
+ out_filename='ava_det_demo.mp4',
+ num_frames=32,
+ alpha=4,
+ target_size=256):
+ self.detection_model_name = detection_model_name
+ self.detection_model_weights = detection_model_weights
+
+ self.config = get_config(config_file_path,
+ show=False) #parse config file
+ self.predict_stepsize = predict_stepsize
+ self.output_stepsize = output_stepsize
+ self.output_fps = output_fps
+ self.out_filename = out_filename
+ self.num_frames = num_frames
+ self.alpha = alpha
+ self.target_size = target_size
+
+ def preprocess(self, input_file):
+ """
+ input_file: str, file path
+ """
+
+ frame_dir = 'tmp_frames'
+ self.frame_paths, frames, FPS = frame_extraction(input_file, frame_dir)
+        num_frame = len(self.frame_paths)  # video duration (s) * FPS
+ assert num_frame != 0
+
+        # height and width of the frame images
+ h, w, _ = frames[0].shape
+
+ # Get clip_len, frame_interval and calculate center index of each clip
+ data_process_pipeline = build_pipeline(
+            self.config.PIPELINE.test)  # test-time data processing pipeline config
+
+        # one key frame is sampled per second here
+        clip_len = self.config.PIPELINE.test.sample['clip_len']
+        assert clip_len % 2 == 0, 'We would like to have an even clip_len'
+        frame_interval = self.config.PIPELINE.test.sample['frame_interval']
+ window_size = clip_len * frame_interval
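+ # key frames sit at the center of a clip_len * frame_interval window,
+ # spaced predict_stepsize frames apart, so that every window stays
+ # inside the video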
+ timestamps = np.arange(window_size // 2,
+ (num_frame + 1 - window_size // 2),
+ self.predict_stepsize)
+
+ selected_frame_list = []
+ for timestamp in timestamps:
+ selected_frame_list.append(self.frame_paths[timestamp - 1])
+
+ # Load label_map
+ label_map_path = self.config.DATASET.test['label_file']
+ with open(label_map_path) as f:
+ self.categories, self.class_whitelist = read_labelmap(f)
+ label_map = {}
+ for item in self.categories:
+ label_map[item['id']] = item['name']
+
+ self.label_map = label_map
+
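+ # run the human detector on the selected key frames; one detection
+ # result file is written per key frame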
+ detection_result_dir = 'tmp_detection'
+ detection_model_name = self.detection_model_name
+ detection_model_weights = self.detection_model_weights
+ detection_txt_list = detection_inference(selected_frame_list,
+ detection_result_dir,
+ detection_model_name,
+ detection_model_weights)
+ assert len(detection_txt_list) == len(timestamps)
+
+ human_detections = []
+ data_list = []
+ person_num_list = []
+
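+ # for every key frame with detected persons, build one model input:
+ # slow pathway clip, fast pathway clip, person proposals and image shape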
+ for timestamp, detection_txt_path in zip(timestamps,
+ detection_txt_list):
+ proposals, scores = get_detection_result(
+ detection_txt_path, h, w,
+ float(self.config.DATASET.test['person_det_score_thr']))
+
+ if proposals.shape[0] == 0:
+ human_detections.append(None)
+ continue
+
+ human_detections.append(proposals)
+
+ result = get_timestep_result(frame_dir,
+ timestamp,
+ clip_len,
+ frame_interval,
+ FPS=FPS)
+ result["proposals"] = proposals
+ result["scores"] = scores
+
+ new_result = data_process_pipeline(result)
+ proposals = new_result['proposals']
+
+ img_slow = new_result['imgs'][0]
+ img_slow = img_slow[np.newaxis, :]
+ img_fast = new_result['imgs'][1]
+ img_fast = img_fast[np.newaxis, :]
+
+ proposals = proposals[np.newaxis, :]
+
+ scores = scores[np.newaxis, :]
+
+ img_shape = np.asarray(new_result['img_shape'])
+ img_shape = img_shape[np.newaxis, :]
+
+ data = [
+ paddle.to_tensor(img_slow, dtype='float32'),
+ paddle.to_tensor(img_fast, dtype='float32'),
+ paddle.to_tensor(proposals, dtype='float32'),
+ paddle.to_tensor(img_shape, dtype='int32')
+ ]
+
+ person_num = proposals.shape[1]
+ person_num_list.append(person_num)
+
+ data_list.append(data)
+
+ self.human_detections = human_detections
+ self.person_num_list = person_num_list
+ self.timestamps = timestamps
+ self.frame_dir = frame_dir
+ self.detection_result_dir = detection_result_dir
+
+ return data_list
+
+ def postprocess(self, outputs, print_output=True):
+ """
+ output: list
+ """
+ predictions = []
+
+ assert len(self.person_num_list) == len(outputs)
+
+ #print("*** self.human_detections",len( self.human_detections))
+ #print("*** outputs",len( outputs))
+
+ index = 0
+ for t_index in range(len(self.timestamps)):
+ if self.human_detections[t_index] is None:
+ predictions.append(None)
+ continue
+
+ human_detection = self.human_detections[t_index]
+
+ output = outputs[index]
+ result = output  # one entry per action class, background excluded
+
+ person_num = self.person_num_list[index]
+
+ index = index + 1
+
+ prediction = []
+
+ # N proposals
+ for i in range(person_num):
+ prediction.append([])
+
+ # Keep only action classes whose score exceeds action_thr
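+ # result[i] holds the predictions for class id i + 1 for every detected
+ # person (column 4 is the score); label ids in label_map start from 1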
+ for i in range(len(result)): # for class
+ if i + 1 not in self.class_whitelist:
+ continue
+ for j in range(person_num):
+ if result[i][j, 4] > self.config.MODEL.head['action_thr']:
+ prediction[j].append(
+ (self.label_map[i + 1], result[i][j, 4]
+ )) # label_map is a dict, label index start from 1
+ predictions.append(prediction)
+
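+ # pair each person box with its predicted action labels for visualization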
+ results = []
+ for human_detection, prediction in zip(self.human_detections,
+ predictions):
+ results.append(pack_result(human_detection, prediction))
+
+ def dense_timestamps(timestamps, n):
+ """Make it nx frames."""
+ old_frame_interval = (timestamps[1] - timestamps[0])
+ start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
+ new_frame_inds = np.arange(
+ len(timestamps) * n) * old_frame_interval / n + start
+ return new_frame_inds.astype(np.int64)
+
+ dense_n = int(self.predict_stepsize / self.output_stepsize)  # output frames per key-frame interval
+ frames = [
+ cv2.imread(self.frame_paths[i - 1])
+ for i in dense_timestamps(self.timestamps, dense_n)
+ ]
+
+ vis_frames = visualize(frames, results)
+
+ try:
+ import moviepy.editor as mpy
+ except ImportError:
+ raise ImportError('Please install moviepy to write the output video file')
+
+ vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
+ fps=self.output_fps)
+ vid.write_videofile(self.out_filename)
+ print("finish write !")
+
+ # delete tmp files and dirs
+ shutil.rmtree(self.frame_dir)
+ shutil.rmtree(self.detection_result_dir)