From e96aef74957254043d9c984385d9bc3a8f881ecc Mon Sep 17 00:00:00 2001 From: lihang Date: Tue, 13 Sep 2022 11:03:23 +0800 Subject: [PATCH 01/10] =?UTF-8?q?[=E8=A5=BF=E5=AE=89=E4=BA=A4=E9=80=9A?= =?UTF-8?q?=E5=A4=A7=E5=AD=A6][=E9=AB=98=E6=A0=A1=E8=B4=A1=E7=8C=AE][Pytor?= =?UTF-8?q?ch][DilatedResidualNetworks]--=E5=88=9D=E6=AC=A1=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../DilatedResidualNetworks/LICENSE | 29 + .../DilatedResidualNetworks/README.md | 259 +++++ .../DilatedResidualNetworks/README_raw.md | 205 ++++ .../DilatedResidualNetworks/classify.py | 342 ++++++ .../data_transforms.py | 266 +++++ .../datasets/cityscapes/README.md | 14 + .../datasets/cityscapes/create_lists.sh | 9 + .../datasets/cityscapes/info.json | 1 + .../datasets/cityscapes/prepare_data.py | 136 +++ .../datasets/compute_mean_std.py | 44 + .../DilatedResidualNetworks/drn.py | 414 +++++++ .../DilatedResidualNetworks/lib/Makefile | 30 + .../DilatedResidualNetworks/lib/build.py | 34 + .../lib/dense/__init__.py | 0 .../lib/dense/batch_norm/__init__.py | 12 + .../lib/functions/__init__.py | 0 .../lib/functions/batchnormp.py | 178 +++ .../lib/modules/__init__.py | 0 .../lib/modules/batchnormsync.py | 64 + .../lib/src/batchnormp.c | 159 +++ .../lib/src/batchnormp.h | 16 + .../lib/src/batchnormp_cuda.c | 55 + .../lib/src/batchnormp_cuda.h | 30 + .../lib/src/batchnormp_cuda_kernel.cu | 363 ++++++ .../lib/src/batchnormp_cuda_kernel.h | 16 + .../lib/src/generic/batchnormp_cuda.cu | 185 +++ .../DilatedResidualNetworks/lib/test.py | 54 + .../DilatedResidualNetworks/requirements.txt | 6 + .../DilatedResidualNetworks/segment.py | 1036 +++++++++++++++++ .../DilatedResidualNetworks/test/env_npu.sh | 81 ++ .../test/train_eval_8p.sh | 122 ++ .../test/train_finetune_1p.sh | 160 +++ .../test/train_full_1p.sh | 133 +++ .../test/train_full_8p.sh | 145 +++ .../test/train_performance_1p.sh | 133 +++ .../test/train_performance_8p.sh | 145 +++ 36 files changed, 4876 insertions(+) create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/LICENSE create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README_raw.md create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/classify.py create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/data_transforms.py create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/README.md create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/create_lists.sh create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/info.json create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/prepare_data.py create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/compute_mean_std.py create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/drn.py create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/Makefile create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/build.py create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/dense/__init__.py create mode 100644 
PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/dense/batch_norm/__init__.py create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/functions/__init__.py create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/functions/batchnormp.py create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/modules/__init__.py create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/modules/batchnormsync.py create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp.c create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp.h create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda.c create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda.h create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda_kernel.cu create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda_kernel.h create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/generic/batchnormp_cuda.cu create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/test.py create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/requirements.txt create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/env_npu.sh create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_eval_8p.sh create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_finetune_1p.sh create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_1p.sh create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_1p.sh create mode 100644 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/LICENSE b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/LICENSE new file mode 100644 index 0000000000..fa92d33e58 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2017, Fisher Yu +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md
new file mode 100644
index 0000000000..4a393562f4
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md
@@ -0,0 +1,259 @@
+# DilatedResidualNetworks for PyTorch
+
+- [Overview](#overview)
+- [Preparing the Training Environment](#preparing-the-training-environment)
+- [Starting Training](#starting-training)
+- [Training Results](#training-results)
+- [Release Notes](#release-notes)
+
+
+
+# Overview
+
+## Summary
+
+DRN (Dilated Residual Networks) preserves spatial resolution in convolutional networks for image classification and segmentation by adding dilated convolutions to a residual network. Building on ResNet, DRN removes the downsampling layers from the top layers of the network, which keeps the spatial resolution of the feature maps, and it also proposes a fix for the "gridding" artifacts introduced by dilated convolutions. As a result, DRN outperforms other segmentation models on Cityscapes.
+
+- Reference implementation:
+
+  ```
+  url=https://github.com/fyu/drn
+  ```
+
+- Implementation adapted for Ascend AI processors:
+
+  ```
+  url=https://gitee.com/ascend/ModelZoo-PyTorch.git
+  code_path=PyTorch/contrib/cv/semantic_segmentation
+  ```
+
+- To get the code via Git:
+
+  ```
+  git clone https://gitee.com/zhang_liangliang0727/ModelZoo-PyTorch.git    # clone the repository
+  cd ./ModelZoo-PyTorch/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks    # switch to the model directory; not needed if the repository contains only this model
+  ```
+
+- Alternatively, click "Download Now" to download the source package.
+
+# Preparing the Training Environment
+
+## Setting Up the Environment
+
+- The firmware/driver, CANN, and PyTorch versions supported by this model are listed in the table below.
+
+  **Table 1** Version compatibility
+
+  | Component | Version |
+  | ---------- | ------------------------------------------------------------ |
+  | Firmware & driver | [1.0.15](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) |
+  | CANN | [5.1.RC1](https://www.hiascend.com/software/cann/commercial?version=5.1.RC1) |
+  | PyTorch | [1.8.1](https://gitee.com/ascend/pytorch/tree/master/) or [1.5.0](https://gitee.com/ascend/pytorch/tree/v1.5.0/) |
+
+- Environment setup guide:
+
+  See [Preparing a PyTorch Training Environment](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes).
+
+- Install the dependencies (add further packages as the model requires):
+
+  ```
+  pip install -r requirements.txt
+  ```
+
+## Preparing the Dataset
+
+1. Get the dataset.
+
+   Obtain the original Cityscapes dataset yourself, upload it to the server, and unpack it.
+
+   Note: for convenience, the dataset should be placed at the following path:
+   ```
+   ./DilatedResidualNetworks/datasets/cityscapes
+   # or link the dataset in with symlinks:
+   ln -s /cityscapes/* /DilatedResidualNetworks/datasets/cityscapes/
+   ```
+   The dataset directory structure looks like this:
+   ```
+   ├── cityscapes
+      ├── leftImg8bit
+      │   ├── train
+      │   │   ├── city1
+      │   │   │   ├── image1
+      │   │   │   ├── image2
+      │   │   │   └── ...
+      │   │   ├── city2
+      │   │   │   ├── image1
+      │   │   │   ├── image2
+      │   │   │   └── ...
+      │   │   └── ...
+      │   ├── val
+      │   │   ├── city1
+      │   │   │   ├── image1
+      │   │   │   ├── image2
+      │   │   │   └── ...
+      │   │   ├── city2
+      │   │   │   ├── image1
+      │   │   │   ├── image2
+      │   │   │   └── ...
+      │   │   └── ...
+      │   └── test
+      │       ├── city1
+      │       │   ├── image1
+      │       │   ├── image2
+      │       │   └── ...
+      │       ├── city2
+      │       │   ├── image1
+      │       │   ├── image2
+      │       │   └── ...
+      │       └── ...
+      └── gtFine
+          ├── train
+          │   ├── city1
+          │   │   ├── image1
+          │   │   ├── image2
+          │   │   └── ...
+          │   ├── city2
+          │   │   ├── image1
+          │   │   ├── image2
+          │   │   └── ...
+          │   └── ...
+          ├── val
+          │   ├── city1
+          │   │   ├── image1
+          │   │   ├── image2
+          │   │   └── ...
+          │   ├── city2
+          │   │   ├── image1
+          │   │   ├── image2
+          │   │   └── ...
+          │   └── ...
+          └── test
+              ├── city1
+              │   ├── image1
+              │   ├── image2
+              │   └── ...
+              ├── city2
+              │   ├── image1
+              │   ├── image2
+              │   └── ...
+              └── ...
+   ```
+
+   > **Note:**
+   > The training scripts for this dataset are provided only as a reference example.
+
+2. Data preprocessing.
+- Enter the root directory of the unpacked source package.
+  ```
+  cd ./DilatedResidualNetworks
+  ```
+- After you get the vanilla Cityscapes label maps, first convert the original segmentation label IDs to one of the 19 training IDs:
+  ```
+  python3 datasets/cityscapes/prepare_data.py datasets/cityscapes/gtFine/
+  ```
+- Run create_lists.sh from the datasets directory to create the image and label lists for gtFine and leftImg8bit:
+  ```
+  sh create_lists.sh
+  ```
+  Running create_lists.sh generates the following 5 txt files:
+- train_images.txt
+- train_labels.txt
+- val_images.txt
+- val_labels.txt
+- test_images.txt
+
+## Getting the Pretrained Model
+
+Download the pretrained DRN model from http://dl.yf.io/drn/drn_c_26-ddedf421.pth and place it in /root/.cache/torch/hub/checkpoints/.
+
+# Starting Training
+
+## Training the Model
+
+1. Enter the root directory of the unpacked source package.
+
+   ```
+   cd ./DilatedResidualNetworks
+   ```
+
+2. Run the training scripts.
+   Note: create a folder named checkpoints under the code root directory to store the pth model files.
+   The model supports single-machine single-device (1p) training and single-machine 8-device (8p) training.
+
+   - Single-machine, single-device training
+
+     Start 1p training.
+
+     ```
+     bash ./test/train_performance_1p.sh --data_path='./datasets/cityscapes'  # 1p performance
+     bash ./test/train_full_1p.sh --data_path='./datasets/cityscapes'  # 1p accuracy
+     bash ./test/train_finetune_1p.sh --data_path='./datasets/cityscapes' --pth_path='./checkpoints/xxx.pth.tar'  # 1p fine-tuning
+     ```
+
+   - Single-machine, 8-device training
+
+     Start 8p training.
+     Note: the accuracy is obtained by first running the 8p accuracy script and then running the 8p evaluation script.
+
+     ```
+     bash ./test/train_performance_8p.sh --data_path='./datasets/cityscapes'  # 8p performance
+     bash ./test/train_full_8p.sh --data_path='./datasets/cityscapes'  # 8p accuracy; 250 epochs take about 5 h
+     bash ./test/train_eval_8p.sh --data_path='./datasets/cityscapes' --pth_path='./checkpoints/checkpoint_latest.pth.tar'  # 8p evaluation
+     ```
+
+   The --data_path argument is the dataset path.
+
+   The training script parameters are described below.
+
+   ```
+   Common parameters:
+   --data-dir                          // dataset path
+   --classes                           // number of segmentation classes; 19 for Cityscapes
+   --workers                           // number of data-loading workers
+   --crop-size                         // crop size
+   --arch                              // model architecture to use
+   --batch-size                        // training batch size
+   --epochs                            // number of training epochs
+   --lr                                // initial learning rate, default: 0.01
+   --momentum                          // momentum, default: 0.9
+   --save_iter                         // interval between saved pth checkpoints
+   --save_path                         // directory for saving pth files
+   --device                            // device to use, gpu or npu
+   --dist-backend                      // distributed communication backend
+   --amp                               // enable mixed precision
+   --loss-scale                        // mixed-precision loss scale
+   --opt-level                         // mixed-precision optimization level
+   --gpu                               // gpu index to use
+   Multi-device training parameters:
+   --multiprocessing-distributed       // enable multi-device training
+   --device_list '0,1,2,3,4,5,6,7'     // devices to use for multi-device training
+   ```
+
+   After training completes, the weight files are saved under the current path, and the training accuracy and performance are printed.
+
+# Training Results
+
+**Table 2** Training results
+
+| NAME | mIoU | FPS | Epochs | AMP_Type |
+| :-----------: | :---: | :-----: | :----: | :------: |
+| 1p-competitor | - | 12.536 | 1 | - |
+| 1p-NPU | - | 19.868 | 1 | O2 |
+| 8p-competitor | 68.67 | 100.221 | 250 | - |
+| 8p-NPU | 68.64 | 125.545 | 250 | O2 |
+
+
+# Release Notes
+2022.09.12: Initial release.
+
+## Known Issues
+None.
+
+
+
+
+
+
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README_raw.md b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README_raw.md
new file mode 100644
index 0000000000..1b4fa56270
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README_raw.md
@@ -0,0 +1,205 @@
+## Overview
+
+This code provides various models combining dilated convolutions with residual networks. Our models can achieve better performance with fewer parameters than ResNet on [image classification](#image-classification) and [semantic segmentation](#semantic-image-segmentation).
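+
+As a minimal standalone sketch of the core idea (not part of the original repo; it assumes only that torch is installed): with `padding == dilation`, a 3x3 convolution enlarges its receptive field while keeping the feature map size unchanged, which is how `conv3x3` in `drn.py` parameterizes its layers.
+
+```
+import torch
+import torch.nn as nn
+
+x = torch.randn(1, 64, 56, 56)   # N x C x H x W feature map
+# dilation=2 samples the 3x3 neighborhood with gaps, doubling the receptive
+# field; padding=2 keeps the spatial size unchanged (56x56 in, 56x56 out).
+conv = nn.Conv2d(64, 64, kernel_size=3, padding=2, dilation=2, bias=False)
+print(conv(x).shape)             # torch.Size([1, 64, 56, 56])
+```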
+
+If you find this code useful for your publications, please consider citing
+
+```
+@inproceedings{Yu2017,
+    title     = {Dilated Residual Networks},
+    author    = {Fisher Yu and Vladlen Koltun and Thomas Funkhouser},
+    booktitle = {Computer Vision and Pattern Recognition (CVPR)},
+    year      = {2017},
+}
+
+@inproceedings{Yu2016,
+    title     = {Multi-scale context aggregation by dilated convolutions},
+    author    = {Yu, Fisher and Koltun, Vladlen},
+    booktitle = {International Conference on Learning Representations (ICLR)},
+    year      = {2016}
+}
+```
+
+## Code Highlights
+
+- The pretrained models can be loaded using the PyTorch model zoo API. [Example here](https://github.com/fyu/drn/blob/master/drn.py#L264).
+- PyTorch-based image classification and semantic image segmentation.
+- BatchNorm synchronization across multiple GPUs.
+- High-resolution class activation maps for state-of-the-art weakly supervised object localization.
+- [DRN-D-105](#semantic-image-segmentation) gets 76.3% mIoU on Cityscapes with only fine training annotation and no context module.
+
+## Image Classification
+
+Image classification is meant to be a controlled study to understand the role of high-resolution feature maps in image classification and the class activations arising from them. Based on this investigation, we are able to design more efficient networks for learning high-resolution image representations. They have practical use in semantic image segmentation, as detailed in the [image segmentation section](#semantic-image-segmentation).
+
+### Models
+
+Comparison of classification error rates on the ImageNet validation set and numbers of parameters. Error rates are evaluated on a single center 224x224 crop from resized images whose shorter side is 256 pixels long.
+
+| Name | Top-1 | Top-5 | Params |
+| --- | :---: | :---: | :---: |
+| ResNet-18 | 30.4% | 10.8% | 11.7M |
+| DRN-A-18 | 28.0% | 9.5% | 11.7M |
+| DRN-D-22 | 25.8% | 8.2% | 16.4M |
+| DRN-C-26 | 24.9% | 7.6% | 21.1M |
+| ResNet-34 | 27.7% | 8.7% | 21.8M |
+| DRN-A-34 | 24.8% | 7.5% | 21.8M |
+| DRN-D-38 | 23.8% | 6.9% | 26.5M |
+| DRN-C-42 | 22.9% | 6.6% | 31.2M |
+| ResNet-50 | 24.0% | 7.0% | 25.6M |
+| DRN-A-50 | 22.9% | 6.6% | 25.6M |
+| DRN-D-54 | 21.2% | 5.9% | 35.8M |
+| DRN-C-58 | 21.7% | 6.0% | 41.6M |
+| ResNet-101 | 22.4% | 6.2% | 44.5M |
+| DRN-D-105 | 20.6% | 5.5% | 54.8M |
+| ResNet-152 | 22.2% | 6.2% | 60.2M |
+
+The figure below groups the parameter and error rate comparison based on network structures.
+
+![comparison](doc/drn_comp.png)
+
+
+### Training and Testing
+
+The code is written in Python using [PyTorch](https://github.com/pytorch/pytorch). I started with code in [torchvision](https://github.com/pytorch/vision). Please check their license as well if copyright is your concern. Software dependencies:
+
+* Python 3
+* Pillow
+* pytorch
+* torchvision
+
+**Note** If you want to train your own semantic segmentation model, make sure your PyTorch version is greater than [0.2.0](https://github.com/pytorch/pytorch/releases) or includes commit [78020a](https://github.com/pytorch/pytorch/pull/2077/commits/78020a52abb76fcb1c344b3c42fbe8610cc387e4).
+
+Go to [this page](https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md#download-the-imagenet-dataset) to prepare the ImageNet 1K data.
+
+To test a model on the ImageNet validation set:
+```
+python3 classify.py test --arch drn_c_26 -j 4 <imagenet dir> --pretrained
+```
+
+To train a new model:
+```
+python3 classify.py train --arch drn_c_26 -j 8 <imagenet dir> --epochs 120
+```
+
+Besides `drn_c_26`, we also provide `drn_c_42` and `drn_c_58`.
They are in the DRN-C family, as described in [Dilated Residual Networks](https://umich.app.box.com/v/drn). DRN-D models are simplified versions of DRN-C. Their code names are `drn_d_22`, `drn_d_38`, `drn_d_54`, and `drn_d_105`.
+
+## Semantic Image Segmentation
+
+### Models
+
+Comparison of mIoU on Cityscapes and numbers of parameters.
+
+| Name | mIoU | Params |
+| --- | :---: | :---: |
+| DRN-A-50 | 67.3% | 25.6M |
+| DRN-C-26 | 68.0% | 21.1M |
+| DRN-C-42 | 70.9% | 31.2M |
+| DRN-D-22 | 68.0% | 16.4M |
+| DRN-D-38 | 71.4% | 26.5M |
+| DRN-D-105* | 75.6% | 54.8M |
+
+*trained with poly learning rate, random scaling and rotations.
+
+DRN-D-105 gets 76.3% mIoU on the Cityscapes testing set with multi-scale testing, poly learning rate and data augmentation with random rotation and scaling in training. Full results are [here](datasets/cityscapes/drn-d-105.csv).
+
+### Prepare Data
+
+The segmentation image data folder is supposed to contain the following image lists with the names below:
+
+* train_images.txt
+* train_labels.txt
+* val_images.txt
+* val_labels.txt
+* test_images.txt
+
+The code will also look for `info.json` in the folder. It contains the mean and std of the training images. For example, below is the `info.json` used for training on Cityscapes.
+
+```
+{
+    "mean": [
+        0.290101,
+        0.328081,
+        0.286964
+    ],
+    "std": [
+        0.182954,
+        0.186566,
+        0.184475
+    ]
+}
+```
+
+Each line in a list is a path to an input image or its label map relative to the segmentation folder.
+
+For example, if the data folder is "/foo/bar" and train_images.txt in it contains
+```
+leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png
+leftImg8bit/train/aachen/aachen_000001_000019_leftImg8bit.png
+```
+and train_labels.txt contains
+```
+gtFine/train/aachen/aachen_000000_000019_gtFine_trainIds.png
+gtFine/train/aachen/aachen_000001_000019_gtFine_trainIds.png
+```
+then the first image path is expected at
+```
+/foo/bar/leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png
+```
+and its label map is at
+```
+/foo/bar/gtFine/train/aachen/aachen_000000_000019_gtFine_trainIds.png
+```
+
+In the training phase, both train_\* and val_\* lists are assumed to be in the data folder. In the validation phase, only val_images.txt and val_labels.txt are needed. In the testing phase, when there are no available labels, only test_images.txt is needed. `segment.py` has a command line option `--phase` and the corresponding acceptable arguments are `train`, `val`, and `test`.
+
+To set up the Cityscapes data, please check this [document](datasets/cityscapes).
+
+### Optimization Setup
+
+The current segmentation models are trained with basic data augmentation (random crops + flips). The learning rate is changed in steps, where it is decreased by a factor of 10 at each step.
+
+### Training
+
+To train a new model, use
+```
+python3 segment.py train -d <data_folder> -c <category_number> -s 896 \
+    --arch drn_d_22 --batch-size 32 --epochs 250 --lr 0.01 --momentum 0.9 \
+    --step 100
+```
+
+`category_number` is the number of categories in segmentation. It is 19 for Cityscapes and 11 for Camvid. The actual label maps should contain values in the range of `[0, category_number)`. Invalid pixels can be labeled as 255 and they will be ignored in training and evaluation. Depending on the batch size, lr and momentum can be 0.01/0.9 or 0.001/0.99.
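+
+As a concrete illustration of the 255 ignore label, here is a small standalone sketch (not taken from `segment.py`, which wires its loss up internally; it assumes only that torch is installed). Pixels labeled 255 are excluded from the loss via `ignore_index`:
+
+```
+import torch
+import torch.nn as nn
+
+criterion = nn.CrossEntropyLoss(ignore_index=255)  # invalid pixels contribute no loss
+logits = torch.randn(2, 19, 8, 8)                  # N x C x H x W, C = 19 for Cityscapes
+labels = torch.randint(0, 19, (2, 8, 8))           # valid labels lie in [0, category_number)
+labels[:, 0, :] = 255                              # e.g. void pixels marked invalid
+loss = criterion(logits, labels)                   # gradients ignore the 255 pixels
+```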
+
+If you want to train drn_d_105 to achieve the best results on the Cityscapes dataset, you need to turn on data augmentation and use the poly learning rate:
+
+```
+python3 segment.py train -d <data_folder> -c 19 -s 840 --arch drn_d_105 --random-scale 2 --random-rotate 10 --batch-size 16 --epochs 500 --lr 0.01 --momentum 0.9 -j 16 --lr-mode poly --bn-sync
+```
+
+Note:
+
+  - If you use 8 GPUs for 16 crops per batch, the memory for each GPU is more than 12GB. If you don't have enough GPU memory, you can try a smaller batch size or crop size. A smaller crop size usually hurts the performance more.
+  - Batch normalization synchronization across multiple GPUs is necessary to train very deep convolutional networks for semantic segmentation. We provide an implementation as a PyTorch extension in `lib/`. However, it is not for the faint-hearted to build from scratch, although a Makefile is provided. So a built binary library for 64-bit Ubuntu is provided. It is tested on Ubuntu 16.04. Also remember to add `lib/` to your `PYTHONPATH`.
+
+### Testing
+
+Evaluate models on the testing set, or on any images without ground truth labels, using the related pretrained model:
+```
+python3 segment.py test -d <data_folder> -c <category_number> --arch drn_d_22 \
+    --pretrained <pretrained_model> --phase test --batch-size 1
+```
+
+You can download the pretrained DRN models on Cityscapes here: http://go.yf.io/drn-cityscapes-models.
+
+If you want to evaluate a checkpoint from your own training, use `--resume` instead of `--pretrained`:
+```
+python3 segment.py test -d <data_folder> -c <category_number> --arch drn_d_22 \
+    --resume <checkpoint> --phase test --batch-size 1
+```
+
+You can also turn on multi-scale testing for better results by adding `--ms`:
+
+```
+python3 segment.py test -d <data_folder> -c <category_number> --arch drn_d_105 \
+    --resume <checkpoint> --phase val --batch-size 1 --ms
+```
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/classify.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/classify.py
new file mode 100644
index 0000000000..dd123d0052
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/classify.py
@@ -0,0 +1,342 @@
+import argparse
+import shutil
+import time
+
+import numpy as np
+import os
+from os.path import exists, split, join, splitext
+
+import sys
+import torch
+import torch.nn as nn
+import torch.nn.parallel
+import torch.backends.cudnn as cudnn
+import torch.optim
+import torch.utils.data
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+
+import drn as models
+
+model_names = sorted(name for name in models.__dict__
+                     if name.islower() and not name.startswith("__")
+                     and callable(models.__dict__[name]))
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='')
+    parser.add_argument('cmd', choices=['train', 'test', 'map', 'locate'])
+    parser.add_argument('data', metavar='DIR',
+                        help='path to dataset')
+    parser.add_argument('--arch', '-a', metavar='ARCH', default='drn18',
+                        choices=model_names,
+                        help='model architecture: ' +
+                             ' | '.join(model_names) +
+                             ' (default: drn18)')
+    parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
+                        help='number of data loading workers (default: 4)')
+    parser.add_argument('--epochs', default=90, type=int, metavar='N',
+                        help='number of total epochs to run')
+    parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
+                        help='manual epoch number (useful on restarts)')
+    parser.add_argument('-b', '--batch-size', default=256, type=int,
+                        metavar='N', help='mini-batch size (default: 256)')
+    parser.add_argument('--lr', '--learning-rate',
default=0.1, type=float, + metavar='LR', help='initial learning rate') + parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') + parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)') + parser.add_argument('--print-freq', '-p', default=10, type=int, + metavar='N', help='print frequency (default: 10)') + parser.add_argument('--check-freq', default=10, type=int, + metavar='N', help='checkpoint frequency (default: 10)') + parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') + parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', + help='evaluate model on validation set') + parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') + parser.add_argument('--lr-adjust', dest='lr_adjust', + choices=['linear', 'step'], default='step') + parser.add_argument('--crop-size', dest='crop_size', type=int, default=224) + parser.add_argument('--scale-size', dest='scale_size', type=int, default=256) + parser.add_argument('--step-ratio', dest='step_ratio', type=float, default=0.1) + args = parser.parse_args() + return args + + +def main(): + print(' '.join(sys.argv)) + args = parse_args() + print(args) + if args.cmd == 'train': + run_training(args) + elif args.cmd == 'test': + test_model(args) + + +def run_training(args): + # create model + model = models.__dict__[args.arch](args.pretrained) + + model = torch.nn.DataParallel(model).cuda() + + best_prec1 = 0 + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume) + args.start_epoch = checkpoint['epoch'] + best_prec1 = checkpoint['best_prec1'] + model.load_state_dict(checkpoint['state_dict']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + # Data loading code + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(traindir, transforms.Compose([ + transforms.RandomSizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])), + batch_size=args.batch_size, shuffle=True, + num_workers=args.workers, pin_memory=True) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(valdir, transforms.Compose([ + transforms.Scale(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])), + batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + + # define loss function (criterion) and pptimizer + criterion = nn.CrossEntropyLoss().cuda() + + optimizer = torch.optim.SGD(model.parameters(), args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + + for epoch in range(args.start_epoch, args.epochs): + adjust_learning_rate(args, optimizer, epoch) + + # train for one epoch + train(args, train_loader, model, criterion, optimizer, epoch) + + # evaluate on validation set + prec1 = validate(args, val_loader, model, criterion) + + # remember best prec@1 and save checkpoint + is_best = prec1 > best_prec1 + best_prec1 = max(prec1, best_prec1) + + checkpoint_path = 
'checkpoint_latest.pth.tar' + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_prec1': best_prec1, + }, is_best, filename=checkpoint_path) + if (epoch + 1) % args.check_freq == 0: + history_path = 'checkpoint_{:03d}.pth.tar'.format(epoch + 1) + shutil.copyfile(checkpoint_path, history_path) + + +def test_model(args): + # create model + model = models.__dict__[args.arch](args.pretrained) + + model = torch.nn.DataParallel(model).cuda() + + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume) + args.start_epoch = checkpoint['epoch'] + best_prec1 = checkpoint['best_prec1'] + model.load_state_dict(checkpoint['state_dict']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + # Data loading code + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + t = transforms.Compose([ + transforms.Scale(args.scale_size), + transforms.CenterCrop(args.crop_size), + transforms.ToTensor(), + normalize]) + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(valdir, t), + batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + + criterion = nn.CrossEntropyLoss().cuda() + + validate(args, val_loader, model, criterion) + + +def train(args, train_loader, model, criterion, optimizer, epoch): + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + top5 = AverageMeter() + + # switch to train mode + model.train() + + end = time.time() + for i, (input, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + target = target.cuda(async=True) + input_var = torch.autograd.Variable(input) + target_var = torch.autograd.Variable(target) + + # compute output + output = model(input_var) + loss = criterion(output, target_var) + + # measure accuracy and record loss + prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) + losses.update(loss.data[0], input.size(0)) + top1.update(prec1[0], input.size(0)) + top5.update(prec5[0], input.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print('Epoch: [{0}][{1}/{2}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' + 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( + epoch, i, len(train_loader), batch_time=batch_time, + data_time=data_time, loss=losses, top1=top1, top5=top5)) + + +def validate(args, val_loader, model, criterion): + batch_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + top5 = AverageMeter() + + # switch to evaluate mode + model.eval() + + end = time.time() + for i, (input, target) in enumerate(val_loader): + target = target.cuda(async=True) + input_var = torch.autograd.Variable(input, volatile=True) + target_var = torch.autograd.Variable(target, volatile=True) + + # compute output + output = model(input_var) + loss = criterion(output, target_var) + + # measure accuracy and record loss + prec1, prec5 = 
accuracy(output.data, target, topk=(1, 5)) + losses.update(loss.data[0], input.size(0)) + top1.update(prec1[0], input.size(0)) + top5.update(prec5[0], input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print('Test: [{0}/{1}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' + 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( + i, len(val_loader), batch_time=batch_time, loss=losses, + top1=top1, top5=top5)) + + print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}' + .format(top1=top1, top5=top5)) + + return top1.avg + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def adjust_learning_rate(args, optimizer, epoch): + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + lr = args.lr * (args.step_ratio ** (epoch // 30)) + print('Epoch [{}] Learning rate: {}'.format(epoch, lr)) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == '__main__': + main() diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/data_transforms.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/data_transforms.py new file mode 100644 index 0000000000..616881c594 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/data_transforms.py @@ -0,0 +1,266 @@ +import numbers +import random + +import numpy as np +from PIL import Image, ImageOps +import torch + + +class RandomCrop(object): + def __init__(self, size): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + + def __call__(self, image, label, *args): + assert label is None or image.size == label.size, \ + "image and label doesn't have the same size {} / {}".format( + image.size, label.size) + + w, h = image.size + tw, th = self.size + top = bottom = left = right = 0 + if w < tw: + left = (tw - w) // 2 + right = tw - w - left + if h < th: + top = (th - h) // 2 + bottom = th - h - top + if left > 0 or right > 0 or top > 0 or bottom > 0: + label = pad_image( + 'constant', label, top, bottom, left, right, value=255) + image = pad_image( + 'reflection', image, top, bottom, left, right) + w, h = image.size + if w == tw and h == th: + return (image, label, *args) + + x1 = random.randint(0, w - tw) + y1 = random.randint(0, h - th) + results = [image.crop((x1, y1, x1 + tw, y1 + th))] + if label is not None: + results.append(label.crop((x1, y1, x1 + tw, y1 + th))) + results.extend(args) + return results + + +class RandomScale(object): + def __init__(self, 
scale): + if isinstance(scale, numbers.Number): + scale = [1 / scale, scale] + self.scale = scale + + def __call__(self, image, label): + ratio = random.uniform(self.scale[0], self.scale[1]) + w, h = image.size + tw = int(ratio * w) + th = int(ratio * h) + if ratio == 1: + return image, label + elif ratio < 1: + interpolation = Image.ANTIALIAS + else: + interpolation = Image.CUBIC + return image.resize((tw, th), interpolation), \ + label.resize((tw, th), Image.NEAREST) + + +class RandomRotate(object): + """Crops the given PIL.Image at a random location to have a region of + the given size. size can be a tuple (target_height, target_width) + or an integer, in which case the target will be of a square shape (size, size) + """ + + def __init__(self, angle): + self.angle = angle + + def __call__(self, image, label=None, *args): + assert label is None or image.size == label.size + + w, h = image.size + p = max((h, w)) + angle = random.randint(0, self.angle * 2) - self.angle + + if label is not None: + label = pad_image('constant', label, h, h, w, w, value=255) + label = label.rotate(angle, resample=Image.NEAREST) + label = label.crop((w, h, w + w, h + h)) + + image = pad_image('reflection', image, h, h, w, w) + image = image.rotate(angle, resample=Image.BILINEAR) + image = image.crop((w, h, w + w, h + h)) + return image, label + + +class RandomHorizontalFlip(object): + """Randomly horizontally flips the given PIL.Image with a probability of 0.5 + """ + + def __call__(self, image, label): + if random.random() < 0.5: + results = [image.transpose(Image.FLIP_LEFT_RIGHT), + label.transpose(Image.FLIP_LEFT_RIGHT)] + else: + results = [image, label] + return results + + +class Normalize(object): + """Given mean: (R, G, B) and std: (R, G, B), + will normalize each channel of the torch.*Tensor, i.e. 
+ channel = (channel - mean) / std + """ + + def __init__(self, mean, std): + self.mean = torch.FloatTensor(mean) + self.std = torch.FloatTensor(std) + + def __call__(self, image, label=None): + for t, m, s in zip(image, self.mean, self.std): + t.sub_(m).div_(s) + if label is None: + return image, + else: + return image, label + + +def pad_reflection(image, top, bottom, left, right): + if top == 0 and bottom == 0 and left == 0 and right == 0: + return image + h, w = image.shape[:2] + next_top = next_bottom = next_left = next_right = 0 + if top > h - 1: + next_top = top - h + 1 + top = h - 1 + if bottom > h - 1: + next_bottom = bottom - h + 1 + bottom = h - 1 + if left > w - 1: + next_left = left - w + 1 + left = w - 1 + if right > w - 1: + next_right = right - w + 1 + right = w - 1 + new_shape = list(image.shape) + new_shape[0] += top + bottom + new_shape[1] += left + right + new_image = np.empty(new_shape, dtype=image.dtype) + new_image[top:top+h, left:left+w] = image + new_image[:top, left:left+w] = image[top:0:-1, :] + new_image[top+h:, left:left+w] = image[-1:-bottom-1:-1, :] + new_image[:, :left] = new_image[:, left*2:left:-1] + new_image[:, left+w:] = new_image[:, -right-1:-right*2-1:-1] + return pad_reflection(new_image, next_top, next_bottom, + next_left, next_right) + + +def pad_constant(image, top, bottom, left, right, value): + if top == 0 and bottom == 0 and left == 0 and right == 0: + return image + h, w = image.shape[:2] + new_shape = list(image.shape) + new_shape[0] += top + bottom + new_shape[1] += left + right + new_image = np.empty(new_shape, dtype=image.dtype) + new_image.fill(value) + new_image[top:top+h, left:left+w] = image + return new_image + + +def pad_image(mode, image, top, bottom, left, right, value=0): + if mode == 'reflection': + return Image.fromarray( + pad_reflection(np.asarray(image), top, bottom, left, right)) + elif mode == 'constant': + return Image.fromarray( + pad_constant(np.asarray(image), top, bottom, left, right, value)) + else: + raise ValueError('Unknown mode {}'.format(mode)) + + +class Pad(object): + """Pads the given PIL.Image on all sides with the given "pad" value""" + + def __init__(self, padding, fill=0): + assert isinstance(padding, numbers.Number) + assert isinstance(fill, numbers.Number) or isinstance(fill, str) or \ + isinstance(fill, tuple) + self.padding = padding + self.fill = fill + + def __call__(self, image, label=None, *args): + if label is not None: + label = pad_image( + 'constant', label, + self.padding, self.padding, self.padding, self.padding, + value=255) + if self.fill == -1: + image = pad_image( + 'reflection', image, + self.padding, self.padding, self.padding, self.padding) + else: + image = pad_image( + 'constant', image, + self.padding, self.padding, self.padding, self.padding, + value=self.fill) + return (image, label, *args) + + +class PadImage(object): + def __init__(self, padding, fill=0): + assert isinstance(padding, numbers.Number) + assert isinstance(fill, numbers.Number) or isinstance(fill, str) or \ + isinstance(fill, tuple) + self.padding = padding + self.fill = fill + + def __call__(self, image, label=None, *args): + if self.fill == -1: + image = pad_image( + 'reflection', image, + self.padding, self.padding, self.padding, self.padding) + else: + image = ImageOps.expand(image, border=self.padding, fill=self.fill) + return (image, label, *args) + + +class ToTensor(object): + """Converts a PIL.Image or numpy.ndarray (H x W x C) in the range + [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range 
[0.0, 1.0].
+    """
+
+    def __call__(self, pic, label=None):
+        if isinstance(pic, np.ndarray):
+            # handle numpy array
+            img = torch.from_numpy(pic)
+        else:
+            # handle PIL Image
+            img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
+            # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
+            if pic.mode == 'YCbCr':
+                nchannel = 3
+            else:
+                nchannel = len(pic.mode)
+            img = img.view(pic.size[1], pic.size[0], nchannel)
+            # put it from HWC to CHW format
+            # yikes, this transpose takes 80% of the loading time/CPU
+            img = img.transpose(0, 1).transpose(0, 2).contiguous()
+        img = img.float().div(255)
+        if label is None:
+            return img,
+        else:
+            # use a fixed-width dtype; the bare np.int alias is deprecated in
+            # newer NumPy and int64 matches torch.LongTensor
+            return img, torch.LongTensor(np.array(label, dtype=np.int64))
+
+
+class Compose(object):
+    """Composes several transforms together.
+    """
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, *args):
+        for t in self.transforms:
+            args = t(*args)
+        return args
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/README.md b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/README.md
new file mode 100644
index 0000000000..adbad66e89
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/README.md
@@ -0,0 +1,14 @@
+## Prepare Cityscapes training data
+
+### Step 1
+
+After you get a vanilla version of the Cityscapes label maps, first convert the original segmentation label ids to one of the 19 training ids:
+
+```
+python3 datasets/cityscapes/prepare_data.py <cityscapes dir>/gtFine/
+```
+
+### Step 2
+
+- Run `create_lists.sh` in the Cityscapes data folder (the one containing `gtFine` and `leftImg8bit`) to create the image and label lists.
+- Move [info.json](info.json) to the data folder.
\ No newline at end of file diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/create_lists.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/create_lists.sh new file mode 100644 index 0000000000..be4af3e954 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/create_lists.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +find leftImg8bit/train -maxdepth 3 -name "*_leftImg8bit.png" | sort > train_images.txt +find leftImg8bit/val -maxdepth 3 -name "*_leftImg8bit.png" | sort > val_images.txt +find leftImg8bit/test -maxdepth 3 -name "*_leftImg8bit.png" | sort > test_images.txt + +find gtFine/train -maxdepth 3 -name "*_trainIds.png" | sort > train_labels.txt +find gtFine/val -maxdepth 3 -name "*_trainIds.png" | sort > val_labels.txt + diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/info.json b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/info.json new file mode 100644 index 0000000000..aae1cc4cda --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/info.json @@ -0,0 +1 @@ +{"std": [0.1829540508368939, 0.18656561047509476, 0.18447508988480435], "mean": [0.29010095242892997, 0.32808144844279574, 0.28696394422942517]} \ No newline at end of file diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/prepare_data.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/prepare_data.py new file mode 100644 index 0000000000..4958f18867 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/cityscapes/prepare_data.py @@ -0,0 +1,136 @@ +from collections import namedtuple +import os +from os.path import join, split, exists +import sys + +import numpy as np +from PIL import Image + +# a label and all meta information +Label = namedtuple( 'Label' , [ + + 'name' , # The identifier of this label, e.g. 'car', 'person', ... . + # We use them to uniquely name a class + + 'id' , # An integer ID that is associated with this label. + # The IDs are used to represent the label in ground truth images + # An ID of -1 means that this label does not have an ID and thus + # is ignored when creating ground truth images (e.g. license plate). + # Do not modify these IDs, since exactly these IDs are expected by the + # evaluation server. + + 'trainId' , # Feel free to modify these IDs as suitable for your method. Then create + # ground truth images with train IDs, using the tools provided in the + # 'preparation' folder. However, make sure to validate or submit results + # to our evaluation server using the regular IDs above! + # For trainIds, multiple labels might have the same ID. Then, these labels + # are mapped to the same class in the ground truth images. For the inverse + # mapping, we use the label that is defined first in the list below. + # For example, mapping all void-type classes to the same ID in training, + # might make sense for some approaches. + # Max value is 255! + + 'category' , # The name of the category that this label belongs to + + 'categoryId' , # The ID of this category. Used to create ground truth images + # on category level. 
+ + 'hasInstances', # Whether this label distinguishes between single instances or not + + 'ignoreInEval', # Whether pixels having this class as ground truth label are ignored + # during evaluations or not + + 'color' , # The color of this label + ] ) + + +labels = [ + # name id trainId category catId hasInstances ignoreInEval color + Label( 'unlabeled' , 0 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'ego vehicle' , 1 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'rectification border' , 2 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'out of roi' , 3 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'static' , 4 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'dynamic' , 5 , 255 , 'void' , 0 , False , True , (111, 74, 0) ), + Label( 'ground' , 6 , 255 , 'void' , 0 , False , True , ( 81, 0, 81) ), + Label( 'road' , 7 , 0 , 'flat' , 1 , False , False , (128, 64,128) ), + Label( 'sidewalk' , 8 , 1 , 'flat' , 1 , False , False , (244, 35,232) ), + Label( 'parking' , 9 , 255 , 'flat' , 1 , False , True , (250,170,160) ), + Label( 'rail track' , 10 , 255 , 'flat' , 1 , False , True , (230,150,140) ), + Label( 'building' , 11 , 2 , 'construction' , 2 , False , False , ( 70, 70, 70) ), + Label( 'wall' , 12 , 3 , 'construction' , 2 , False , False , (102,102,156) ), + Label( 'fence' , 13 , 4 , 'construction' , 2 , False , False , (190,153,153) ), + Label( 'guard rail' , 14 , 255 , 'construction' , 2 , False , True , (180,165,180) ), + Label( 'bridge' , 15 , 255 , 'construction' , 2 , False , True , (150,100,100) ), + Label( 'tunnel' , 16 , 255 , 'construction' , 2 , False , True , (150,120, 90) ), + Label( 'pole' , 17 , 5 , 'object' , 3 , False , False , (153,153,153) ), + Label( 'polegroup' , 18 , 255 , 'object' , 3 , False , True , (153,153,153) ), + Label( 'traffic light' , 19 , 6 , 'object' , 3 , False , False , (250,170, 30) ), + Label( 'traffic sign' , 20 , 7 , 'object' , 3 , False , False , (220,220, 0) ), + Label( 'vegetation' , 21 , 8 , 'nature' , 4 , False , False , (107,142, 35) ), + Label( 'terrain' , 22 , 9 , 'nature' , 4 , False , False , (152,251,152) ), + Label( 'sky' , 23 , 10 , 'sky' , 5 , False , False , ( 70,130,180) ), + Label( 'person' , 24 , 11 , 'human' , 6 , True , False , (220, 20, 60) ), + Label( 'rider' , 25 , 12 , 'human' , 6 , True , False , (255, 0, 0) ), + Label( 'car' , 26 , 13 , 'vehicle' , 7 , True , False , ( 0, 0,142) ), + Label( 'truck' , 27 , 14 , 'vehicle' , 7 , True , False , ( 0, 0, 70) ), + Label( 'bus' , 28 , 15 , 'vehicle' , 7 , True , False , ( 0, 60,100) ), + Label( 'caravan' , 29 , 255 , 'vehicle' , 7 , True , True , ( 0, 0, 90) ), + Label( 'trailer' , 30 , 255 , 'vehicle' , 7 , True , True , ( 0, 0,110) ), + Label( 'train' , 31 , 16 , 'vehicle' , 7 , True , False , ( 0, 80,100) ), + Label( 'motorcycle' , 32 , 17 , 'vehicle' , 7 , True , False , ( 0, 0,230) ), + Label( 'bicycle' , 33 , 18 , 'vehicle' , 7 , True , False , (119, 11, 32) ), + Label( 'license plate' , -1 , -1 , 'vehicle' , 7 , False , True , ( 0, 0,142) ), +] + + +def label2id(image): + array = np.array(image) + out_array = np.empty(array.shape, dtype=array.dtype) + for l in labels: + if 0 <= l.trainId < 255: + out_array[array == l.trainId] = l.id + return Image.fromarray(out_array) + + +def id2label(image): + array = np.array(image) + out_array = np.empty(array.shape, dtype=array.dtype) + for l in labels: + out_array[array == l.id] = l.trainId + return Image.fromarray(out_array) + + +def prepare_cityscape_submission(in_dir): + 
our_dir = in_dir + '_id' + for root, dirs, filenames in os.walk(in_dir): + for name in filenames: + in_path = join(root, name) + out_path = join(root.replace(in_dir, our_dir), name) + file_dir = split(out_path)[0] + if not exists(file_dir): + os.makedirs(file_dir) + image = Image.open(in_path) + id_map = label2id(image) + print('Writing', out_path) + id_map.save(out_path) + + +def prepare_cityscape_training(in_dir): + for root, dirs, filenames in os.walk(in_dir): + for name in filenames: + parts = name.split('_') + if parts[-1] != 'labelIds.png': + continue + parts[-1] = 'trainIds.png' + out_name = '_'.join(parts) + in_path = join(root, name) + out_path = join(root, out_name) + image = Image.open(in_path) + id_map = id2label(image) + print('Writing', out_path) + id_map.save(out_path) + + +if __name__ == '__main__': + prepare_cityscape_training(sys.argv[1]) diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/compute_mean_std.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/compute_mean_std.py new file mode 100644 index 0000000000..df5ed75118 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/compute_mean_std.py @@ -0,0 +1,44 @@ +import argparse +import json +import numpy as np +from PIL import Image +from os import path as osp + + +def compute_mean_std(data_dir, list_dir): + image_list_path = osp.join(list_dir, 'train_images.txt') + image_list = [line.strip() for line in open(image_list_path, 'r')] + np.random.shuffle(image_list) + pixels = [] + for image_path in image_list[:500]: + image = Image.open(osp.join(data_dir, image_path), 'r') + pixels.append(np.asarray(image).reshape(-1, 3)) + pixels = np.vstack(pixels) + mean = np.mean(pixels, axis=0) / 255 + std = np.std(pixels, axis=0) / 255 + print(mean, std) + info = {'mean': mean.tolist(), 'std': std.tolist()} + with open(osp.join(data_dir, 'info.json'), 'w') as fp: + json.dump(info, fp) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Compute mean and std of a dataset.') + parser.add_argument('data_dir', default='./', required=True, + help='data folder where train_images.txt resides.') + parser.add_argument('list_dir', default=None, required=False, + help='data folder where train_images.txt resides.') + args = parser.parse_args() + if args.list_dir is None: + args.list_dir = args.data_dir + return args + + +def main(): + args = parse_args() + compute_mean_std(args.data_dir, args.list_dir) + + +if __name__ == '__main__': + main() diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/drn.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/drn.py new file mode 100644 index 0000000000..7201d0bc43 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/drn.py @@ -0,0 +1,414 @@ +import pdb + +import torch.nn as nn +import math +import torch.utils.model_zoo as model_zoo + +BatchNorm = nn.BatchNorm2d + + +# __all__ = ['DRN', 'drn26', 'drn42', 'drn58'] + + +webroot = 'http://dl.yf.io/drn/' + +model_urls = { + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'drn-c-26': webroot + 'drn_c_26-ddedf421.pth', + 'drn-c-42': webroot + 'drn_c_42-9d336e8c.pth', + 'drn-c-58': webroot + 'drn_c_58-0a53a92c.pth', + 'drn-d-22': webroot + 'drn_d_22-4bd2f8ea.pth', + 'drn-d-38': webroot + 'drn_d_38-eebb45f0.pth', + 'drn-d-54': webroot + 'drn_d_54-0e0534ff.pth', + 'drn-d-105': webroot + 'drn_d_105-12b40979.pth' +} + + +def 
conv3x3(in_planes, out_planes, stride=1, padding=1, dilation=1): + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=padding, bias=False, dilation=dilation) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, + dilation=(1, 1), residual=True): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride, + padding=dilation[0], dilation=dilation[0]) + self.bn1 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes, + padding=dilation[1], dilation=dilation[1]) + self.bn2 = BatchNorm(planes) + self.downsample = downsample + self.stride = stride + self.residual = residual + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + if self.residual: + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, + dilation=(1, 1), residual=True): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = BatchNorm(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=dilation[1], bias=False, + dilation=dilation[1]) + self.bn2 = BatchNorm(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = BatchNorm(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class DRN(nn.Module): + + def __init__(self, block, layers, num_classes=1000, + channels=(16, 32, 64, 128, 256, 512, 512, 512), + out_map=False, out_middle=False, pool_size=28, arch='D'): + super(DRN, self).__init__() + self.inplanes = channels[0] + self.out_map = out_map + self.out_dim = channels[-1] + self.out_middle = out_middle + self.arch = arch + + if arch == 'C': + self.conv1 = nn.Conv2d(3, channels[0], kernel_size=7, stride=1, + padding=3, bias=False) + self.bn1 = BatchNorm(channels[0]) + self.relu = nn.ReLU(inplace=True) + + self.layer1 = self._make_layer( + BasicBlock, channels[0], layers[0], stride=1) + self.layer2 = self._make_layer( + BasicBlock, channels[1], layers[1], stride=2) + elif arch == 'D': + self.layer0 = nn.Sequential( + nn.Conv2d(3, channels[0], kernel_size=7, stride=1, padding=3, + bias=False), + BatchNorm(channels[0]), + nn.ReLU(inplace=True) + ) + + self.layer1 = self._make_conv_layers( + channels[0], layers[0], stride=1) + self.layer2 = self._make_conv_layers( + channels[1], layers[1], stride=2) + + self.layer3 = self._make_layer(block, channels[2], layers[2], stride=2) + self.layer4 = self._make_layer(block, channels[3], layers[3], stride=2) + self.layer5 = self._make_layer(block, channels[4], layers[4], + dilation=2, new_level=False) + self.layer6 = None if layers[5] == 0 else \ + self._make_layer(block, channels[5], layers[5], dilation=4, + new_level=False) + + if arch == 'C': + self.layer7 = None if layers[6] == 0 else \ + self._make_layer(BasicBlock, 
channels[6], layers[6], dilation=2, + new_level=False, residual=False) + self.layer8 = None if layers[7] == 0 else \ + self._make_layer(BasicBlock, channels[7], layers[7], dilation=1, + new_level=False, residual=False) + elif arch == 'D': + self.layer7 = None if layers[6] == 0 else \ + self._make_conv_layers(channels[6], layers[6], dilation=2) + self.layer8 = None if layers[7] == 0 else \ + self._make_conv_layers(channels[7], layers[7], dilation=1) + + if num_classes > 0: + self.avgpool = nn.AvgPool2d(pool_size) + self.fc = nn.Conv2d(self.out_dim, num_classes, kernel_size=1, + stride=1, padding=0, bias=True) + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, BatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_layer(self, block, planes, blocks, stride=1, dilation=1, + new_level=True, residual=True): + assert dilation == 1 or dilation % 2 == 0 + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + BatchNorm(planes * block.expansion), + ) + + layers = list() + layers.append(block( + self.inplanes, planes, stride, downsample, + dilation=(1, 1) if dilation == 1 else ( + dilation // 2 if new_level else dilation, dilation), + residual=residual)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, residual=residual, + dilation=(dilation, dilation))) + + return nn.Sequential(*layers) + + def _make_conv_layers(self, channels, convs, stride=1, dilation=1): + modules = [] + for i in range(convs): + modules.extend([ + nn.Conv2d(self.inplanes, channels, kernel_size=3, + stride=stride if i == 0 else 1, + padding=dilation, bias=False, dilation=dilation), + BatchNorm(channels), + nn.ReLU(inplace=True)]) + self.inplanes = channels + return nn.Sequential(*modules) + + def forward(self, x): + y = list() + + if self.arch == 'C': + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + elif self.arch == 'D': + x = self.layer0(x) + + x = self.layer1(x) + y.append(x) + x = self.layer2(x) + y.append(x) + + x = self.layer3(x) + y.append(x) + + x = self.layer4(x) + y.append(x) + + x = self.layer5(x) + y.append(x) + + if self.layer6 is not None: + x = self.layer6(x) + y.append(x) + + if self.layer7 is not None: + x = self.layer7(x) + y.append(x) + + if self.layer8 is not None: + x = self.layer8(x) + y.append(x) + + if self.out_map: + x = self.fc(x) + else: + x = self.avgpool(x) + x = self.fc(x) + x = x.view(x.size(0), -1) + + if self.out_middle: + return x, y + else: + return x + + +class DRN_A(nn.Module): + + def __init__(self, block, layers, num_classes=1000): + self.inplanes = 64 + super(DRN_A, self).__init__() + self.out_dim = 512 * block.expansion + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=1, + dilation=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=1, + dilation=4) + self.avgpool = nn.AvgPool2d(28, stride=1) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + 
for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, BatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + + # for m in self.modules(): + # if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # elif isinstance(m, nn.BatchNorm2d): + # nn.init.constant_(m.weight, 1) + # nn.init.constant_(m.bias, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilation=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, + dilation=(dilation, dilation))) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + + return x + + +def drn_a_50(pretrained=False, **kwargs): + model = DRN_A(Bottleneck, [3, 4, 6, 3], **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['resnet50'])) + return model + + +def drn_c_26(pretrained=False, **kwargs): + model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 1, 1], arch='C', **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['drn-c-26'])) + return model + + +def drn_c_42(pretrained=False, **kwargs): + model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 1, 1], arch='C', **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['drn-c-42'])) + return model + + +def drn_c_58(pretrained=False, **kwargs): + model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 1, 1], arch='C', **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['drn-c-58'])) + return model + + +def drn_d_22(pretrained=False, **kwargs): + model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 1, 1], arch='D', **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['drn-d-22'])) + return model + + +def drn_d_24(pretrained=False, **kwargs): + model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 2, 2], arch='D', **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['drn-d-24'])) + return model + + +def drn_d_38(pretrained=False, **kwargs): + model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 1, 1], arch='D', **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['drn-d-38'])) + return model + + +def drn_d_40(pretrained=False, **kwargs): + model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 2, 2], arch='D', **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['drn-d-40'])) + return model + + +def drn_d_54(pretrained=False, **kwargs): + model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 1, 1], arch='D', **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['drn-d-54'])) + return model + + +def drn_d_56(pretrained=False, **kwargs): + model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 2, 2], arch='D', **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['drn-d-56'])) + return model + + +def 
drn_d_105(pretrained=False, **kwargs): + model = DRN(Bottleneck, [1, 1, 3, 4, 23, 3, 1, 1], arch='D', **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['drn-d-105'])) + return model + + +def drn_d_107(pretrained=False, **kwargs): + model = DRN(Bottleneck, [1, 1, 3, 4, 23, 3, 2, 2], arch='D', **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['drn-d-107'])) + return model \ No newline at end of file diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/Makefile b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/Makefile new file mode 100644 index 0000000000..1cd2a46751 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/Makefile @@ -0,0 +1,30 @@ +PYTORCH_LIB_DIR := /home/fy/pytorch/torch/lib + + +PYTHON := python3 +NVCC_COMPILE := nvcc -c -o +RM_RF := rm -rf + +# Library compilation rules. +NVCC_FLAGS := -x cu -Xcompiler -fPIC -shared + +# File structure. +BUILD_DIR := dense +INCLUDE_DIRS := TH THC THCUNN include include/TH +TORCH_FFI_BUILD := build.py +BN_KERNEL := $(BUILD_DIR)/batchnormp_kernel.so +TORCH_FFI_TARGET := $(BUILD_DIR)/batch_norm/_batch_norm.so + +INCLUDE_FLAGS := $(foreach d, $(INCLUDE_DIRS), -I$(PYTORCH_LIB_DIR)/$d) + +all: $(TORCH_FFI_TARGET) + +$(TORCH_FFI_TARGET): $(BN_KERNEL) $(TORCH_FFI_BUILD) + $(PYTHON) $(TORCH_FFI_BUILD) + +$(BUILD_DIR)/batchnormp_kernel.so: src/batchnormp_cuda_kernel.cu + @mkdir -p $(BUILD_DIR) + $(NVCC_COMPILE) $@ $? $(NVCC_FLAGS) $(INCLUDE_FLAGS) -Isrc + +clean: + $(RM_RF) $(BUILD_DIR) \ No newline at end of file diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/build.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/build.py new file mode 100644 index 0000000000..a6d7926117 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/build.py @@ -0,0 +1,34 @@ +import glob +import os +import torch +from torch.utils.ffi import create_extension + +this_file = os.path.dirname(__file__) + +sources = ['src/batchnormp.c'] +headers = ['src/batchnormp.h'] +defines = [] +with_cuda = False + +abs_path = os.path.dirname(os.path.realpath(__file__)) +extra_objects = [os.path.join(abs_path, 'dense/batchnormp_kernel.so')] +extra_objects += glob.glob('/usr/local/cuda/lib64/*.a') + +if torch.cuda.is_available(): + print('Including CUDA code.') + sources += ['src/batchnormp_cuda.c'] + headers += ['src/batchnormp_cuda.h'] + defines += [('WITH_CUDA', None)] + with_cuda = True + +ffi = create_extension( + 'dense.batch_norm', + headers=headers, + sources=sources, + define_macros=defines, + relative_to=__file__, + with_cuda=with_cuda, + extra_objects=extra_objects) + +if __name__ == '__main__': + ffi.build() \ No newline at end of file diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/dense/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/dense/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/dense/batch_norm/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/dense/batch_norm/__init__.py new file mode 100644 index 0000000000..cc38b19913 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/dense/batch_norm/__init__.py @@ -0,0 +1,12 @@ + +from torch.utils.ffi import _wrap_function +from ._batch_norm import lib as _lib, 
ffi as _ffi + +__all__ = [] +def _import_symbols(locals): + for symbol in dir(_lib): + fn = getattr(_lib, symbol) + locals[symbol] = _wrap_function(fn, _ffi) + __all__.append(symbol) + +_import_symbols(locals()) diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/functions/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/functions/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/functions/batchnormp.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/functions/batchnormp.py new file mode 100644 index 0000000000..cce2321ca7 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/functions/batchnormp.py @@ -0,0 +1,178 @@ +import pdb + +import numpy as np + +import torch +from torch.autograd import Function +from dense import batch_norm + +from queue import Queue +from threading import Condition + +cum_queue = Queue() +broadcast_queue = Queue() +broadcast_cv = Condition() + + +class BatchNormPFunction(Function): + def __init__(self, running_mean, running_var, training, + cum_queue, broadcast_queue, device_ids, sync, + eps=1e-5, momentum=0.1, affine=True): + self.affine = affine + self.eps = eps + self.momentum = momentum + self.running_mean = running_mean + self.running_var = running_var + self.mean = None + self.var = None + self.training = training + self.cum_queue = cum_queue + self.broadcast_queue = broadcast_queue + self.device_ids = device_ids + self.sync = sync + + def forward(self, input, weight, bias): + output = input.new() + self.save_for_backward(input, weight, bias) + + # input_t = input.transpose(0, 1).double() + # input_size = input_t.size() + batch_size = int(input.size(0)) + # input_t.resize_(int(input_size[0]), int(np.prod(input_size[1:]))) + # self.mean = input_t.mean(dim=1) + + device_ids = self.device_ids + # print('device', input.get_device(), flush=True) + if input.is_cuda: + # self.mean.copy_(torch.from_numpy( + # self.cum_mean(input.get_device(), + # self.mean.cpu().numpy(), + # batch_size))) + # var = input_t - torch.unsqueeze(self.mean, 1) + # var *= var + # var = var.mean(dim=1) + # total_var = self.cum_mean( + # input.get_device(), var.cpu().numpy(), batch_size) + # self.std = input_t.new().resize_as_(self.mean). 
\
+            #     copy_(torch.from_numpy(total_var)).sqrt()
+
+            mean_cuda = input.new().resize_(input.size(1))
+            var_cuda = input.new().resize_(input.size(1))
+            batch_norm.BatchNormalizationP_mean_cuda(input, mean_cuda)
+
+            if len(device_ids) > 1 and self.sync and self.training:
+                mean_cuda.copy_(torch.from_numpy(self.cum_mean(
+                    input.get_device(), mean_cuda.cpu().numpy(), batch_size)))
+                batch_norm.BatchNormalizationP_var_cuda(input, mean_cuda, var_cuda)
+                if len(device_ids) > 1 and self.sync and self.training:
+                    var_cuda.copy_(torch.from_numpy(self.cum_mean(
+                        input.get_device(), var_cuda.cpu().numpy(), batch_size)))
+            else:
+                # self.std = input_t.std(dim=1, unbiased=False)
+                batch_norm.BatchNormalizationP_var_cuda(input, mean_cuda, var_cuda)
+            self.mean = mean_cuda
+            self.var = var_cuda
+
+        if not input.is_cuda:
+            # CPU fallback: restore the per-channel statistics (mirroring the
+            # commented-out tensor math above) so that input_t, self.mean and
+            # self.std are actually defined on this path.
+            input_t = input.transpose(0, 1).double()
+            input_size = input_t.size()
+            input_t.resize_(int(input_size[0]), int(np.prod(input_size[1:])))
+            self.mean = input_t.mean(dim=1)
+            self.std = input_t.std(dim=1, unbiased=False)
+            batch_norm.BatchNormalizationP_forward(
+                input, output, weight, bias,
+                self.running_mean, self.running_var, self.mean, self.std,
+                self.training, self.momentum, self.eps)
+        else:
+            batch_norm.BatchNormalizationP_forward_cuda(
+                input, output, weight, bias,
+                self.running_mean, self.running_var, self.mean, self.var,
+                self.training, self.momentum, self.eps)
+        return output
+
+    def cum_mean(self, this_device, this_mean, batch_size):
+        cum_queue.put((batch_size, this_mean))
+        total_mean = np.zeros(this_mean.shape, dtype=np.float64)
+        total_batch_size = 0
+        if this_device == self.device_ids[0]:
+            for _ in self.device_ids:
+                item = cum_queue.get()
+                total_batch_size += item[0]
+                total_mean += item[0] * item[1]
+                cum_queue.task_done()
+            total_mean /= total_batch_size
+            broadcast_cv.acquire()
+            for _ in range(len(self.device_ids) - 1):
+                broadcast_queue.put(total_mean)
+            broadcast_cv.notify_all()
+            broadcast_cv.release()
+        else:
+            broadcast_cv.acquire()
+            if broadcast_queue.qsize() == 0:
+                broadcast_cv.wait()
+            total_mean = broadcast_queue.get()
+            broadcast_queue.task_done()
+            broadcast_cv.release()
+        # assert cum_queue.empty()
+        broadcast_queue.join()
+        return total_mean
+
+    def backward(self, grad_output):
+        input, weight, bias = self.saved_tensors
+        grad_input = grad_output.new().resize_as_(input)
+        grad_weight = grad_output.new().resize_as_(weight).zero_()
+        grad_bias = grad_output.new().resize_as_(bias).zero_()
+        if not grad_output.is_cuda:
+            batch_norm.BatchNormalizationP_backward(
+                input, grad_output, grad_input, grad_weight, grad_bias,
+                weight, self.running_mean, self.running_var, self.mean,
+                self.std, self.training, 1, self.eps)
+        else:
+            # grad_output_t = grad_output.transpose(0, 1).double()
+            # batch_size = int(grad_output.size(0))
+            # grad_output_t.resize_(int(grad_output_t.size(0)),
+            #                       int(np.prod(grad_output_t.size()[1:])))
+            # grad_output_mean = grad_output_t.mean(dim=1)
+            # device_ids = self.device_ids
+            # if len(device_ids) > 1 and self.sync:
+            #     grad_output_mean.copy_(torch.from_numpy(
+            #         self.cum_mean(grad_output.get_device(),
+            #                       grad_output_mean.cpu().numpy(),
+            #                       batch_size)))
+            # grad_output_mean = grad_output_mean.float()
+            #
+            # input_t = input.transpose(0, 1).double()
+            # input_size = input_t.size()
+            # input_t.resize_(int(input_size[0]), int(np.prod(input_size[1:])))
+            # dotP = (input_t - torch.unsqueeze(self.mean.double(), 1)) * \
+            #     grad_output_t
+            # dotP = dotP.mean(dim=1)
+            # if len(device_ids) > 1 and self.sync:
+            #     dotP.copy_(torch.from_numpy(
+            #         self.cum_mean(grad_output.get_device(),
+            #                       dotP.cpu().numpy(),
+            #                       batch_size)))
+            # dotP = dotP.float()
+
+            batch_size = int(grad_output.size(0))
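+            # Sync note: the kernels below reduce gradOutput to two
+            # per-channel statistics, mean(gradOutput) and
+            # mean((input - mean) * gradOutput); averaging these across
+            # device_ids via cum_mean() makes this backward pass match a
+            # batch norm computed over the combined batch.
+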
grad_output_mean_cuda = grad_output.new().resize_(grad_output.size(1)) + dotP_cuda = grad_output.new().resize_( + grad_output.size(1)) + batch_norm.BatchNormalizationP_mean_grad_cuda( + input, grad_output, self.running_mean, + self.mean, grad_output_mean_cuda, dotP_cuda, self.training + ) + if len(self.device_ids) > 1 and self.sync: + grad_output_mean_cuda.copy_(torch.from_numpy( + self.cum_mean(grad_output.get_device(), + grad_output_mean_cuda.cpu().numpy(), + batch_size))) + dotP_cuda.copy_(torch.from_numpy( + self.cum_mean(grad_output.get_device(), + dotP_cuda.cpu().numpy(), + batch_size))) + + # pdb.set_trace() + + batch_norm.BatchNormalizationP_backward_cuda( + input, grad_output, grad_output_mean_cuda, dotP_cuda, + grad_input, grad_weight, grad_bias, + weight, self.running_mean, self.running_var, + self.mean, self.var, self.training, 1, self.eps) + return grad_input, grad_weight, grad_bias diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/modules/__init__.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/modules/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/modules/batchnormsync.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/modules/batchnormsync.py new file mode 100644 index 0000000000..644249feca --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/modules/batchnormsync.py @@ -0,0 +1,64 @@ +from queue import Queue + +import torch +from torch.nn import Module +from torch.nn.parameter import Parameter +from functions.batchnormp import BatchNormPFunction + + +class BatchNormSync(Module): + + sync = True + checking_mode = False + + def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, + device_ids=None): + super(BatchNormSync, self).__init__() + self.num_features = num_features + self.affine = affine + self.eps = eps + self.momentum = momentum + if self.affine: + self.weight = Parameter(torch.Tensor(num_features)) + self.bias = Parameter(torch.Tensor(num_features)) + else: + self.register_parameter('weight', None) + self.register_parameter('bias', None) + self.register_buffer('running_mean', torch.zeros(num_features)) + self.register_buffer('running_var', torch.ones(num_features)) + self.mean = torch.zeros(num_features) + self.std = torch.ones(num_features) + self.reset_parameters() + self.cum_queue = Queue() + self.broadcast_queue = Queue() + if device_ids is None: + self.device_ids = list(range(torch.cuda.device_count())) + else: + self.device_ids = device_ids + + def reset_parameters(self): + self.running_mean.zero_() + self.running_var.fill_(1) + self.mean.zero_() + self.std.fill_(1) + if self.affine: + if BatchNormSync.checking_mode: + self.weight.data.fill_(1) + else: + self.weight.data.uniform_() + self.bias.data.zero_() + + def forward(self, input): + training = int(self.training) + assert input.size(1) == self.num_features + + bn_func = BatchNormPFunction( + self.running_mean, self.running_var, # self.mean, self.std, + training, self.cum_queue, self.broadcast_queue, self.device_ids, + BatchNormSync.sync, self.eps, self.momentum, self.affine) + return bn_func(input, self.weight, self.bias) + + def __repr__(self): + return ('{name}({num_features}, eps={eps}, momentum={momentum},' + ' affine={affine})' + .format(name=self.__class__.__name__, **self.__dict__)) \ No newline at end of file diff --git 
a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp.c b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp.c
new file mode 100644
index 0000000000..5619a53da6
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp.c
@@ -0,0 +1,159 @@
+#include <TH/TH.h>
+#include "batchnormp.h"
+
+#define THNN_CHECK_SHAPE(I1, I2) \
+  if (I1 != NULL && I2 != NULL && !THFloatTensor_isSameSizeAs(I1, I2)) \
+  { \
+    THDescBuff s1 = THFloatTensor_sizeDesc(I1); \
+    THDescBuff s2 = THFloatTensor_sizeDesc(I2); \
+    THError(#I1 " and " #I2 " shapes do not match: " \
+            #I1 " %s, " #I2 " %s", s1.str, s2.str); \
+  }
+
+void BatchNormalizationP_forward(
+  THFloatTensor *input, THFloatTensor *output,
+  THFloatTensor *weight, THFloatTensor *bias,
+  THFloatTensor *running_mean, THFloatTensor *running_var,
+  THFloatTensor *save_mean, THFloatTensor *save_std,
+  int train, double momentum, double eps)
+{
+  THFloatTensor_resizeAs(output, input);
+  int64_t nInput = THFloatTensor_size(input, 1);
+  int64_t f;
+  ptrdiff_t n = THFloatTensor_nElement(input) / nInput;
+
+  #pragma omp parallel for
+  for (f = 0; f < nInput; ++f) {
+    THFloatTensor *in = THFloatTensor_newSelect(input, 1, f);
+    THFloatTensor *out = THFloatTensor_newSelect(output, 1, f);
+
+    float mean, invstd, std;
+
+    if (train) {
+      // compute mean per input
+//      double sum = 0;
+//      TH_TENSOR_APPLY(float, in, sum += *in_data;);
+//
+//      mean = (float) sum / n;
+//      THFloatTensor_set1d(save_mean, f, (float) mean);
+
+      mean = THFloatTensor_get1d(save_mean, f);
+      std = THFloatTensor_get1d(save_std, f);
+      invstd = (float) (1 / (std + eps));
+
+      // compute variance per input
+//      sum = 0;
+//      TH_TENSOR_APPLY(float, in,
+//        sum += (*in_data - mean) * (*in_data - mean););
+//
+//      if (sum == 0 && eps == 0.0) {
+//        invstd = 0;
+//      } else {
+//        invstd = (float) (1 / sqrt(sum/n + eps));
+//      }
+//      THFloatTensor_set1d(save_std, f, (float) invstd);
+
+      // update running averages
+      THFloatTensor_set1d(running_mean, f,
+        (float) (momentum * mean + (1 - momentum) * THFloatTensor_get1d(running_mean, f)));
+
+      double unbiased_var = std * n / (n - 1);
+      THFloatTensor_set1d(running_var, f,
+        (float) (momentum * unbiased_var + (1 - momentum) * THFloatTensor_get1d(running_var, f)));
+    } else {
+      mean = THFloatTensor_get1d(running_mean, f);
+      invstd = 1 / sqrt(THFloatTensor_get1d(running_var, f) + eps);
+    }
+
+    // compute output
+    float w = weight ? THFloatTensor_get1d(weight, f) : 1;
+    float b = bias ? THFloatTensor_get1d(bias, f) : 0;
+
+    TH_TENSOR_APPLY2(float, in, float, out,
+      *out_data = (float) (((*in_data - mean) * invstd) * w + b););
+
+    THFloatTensor_free(out);
+    THFloatTensor_free(in);
+  }
+}
+
+void BatchNormalizationP_backward(
+  THFloatTensor *input, THFloatTensor *gradOutput, THFloatTensor *gradInput,
+  THFloatTensor *gradWeight, THFloatTensor *gradBias, THFloatTensor *weight,
+  THFloatTensor *running_mean, THFloatTensor *running_var,
+  THFloatTensor *save_mean, THFloatTensor *save_std,
+  int train, double scale, double eps)
+{
+  THNN_CHECK_SHAPE(input, gradOutput);
+  int64_t nInput = THFloatTensor_size(input, 1);
+  int64_t f;
+  ptrdiff_t n = THFloatTensor_nElement(input) / nInput;
+
+  #pragma omp parallel for
+  for (f = 0; f < nInput; ++f) {
+    THFloatTensor *in = THFloatTensor_newSelect(input, 1, f);
+    THFloatTensor *gradOut = THFloatTensor_newSelect(gradOutput, 1, f);
+    float w = weight ? THFloatTensor_get1d(weight, f) : 1;
+    float mean, invstd;
+    if (train) {
+      mean = THFloatTensor_get1d(save_mean, f);
+      invstd = 1 / (THFloatTensor_get1d(save_std, f) + eps);
+    } else {
+      mean = THFloatTensor_get1d(running_mean, f);
+      invstd = 1 / sqrt(THFloatTensor_get1d(running_var, f) + eps);
+    }
+
+    // sum over all gradOutput in feature plane
+    double sum = 0;
+    TH_TENSOR_APPLY(float, gradOut, sum += *gradOut_data;);
+
+    // dot product of the Q(X) and gradOutput
+    double dotp = 0;
+    TH_TENSOR_APPLY2(float, in, float, gradOut,
+      dotp += (*in_data - mean) * (*gradOut_data););
+
+    if (gradInput) {
+      THFloatTensor_resizeAs(gradInput, input);
+      THFloatTensor *gradIn = THFloatTensor_newSelect(gradInput, 1, f);
+
+      if (train) {
+        // when in training mode
+        // Q(X) = X - E[x] ; i.e. input centered to zero mean
+        // Y = Q(X) / σ    ; i.e. BN output before weight and bias
+        // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w
+
+        // projection of gradOutput on to output scaled by std
+        float k = (float) dotp * invstd * invstd / n;
+        TH_TENSOR_APPLY2(float, gradIn, float, in,
+          *gradIn_data = (*in_data - mean) * k;);
+
+        double gradMean = sum / n;
+        TH_TENSOR_APPLY2(float, gradIn, float, gradOut,
+          *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;);
+
+      } else {
+        // when in evaluation mode
+        // Q(X) = X - running_mean ; i.e. input centered to zero mean
+        // Y = Q(X) / running_std  ; i.e. BN output before weight and bias
+        // dL/dX = w / running_std
+        TH_TENSOR_APPLY2(float, gradIn, float, gradOut,
+          *gradIn_data = *gradOut_data * invstd * w;);
+      }
+
+      THFloatTensor_free(gradIn);
+    }
+
+    if (gradWeight) {
+      float val = THFloatTensor_get1d(gradWeight, f);
+      THFloatTensor_set1d(gradWeight, f, val + scale * dotp * invstd);
+    }
+
+    if (gradBias) {
+      float val = THFloatTensor_get1d(gradBias, f);
+      THFloatTensor_set1d(gradBias, f, val + scale * sum);
+    }
+
+    THFloatTensor_free(gradOut);
+    THFloatTensor_free(in);
+  }
+}
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp.h b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp.h
new file mode 100644
index 0000000000..a25ea06774
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp.h
@@ -0,0 +1,16 @@
+// #include <TH/TH.h>
+
+void BatchNormalizationP_forward(
+  THFloatTensor *input, THFloatTensor *output,
+  THFloatTensor *weight, THFloatTensor *bias,
+  THFloatTensor *running_mean, THFloatTensor *running_var,
+  THFloatTensor *save_mean, THFloatTensor *save_std,
+  int train, double momentum, double eps);
+
+
+void BatchNormalizationP_backward(
+  THFloatTensor *input, THFloatTensor *gradOutput, THFloatTensor *gradInput,
+  THFloatTensor *gradWeight, THFloatTensor *gradBias, THFloatTensor *weight,
+  THFloatTensor *running_mean, THFloatTensor *running_var,
+  THFloatTensor *save_mean, THFloatTensor *save_std,
+  int train, double scale, double eps);
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda.c b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda.c
new file mode 100644
index 0000000000..39c3efcdf7
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda.c
@@ -0,0 +1,55 @@
+// #include "auto_gpu.h"
+#include <THC/THC.h>
+
+ #include "batchnormp_cuda_kernel.h"
+
+
+extern THCState *state;
+
+void BatchNormalizationP_forward_cuda(
+    THCudaTensor *input, THCudaTensor *output,
+
THCudaTensor *weight, THCudaTensor *bias, + THCudaTensor *running_mean, THCudaTensor *running_var, + THCudaTensor *save_mean, THCudaTensor *save_std, + int train, double momentum, double eps) { + THNN_CudaBatchNormalization_updateOutputhaha( + state, input, output, weight, bias, running_mean, running_var, + save_mean, save_std, train, momentum, eps); +} + +void BatchNormalizationP_mean_cuda( + THCudaTensor *input, THCudaTensor *save_mean) { + THNN_CudaBatchNormalization_mean( + state, input, save_mean); +} + + +void BatchNormalizationP_var_cuda( + THCudaTensor *input, THCudaTensor *save_mean, THCudaTensor *save_var) { + THNN_CudaBatchNormalization_var( + state, input, save_mean, save_var); +} + + +void BatchNormalizationP_backward_cuda( + THCudaTensor *input, THCudaTensor *gradOutput, + THCudaTensor *gradOutputMean, THCudaTensor *dotP, + THCudaTensor *gradInput, + THCudaTensor *gradWeight, THCudaTensor *gradBias, THCudaTensor *weight, + THCudaTensor *running_mean, THCudaTensor *running_var, + THCudaTensor *save_mean, THCudaTensor *save_std, + int train, double scale, double eps) { + THNN_CudaBatchNormalization_backwardhaha( + state, input, gradOutput, gradOutputMean, dotP, + gradInput, gradWeight, gradBias, weight, + running_mean, running_var, save_mean, save_std, train, scale, eps); +} + +void BatchNormalizationP_mean_grad_cuda( + THCudaTensor *input, THCudaTensor *gradOutput, + THCudaTensor *runningMean, THCudaTensor *saveMean, + THCudaTensor *gradOutputMean, THCudaTensor *dotP, int train) { + THNN_CudaBatchNormalization_mean_grad( + state, input, gradOutput, runningMean, saveMean, + gradOutputMean, dotP, train); +} \ No newline at end of file diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda.h b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda.h new file mode 100644 index 0000000000..f4b4450390 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda.h @@ -0,0 +1,30 @@ +void BatchNormalizationP_forward_cuda( + THCudaTensor *input, THCudaTensor *output, + THCudaTensor *weight, THCudaTensor *bias, + THCudaTensor *running_mean, THCudaTensor *running_var, + THCudaTensor *save_mean, THCudaTensor *save_std, + int train, double momentum, double eps); + + +void BatchNormalizationP_mean_cuda( + THCudaTensor *input, THCudaTensor *save_mean); + + +void BatchNormalizationP_var_cuda( + THCudaTensor *input, THCudaTensor *save_mean, THCudaTensor *save_var); + + +void BatchNormalizationP_backward_cuda( + THCudaTensor *input, THCudaTensor *gradOutput, + THCudaTensor *gradOutputMean, THCudaTensor *dotP, + THCudaTensor *gradInput, + THCudaTensor *gradWeight, THCudaTensor *gradBias, THCudaTensor *weight, + THCudaTensor *running_mean, THCudaTensor *running_var, + THCudaTensor *save_mean, THCudaTensor *save_std, + int train, double scale, double eps); + + +void BatchNormalizationP_mean_grad_cuda( + THCudaTensor *input, THCudaTensor *gradOutput, + THCudaTensor *runningMean, THCudaTensor *saveMean, + THCudaTensor *gradOutputMean, THCudaTensor *dotP, int train); \ No newline at end of file diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda_kernel.cu b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda_kernel.cu new file mode 100644 index 0000000000..521e142408 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda_kernel.cu @@ 
-0,0 +1,363 @@
+#include "THCUNN.h"
+#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+
+#include "THCDeviceTensor.cuh"
+#include "THCDeviceTensorUtils.cuh"
+#include "THCDeviceUtils.cuh"
+const int WARP_SIZE = 32;
+
+// The maximum number of threads in a block
+const int MAX_BLOCK_SIZE = 512;
+
+// Number of threads in a block given an input size up to MAX_BLOCK_SIZE
+static int getNumThreads(int nElem) {
+  int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
+  for (int i = 0; i != 5; ++i) {
+    if (nElem <= threadSizes[i]) {
+      return threadSizes[i];
+    }
+  }
+  return MAX_BLOCK_SIZE;
+}
+
+// Returns the index of the most significant 1 bit in `val`.
+__device__ __forceinline__ int getMSB(int val) {
+  return 31 - __clz(val);
+}
+
+template <typename Dtype, typename Acctype>
+struct Float2 {
+  Acctype v1, v2;
+  __device__ Float2() {}
+  __device__ Float2(Dtype v1, Dtype v2) : v1(ScalarConvert<Dtype, Acctype>::to(v1)), v2(ScalarConvert<Dtype, Acctype>::to(v2)) {}
+  __device__ Float2(Dtype v) : v1(ScalarConvert<Dtype, Acctype>::to(v)), v2(ScalarConvert<Dtype, Acctype>::to(v)) {}
+  __device__ Float2(int v) : v1(ScalarConvert<int, Acctype>::to(v)), v2(ScalarConvert<int, Acctype>::to(v)) {}
+  __device__ Float2& operator+=(const Float2& a) {
+    v1 += a.v1;
+    v2 += a.v2;
+    return *this;
+  }
+};
+
+template <typename Dtype, typename Acctype, typename DeviceTensor3>
+struct SumOp {
+  __device__ SumOp(const DeviceTensor3 t) : tensor(t) {}
+  __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) {
+    return ScalarConvert<Dtype, Acctype>::to(tensor[batch][plane][n]);
+  }
+  const DeviceTensor3 tensor;
+};
+
+template <typename Dtype, typename Acctype, typename DeviceTensor3>
+struct VarOp {
+  __device__ VarOp(Acctype m, const DeviceTensor3 t) : mean(m), tensor(t) {}
+  __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) {
+    Dtype val = tensor[batch][plane][n];
+    return (val - mean) * (val - mean);
+  }
+  const Acctype mean;
+  const DeviceTensor3 tensor;
+};
+
+template <typename Dtype, typename Acctype, typename DeviceTensor3>
+struct GradOp {
+  __device__ GradOp(Acctype m, const DeviceTensor3 i, const DeviceTensor3 g)
+    : mean(m), input(i), gradOutput(g) {}
+  __device__ __forceinline__ Float2<Dtype, Acctype> operator()(int batch, int plane, int n) {
+    Dtype g = gradOutput[batch][plane][n];
+    Dtype c = ScalarConvert<Acctype, Dtype>::to(input[batch][plane][n] - mean);
+    return Float2<Dtype, Acctype>(g, g * c);
+  }
+  const Acctype mean;
+  const DeviceTensor3 input;
+  const DeviceTensor3 gradOutput;
+};
+
+// Sum across all threads within a warp
+template <typename T>
+static __device__ __forceinline__ T warpSum(T val) {
+#if __CUDA_ARCH__ >= 300
+  for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
+    val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
+  }
+#else
+  __shared__ T values[MAX_BLOCK_SIZE];
+  values[threadIdx.x] = val;
+  __threadfence_block();
+  const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
+  for (int i = 1; i < WARP_SIZE; i++) {
+    val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
+  }
+#endif
+  return val;
+}
+
+template <typename Dtype, typename Acctype>
+static __device__ __forceinline__ Float2<Dtype, Acctype> warpSum(Float2<Dtype, Acctype> value) {
+  value.v1 = warpSum(value.v1);
+  value.v2 = warpSum(value.v2);
+  return value;
+}
+
+// Sum across (batch, x/y/z) applying Op() pointwise
+template <typename T, typename Op, typename DeviceTensor3>
+__device__ T reduce(Op op, DeviceTensor3 tensor, int plane) {
+  T sum = (T)0;
+  for (int batch = 0; batch < tensor.getSize(0); ++batch) {
+    for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
+      sum += op(batch, plane, x);
+    }
+  }
+
+  // sum over NumThreads within a warp
+  sum = warpSum(sum);
+
+  // 'transpose', and reduce within warp again
+  __shared__ T shared[32];
+  __syncthreads();
+  if (threadIdx.x % WARP_SIZE == 0) {
+    shared[threadIdx.x / WARP_SIZE] = sum;
+  }
+  if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
+    // zero out the other entries in shared
+    shared[threadIdx.x] = (T)0;
+  }
+  __syncthreads();
+  if (threadIdx.x / WARP_SIZE == 0) {
+    sum = warpSum(shared[threadIdx.x]);
+    if (threadIdx.x == 0) {
+      shared[0] = sum;
+    }
+  }
+  __syncthreads();
+
+  // Everyone picks it up, should be broadcast into the whole gradInput
+  return shared[0];
+}
+
+template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3>
+__global__ void BatchNormalizationUpdateOutputInference_kernel(
+    const DeviceTensor3 input,
+    DeviceTensor3 output,
+    DeviceTensor1 runningMean,
+    DeviceTensor1 runningVar,
+    const DeviceTensor1 weight,
+    const DeviceTensor1 bias,
+    Acctype epsilon) {
+
+  int plane = blockIdx.x;
+
+  Acctype invstd = Acctype(1) / sqrt(runningVar[plane].ldg() + epsilon);
+  Acctype mean = ScalarConvert<Dtype, Acctype>::to(runningMean[plane].ldg());
+  Acctype gamma = weight.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(weight[plane].ldg()) : Acctype(1);
+  Acctype beta = bias.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(bias[plane].ldg()) : Acctype(0);
+
+  // Write normalized and update the output
+  for (int batch = 0; batch < input.getSize(0); batch++) {
+    for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
+      Dtype inp = input[batch][plane][x].ldg();
+      output[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to(gamma * (inp - mean) * invstd + beta);
+    }
+  }
+}
+
+template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3>
+__global__ void BatchNormalizationMean_kernel(
+    const DeviceTensor3 input,
+    DeviceTensor1 out_mean) {
+  int plane = blockIdx.x;
+  int N = input.getSize(0) * input.getSize(2);
+
+  Acctype norm = Acctype(1) / N;
+  Acctype mean = reduce<Acctype>(SumOp<Dtype, Acctype, DeviceTensor3>(input), input, plane) * norm;
+  if (threadIdx.x == 0) {
+    out_mean[plane] = ScalarConvert<Acctype, Dtype>::to(mean);
+  }
+}
+
+
+template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3>
+__global__ void BatchNormalizationVar_kernel(
+    const DeviceTensor3 input,
+    const DeviceTensor1 in_mean,
+    DeviceTensor1 out_var) {
+  int plane = blockIdx.x;
+  int N = input.getSize(0) * input.getSize(2);
+
+  Acctype norm = Acctype(1) / N;
+  Acctype mean = ScalarConvert<Dtype, Acctype>::to(in_mean[plane]);
+
+  Acctype var = reduce<Acctype>(VarOp<Dtype, Acctype, DeviceTensor3>(mean, input), input, plane) * norm;
+  if (threadIdx.x == 0) {
+    out_var[plane] = ScalarConvert<Acctype, Dtype>::to(var);
+  }
+}
+
+template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3>
+__global__ void BatchNormalizationUpdateOutput_kernelhaha(
+    const DeviceTensor3 input,
+    DeviceTensor3 output,
+    const DeviceTensor1 weight,
+    const DeviceTensor1 bias,
+    const Acctype epsilon,
+    const Acctype momentum,
+    DeviceTensor1 runningMean,
+    DeviceTensor1 runningVar,
+    DeviceTensor1 saveMean,
+    DeviceTensor1 saveVar) {
+
+
+  int plane = blockIdx.x;
+  int N = input.getSize(0) * input.getSize(2);
+
+
+  // Compute the mean and variance across (batch, x/y/z)
+
+  /* Acctype norm = Acctype(1) / N;
+  Acctype mean = reduce<Acctype>(SumOp<Dtype, Acctype, DeviceTensor3>(input), input, plane) * norm;
+  __syncthreads();
+  Acctype varN = reduce<Acctype>(VarOp<Dtype, Acctype, DeviceTensor3>(mean, input), input, plane);
+  Acctype invStd = 0;
+  if (varN != Acctype(0) || epsilon != Acctype(0)) {
+    invStd = 1 / sqrt(varN * norm + epsilon);
+  } */
+
+  Acctype mean = ScalarConvert<Dtype, Acctype>::to(saveMean[plane]);
+  Acctype var = ScalarConvert<Dtype, Acctype>::to(saveVar[plane]);
+  Acctype invStd = 1 / sqrt(var + epsilon);
+
+  // Save the mean, variance, and moving averages
+  if (threadIdx.x == 0) {
+    // Momentum based writeback
+    // Acctype unbiasedVar = varN / (N - 1);
+    Acctype unbiasedVar = var * N / (N - 1);
+    // saveMean[plane] = ScalarConvert<Acctype, Dtype>::to(mean);
+    // saveStd[plane] = ScalarConvert<Acctype, Dtype>::to(invStd);
+    runningMean[plane] = ScalarConvert<Acctype, Dtype>::to((1 - momentum) * runningMean[plane] + momentum * mean);
+    runningVar[plane] = ScalarConvert<Acctype, Dtype>::to((1 - momentum) * runningVar[plane] + momentum * unbiasedVar);
+  }
+
+  // Write normalized and update the output
+  Acctype gamma = weight.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(weight[plane]) : ScalarConvert<int, Acctype>::to(1);
+  Acctype beta = bias.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(bias[plane]) : ScalarConvert<int, Acctype>::to(0);
+  for (int batch = 0; batch < input.getSize(0); ++batch) {
+    for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
+      Dtype inp = input[batch][plane][x].ldg();
+      output[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to(gamma * (inp - mean) * invStd + beta);
+    }
+  }
+}
+
+
+template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3>
+__global__ void BatchNormalizationMeanGrad_kernel(
+    const DeviceTensor3 input,
+    const DeviceTensor3 gradOutput,
+    const DeviceTensor1 runningMean,
+    const DeviceTensor1 saveMean,
+    DeviceTensor1 gradOutputMean_all,
+    DeviceTensor1 dotP_all,
+    bool train) {
+  int plane = blockIdx.x;
+  int N = gradOutput.getSize(0) * gradOutput.getSize(2);
+
+  Acctype mean;
+  if (train) {
+    mean = ScalarConvert<Dtype, Acctype>::to(saveMean[plane]);
+  } else {
+    mean = ScalarConvert<Dtype, Acctype>::to(runningMean[plane]);
+  }
+
+  Acctype norm = Acctype(1) / N;
+  GradOp<Dtype, Acctype, DeviceTensor3> g(mean, input, gradOutput);
+  Float2<Dtype, Acctype> res = reduce<Float2<Dtype, Acctype>, GradOp<Dtype, Acctype, DeviceTensor3>, DeviceTensor3>(g, gradOutput, plane);
+  Acctype gradOutputMean = res.v1 * norm;
+  Acctype dotP = res.v2 * norm;
+
+  if (threadIdx.x == 0) {
+    gradOutputMean_all[plane] = ScalarConvert<Acctype, Dtype>::to(gradOutputMean);
+    dotP_all[plane] = ScalarConvert<Acctype, Dtype>::to(dotP);
+  }
+}
+
+template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3>
+__global__ void BatchNormalizationBackward_kernel(
+    const DeviceTensor3 input,
+    const DeviceTensor3 gradOutput,
+    const DeviceTensor1 gradOutputMean,
+    const DeviceTensor1 dotP_all,
+    DeviceTensor3 gradInput,
+    DeviceTensor1 gradWeight,
+    DeviceTensor1 gradBias,
+    const DeviceTensor1 weight,
+    const DeviceTensor1 runningMean,
+    const DeviceTensor1 runningVar,
+    const DeviceTensor1 saveMean,
+    const DeviceTensor1 saveVar,
+    bool train,
+    Acctype scale,
+    double eps) {
+
+  int plane = blockIdx.x;
+  int N = gradOutput.getSize(0) * gradOutput.getSize(2);
+
+  Acctype mean, stdVal;
+  if (train) {
+    mean = ScalarConvert<Dtype, Acctype>::to(saveMean[plane]);
+    stdVal = 1 / sqrt(ScalarConvert<Dtype, Acctype>::to(saveVar[plane]) + eps);
+  } else {
+    mean = ScalarConvert<Dtype, Acctype>::to(runningMean[plane]);
+    stdVal = 1 / sqrt(runningVar[plane] + eps);
+  }
+
+  Acctype weightVal = weight.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(weight[plane]) : Acctype(1);
+  // Acctype norm = Acctype(1) / N;
+
+  // Compute two values across (batch, x/y/z) in one pass:
+  // 1. Sum(gradOutput)
+  // 2. DotProduct(input - mean, gradOutput)
+  // GradOp<Dtype, Acctype, DeviceTensor3> g(mean, input, gradOutput);
+  // Float2<Dtype, Acctype> res = reduce<Float2<Dtype, Acctype>, GradOp<Dtype, Acctype, DeviceTensor3>, DeviceTensor3>(g, gradOutput, plane);
+  // Acctype gradOutputSum = res.v1;
+  Acctype gradOutputSum = ScalarConvert<Dtype, Acctype>::to(gradOutputMean[plane]) * N;
+  // Acctype dotP = res.v2;
+  Acctype dotP = ScalarConvert<Dtype, Acctype>::to(dotP_all[plane]);
+
+  // Acctype gradMean = gradOutputSum * norm;
+  Acctype gradMean = ScalarConvert<Dtype, Acctype>::to(gradOutputMean[plane]);
+  // Acctype projScale = dotP * norm * stdVal * stdVal;
+  Acctype projScale = dotP * stdVal * stdVal;
+  Acctype gradScale = stdVal * weightVal;
+
+  if (gradInput.numElements() > 0) {
+    for (int batch = 0; batch < gradOutput.getSize(0); ++batch) {
+      for (int x = threadIdx.x; x < gradOutput.getSize(2); x += blockDim.x) {
+        Dtype gradOut = gradOutput[batch][plane][x];
+        if (train) {
+          Dtype inp = input[batch][plane][x];
+          Acctype proj = (inp - mean) * projScale;
+          gradInput[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to((gradOut - proj - gradMean) * gradScale);
+        } else {
+          gradInput[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to(gradOut * gradScale);
+        }
+      }
+    }
+  }
+
+  if (gradWeight.numElements() > 0) {
+    if (threadIdx.x == 0) {
+      gradWeight[plane] += ScalarConvert<Acctype, Dtype>::to(scale * dotP * stdVal);
+    }
+  }
+
+  if (gradBias.numElements() > 0) {
+    if (threadIdx.x == 0) {
+      gradBias[plane] += ScalarConvert<Acctype, Dtype>::to(scale * gradOutputSum);
+    }
+  }
+}
+
+#include "generic/batchnormp_cuda.cu"
+#include "THCGenerateFloatTypes.h"
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda_kernel.h b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda_kernel.h
new file mode 100644
index 0000000000..a03df3c33a
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/batchnormp_cuda_kernel.h
@@ -0,0 +1,16 @@
+#include <THC/THC.h>
+
+void THNN_CudaBatchNormalization_updateOutputhaha(
+  THCState *state, THCudaTensor *input_, THCudaTensor *output_,
+  THCudaTensor *weight_, THCudaTensor *bias_, THCudaTensor *runningMean_,
+  THCudaTensor *runningVar_, THCudaTensor *saveMean_, THCudaTensor *saveStd_,
+  int train, double momentum, double eps);
+
+
+void THNN_CudaBatchNormalization_backwardhaha(
+  THCState *state, THCudaTensor *input_, THCudaTensor *gradOutput_,
+  THCudaTensor *gradOutputMean_, THCudaTensor *dotP,
+  THCudaTensor *gradInput_, THCudaTensor *gradWeight_, THCudaTensor *gradBias_,
+  THCudaTensor *weight_, THCudaTensor *runningMean_, THCudaTensor *runningVar_,
+  THCudaTensor *saveMean_, THCudaTensor *saveStd_, int train, double scale,
+  double eps);
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/generic/batchnormp_cuda.cu b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/generic/batchnormp_cuda.cu
new file mode 100644
index 0000000000..4a7ad9d499
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/src/generic/batchnormp_cuda.cu
@@ -0,0 +1,185 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/batchnormp_cuda.cu"
+#else
+
+#define DeviceTensor3 THCDeviceTensor<real, 3>
+#define DeviceTensor1 THCDeviceTensor<real, 1>
+
+template <int Dim>
+static THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
+  if (!t) {
+    return THCDeviceTensor<real, Dim>();
+  }
+
+  int inDim = THCTensor_(nDimension)(state, t);
+  if (inDim == Dim) {
+    return toDeviceTensor<real, Dim>(state, t);
+  }
+
+  // View in which the last dimensions are collapsed or expanded as needed
+
THAssert(THCTensor_(isContiguous)(state, t));
+  int size[Dim];
+  for (int i = 0; i < Dim || i < inDim; ++i) {
+    if (i < Dim && i < inDim) {
+      size[i] = t->size[i];
+    } else if (i < Dim) {
+      size[i] = 1;
+    } else {
+      size[Dim - 1] *= t->size[i];
+    }
+  }
+  return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
+}
+
+extern "C" void THNN_(BatchNormalization_updateOutputhaha)(
+  THCState *state, THCTensor *input_, THCTensor *output_,
+  THCTensor *weight_, THCTensor *bias_, THCTensor *runningMean_,
+  THCTensor *runningVar_, THCTensor *saveMean_, THCTensor *saveStd_,
+  int train, double momentum, double eps);
+
+extern "C" void THNN_(BatchNormalization_mean)(
+  THCState *state, THCTensor *input_, THCTensor *saveMean_);
+
+extern "C" void THNN_(BatchNormalization_var)(
+  THCState *state, THCTensor *input_, THCTensor *saveMean_,
+  THCTensor *saveVar_);
+
+
+void THNN_(BatchNormalization_mean)(
+  THCState *state, THCTensor *input_, THCTensor *saveMean_) {
+  DeviceTensor3 input = devicetensor<3>(state, input_);
+  DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
+
+  cudaStream_t s = THCState_getCurrentStream(state);
+  cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state);
+
+  dim3 blocks(input.getSize(1));
+  dim3 threads(getNumThreads(input.getSize(2)));
+  BatchNormalizationMean_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
+    input, saveMean);
+  THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(BatchNormalization_var)(
+  THCState *state, THCTensor *input_, THCTensor *saveMean_, THCTensor *saveVar_) {
+  DeviceTensor3 input = devicetensor<3>(state, input_);
+  DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
+  DeviceTensor1 saveVar = devicetensor<1>(state, saveVar_);
+
+  cudaStream_t s = THCState_getCurrentStream(state);
+  cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state);
+
+  dim3 blocks(input.getSize(1));
+  dim3 threads(getNumThreads(input.getSize(2)));
+  BatchNormalizationVar_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
+    input, saveMean, saveVar);
+  THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(BatchNormalization_updateOutputhaha)(
+  THCState *state, THCTensor *input_, THCTensor *output_,
+  THCTensor *weight_, THCTensor *bias_, THCTensor *runningMean_,
+  THCTensor *runningVar_, THCTensor *saveMean_, THCTensor *saveStd_,
+  int train, double momentum, double eps) {
+
+  THCTensor_(resizeAs)(state, output_, input_);
+  DeviceTensor3 input = devicetensor<3>(state, input_);
+  DeviceTensor3 output = devicetensor<3>(state, output_);
+  DeviceTensor1 weight = devicetensor<1>(state, weight_);
+  DeviceTensor1 bias = devicetensor<1>(state, bias_);
+  DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_);
+  DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_);
+  DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
+  DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_);
+
+  cudaStream_t s = THCState_getCurrentStream(state);
+  cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state);
+
+  if (!train) {
+    dim3 blocks(input.getSize(1));
+    dim3 threads(getNumThreads(input.getSize(2)));
+    BatchNormalizationUpdateOutputInference_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
+      input, output, runningMean, runningVar, weight, bias, eps);
+  } else {
+    dim3 blocks(input.getSize(1));
+    dim3 threads(getNumThreads(input.getSize(2)));
+    BatchNormalizationUpdateOutput_kernelhaha<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
+      input, output, weight, bias, eps, momentum, runningMean, runningVar,
+      saveMean, saveStd);
+  }
+  THCudaCheck(cudaGetLastError());
+}
+
+extern "C" void THNN_(BatchNormalization_backwardhaha)(
+  THCState *state, THCTensor *input_, THCTensor *gradOutput_,
+  THCTensor *gradOutputMean_, THCTensor *dotP,
+  THCTensor *gradInput_, THCTensor *gradWeight_, THCTensor *gradBias_,
+  THCTensor *weight_, THCTensor *runningMean_, THCTensor *runningVar_,
+  THCTensor *saveMean_, THCTensor *saveStd_, int train, double scale, double eps);
+
+
+extern "C" void THNN_(BatchNormalization_mean_grad)(
+  THCState *state, THCTensor *input_, THCTensor *gradOutput_,
+  THCTensor *runningMean_, THCTensor *saveMean_,
+  THCTensor *gradOutputMean_, THCTensor *dotP_, int train);
+
+
+void THNN_(BatchNormalization_mean_grad)(
+  THCState *state, THCTensor *input_, THCTensor *gradOutput_,
+  THCTensor *runningMean_, THCTensor *saveMean_,
+  THCTensor *gradOutputMean_, THCTensor *dotP_, int train) {
+
+  THCUNN_check_shape(state, input_, gradOutput_);
+  DeviceTensor3 input = devicetensor<3>(state, input_);
+  DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_);
+  DeviceTensor1 gradOutputMean = devicetensor<1>(state, gradOutputMean_);
+  DeviceTensor1 dotP = devicetensor<1>(state, dotP_);
+
+  DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_);
+  DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
+
+  cudaStream_t s = THCState_getCurrentStream(state);
+
+  dim3 blocks(gradOutput.getSize(1));
+  dim3 threads(getNumThreads(gradOutput.getSize(2)));
+  BatchNormalizationMeanGrad_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
+    input, gradOutput, runningMean, saveMean, gradOutputMean, dotP, train);
+  THCudaCheck(cudaGetLastError());
+}
+
+
+void THNN_(BatchNormalization_backwardhaha)(
+  THCState *state, THCTensor *input_, THCTensor *gradOutput_,
+  THCTensor *gradOutputMean_, THCTensor *dotP_,
+  THCTensor *gradInput_, THCTensor *gradWeight_, THCTensor *gradBias_,
+  THCTensor *weight_, THCTensor *runningMean_, THCTensor *runningVar_,
+  THCTensor *saveMean_, THCTensor *saveStd_, int train, double scale, double eps) {
+
+  THCUNN_check_shape(state, input_, gradOutput_);
+  DeviceTensor3 input = devicetensor<3>(state, input_);
+  DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_);
+  DeviceTensor1 gradOutputMean = devicetensor<1>(state, gradOutputMean_);
+  DeviceTensor1 dotP = devicetensor<1>(state, dotP_);
+  DeviceTensor3 gradInput = devicetensor<3>(state, gradInput_);
+  DeviceTensor1 gradWeight = devicetensor<1>(state, gradWeight_);
+  DeviceTensor1 gradBias = devicetensor<1>(state, gradBias_);
+  DeviceTensor1 weight = devicetensor<1>(state, weight_);
+  DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_);
+  DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_);
+  DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
+  DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_);
+
+  cudaStream_t s = THCState_getCurrentStream(state);
+
+  dim3 blocks(gradOutput.getSize(1));
+  dim3 threads(getNumThreads(gradOutput.getSize(2)));
+  BatchNormalizationBackward_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
+    input, gradOutput, gradOutputMean, dotP, gradInput, gradWeight, gradBias, weight, runningMean, runningVar,
+    saveMean, saveStd, train, scale, eps);
+  THCudaCheck(cudaGetLastError());
+}
+
+#undef DeviceTensor3
+#undef DeviceTensor1
+
+#endif
diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/test.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/test.py
new file mode 100644
index 0000000000..9b74bc821b
--- /dev/null
+++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/test.py
@@ -0,0 +1,54 @@
+import pdb
+import time
+import logging
+
+import torch
+from torch.autograd import Variable
+from torch.autograd import
gradcheck + +from modules import batchnormsync + +FORMAT = "[%(asctime)-15s %(filename)s:%(lineno)d %(funcName)s] %(message)s" +logging.basicConfig(format=FORMAT) +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +batchnormsync.BatchNormSync.checking_mode = True +batchnormsync.BatchNormSync.sync = True + +cuda = True +batch_size = 3 +input = torch.randn(3, 3, 2, 2).float() +# input = torch.Tensor(range(60 * batch_size)).float().resize_(batch_size, 3, 2, 2) / 100 +bn = batchnormsync.BatchNormSync(3, eps=0, affine=True, + device_ids=None) +bn2 = torch.nn.BatchNorm2d(3, eps=0, affine=False) +# bn.train() + +bn1 = batchnormsync.BatchNormSync(3, eps=0, affine=True, device_ids=[0]) + +bn1.train() + +if cuda: + bn = torch.nn.DataParallel(bn) + bn2 = torch.nn.DataParallel(bn2) + + bn = bn.cuda() + bn1 = bn1.cuda() + bn2 = bn2.cuda() + input = input.cuda() + + +inputs = (Variable(input, requires_grad=True),) +# output = bn(inputs[0]) + +# output1 = bn1(inputs[0]) +# output2 = bn2(inputs[0]) +# print((output1 - output2).abs().max()) +# print((output - output2).abs().max()) +# test = gradcheck(bn, inputs, eps=1e-4, atol=1e-4, rtol=1e-8) +for i in range(1000): + logger.info(i) + start_time = time.time() + test = gradcheck(bn, inputs, eps=1e-4, atol=1e-2, rtol=1e-3) + logger.info('%s %f', test, time.time() - start_time) diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/requirements.txt b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/requirements.txt new file mode 100644 index 0000000000..6971bd0b5a --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/requirements.txt @@ -0,0 +1,6 @@ +torch==1.5.0 +apex +torchvision==0.5.0 +onnx +numpy +pillow \ No newline at end of file diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py new file mode 100644 index 0000000000..88d5dc94d7 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py @@ -0,0 +1,1036 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import random +import argparse +import json +import logging +import math +import os +from os.path import exists, join, split +import threading +import torch.multiprocessing as mp +import time +import apex +from apex import amp +import torch.distributed as dist +import numpy as np +import shutil + +import sys +from PIL import Image +import torch +if torch.__version__>= '1.8': + import torch_npu +from torch import nn +import torch.backends.cudnn as cudnn +import torch.optim as optim +from torchvision import datasets, transforms +from torch.autograd import Variable + +import drn +import data_transforms as transforms + +try: + from modules import batchnormsync +except ImportError: + pass + +FORMAT = "[%(asctime)-15s %(filename)s:%(lineno)d %(funcName)s] %(message)s" +logging.basicConfig(format=FORMAT) +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +CITYSCAPE_PALETTE = np.asarray([ + [128, 64, 128], + [244, 35, 232], + [70, 70, 70], + [102, 102, 156], + [190, 153, 153], + [153, 153, 153], + [250, 170, 30], + [220, 220, 0], + [107, 142, 35], + [152, 251, 152], + [70, 130, 180], + [220, 20, 60], + [255, 0, 0], + [0, 0, 142], + [0, 0, 70], + [0, 60, 100], + [0, 80, 100], + [0, 0, 230], + [119, 11, 32], + [0, 0, 0]], dtype=np.uint8) + + +TRIPLET_PALETTE = np.asarray([ + [0, 0, 0, 255], + [217, 83, 79, 255], + [91, 192, 222, 255]], dtype=np.uint8) 
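+
+
+# Example (hypothetical helper, an assumption rather than part of the original
+# script): render an (H, W) uint8 array of predicted class ids as a color
+# image using the palette above; `pred` is assumed to come from an argmax over
+# the model output. PIL 'P'-mode images accept a flat [r, g, b, ...] palette.
+def colorize_prediction(pred, palette=CITYSCAPE_PALETTE):
+    img = Image.fromarray(pred.astype(np.uint8), mode='P')
+    img.putpalette(palette.reshape(-1).tolist())
+    return img.convert('RGB')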
+ + +def fill_up_weights(up): + w = up.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. * f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + + +class DRNSeg(nn.Module): + def __init__(self, model_name, classes, pretrained_model=None, + pretrained=True, use_torch_up=False): + super(DRNSeg, self).__init__() + model = drn.__dict__.get(model_name)( + pretrained=pretrained, num_classes=1000) + pmodel = nn.DataParallel(model) + if pretrained_model is not None: + pmodel.load_state_dict(pretrained_model) + self.base = nn.Sequential(*list(model.children())[:-2]) + + self.seg = nn.Conv2d(model.out_dim, classes, + kernel_size=1, bias=True) + self.softmax = nn.LogSoftmax() + m = self.seg + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + m.bias.data.zero_() + if use_torch_up: + self.up = nn.UpsamplingBilinear2d(scale_factor=8) + else: + up = nn.ConvTranspose2d(classes, classes, 16, stride=8, padding=4, + output_padding=0, groups=classes, + bias=False) + fill_up_weights(up) + up.weight.requires_grad = False + self.up = up + + def forward(self, x): + x = self.base(x) + x = self.seg(x) + y = self.up(x) + return self.softmax(y), x + + def optim_parameters(self, memo=None): + for param in self.base.parameters(): + yield param + for param in self.seg.parameters(): + yield param + + +class SegList(torch.utils.data.Dataset): + def __init__(self, data_dir, phase, transforms, list_dir=None, + out_name=False): + self.list_dir = data_dir if list_dir is None else list_dir + self.data_dir = data_dir + self.out_name = out_name + self.phase = phase + self.transforms = transforms + self.image_list = None + self.label_list = None + self.bbox_list = None + self.read_lists() + + def __getitem__(self, index): + data = [Image.open(join(self.data_dir, self.image_list[index]))] + if self.label_list is not None: + data.append(Image.open( + join(self.data_dir, self.label_list[index]))) + data = list(self.transforms(*data)) + if self.out_name: + if self.label_list is None: + data.append(data[0][0, :, :]) + data.append(self.image_list[index]) + return tuple(data) + + def __len__(self): + return len(self.image_list) + + def read_lists(self): + image_path = join(self.list_dir, self.phase + '_images.txt') + label_path = join(self.list_dir, self.phase + '_labels.txt') + assert exists(image_path) + self.image_list = [line.strip() for line in open(image_path, 'r')] + if exists(label_path): + self.label_list = [line.strip() for line in open(label_path, 'r')] + assert len(self.image_list) == len(self.label_list) + + +class SegListMS(torch.utils.data.Dataset): + def __init__(self, data_dir, phase, transforms, scales, list_dir=None): + self.list_dir = data_dir if list_dir is None else list_dir + self.data_dir = data_dir + self.phase = phase + self.transforms = transforms + self.image_list = None + self.label_list = None + self.bbox_list = None + self.read_lists() + self.scales = scales + + def __getitem__(self, index): + data = [Image.open(join(self.data_dir, self.image_list[index]))] + w, h = data[0].size + if self.label_list is not None: + data.append(Image.open( + join(self.data_dir, self.label_list[index]))) + # data = list(self.transforms(*data)) + out_data = list(self.transforms(*data)) + ms_images = [self.transforms(data[0].resize((int(w * s), int(h * s)), + Image.BICUBIC))[0] + for s in self.scales] 
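+        # Multi-scale evaluation: keep one transformed copy of the image per
+        # scale in self.scales, alongside the default-resolution tensors in
+        # out_data; a test loop can then resize the per-scale predictions
+        # back to the original size and fuse them.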
+ out_data.append(self.image_list[index]) + out_data.extend(ms_images) + return tuple(out_data) + + def __len__(self): + return len(self.image_list) + + def read_lists(self): + image_path = join(self.list_dir, self.phase + '_images.txt') + label_path = join(self.list_dir, self.phase + '_labels.txt') + assert exists(image_path) + self.image_list = [line.strip() for line in open(image_path, 'r')] + if exists(label_path): + self.label_list = [line.strip() for line in open(label_path, 'r')] + assert len(self.image_list) == len(self.label_list) + + +def validate(val_loader, model, criterion, args, ngpus_per_node, eval_score=None, print_freq=10): + batch_time = AverageMeter() + losses = AverageMeter() + score = AverageMeter() + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + for i, (input, target) in enumerate(val_loader): + if type(criterion) in [torch.nn.modules.loss.L1Loss, + torch.nn.modules.loss.MSELoss]: + target = target.float() + + if args.gpu is not None: + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + input = input.to(loc).to(torch.float) + else: + input = input.cuda(args.gpu, non_blocking=True) + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + target = target.to(torch.int32).to(loc, non_blocking=True) + else: + target = target.cuda(args.gpu, non_blocking=True) + + input_var = torch.autograd.Variable(input) + target_var = torch.autograd.Variable(target) + + # compute output + output = model(input_var)[0] + loss = criterion(output, target_var) + + # measure accuracy and record loss + # prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) + losses.update(loss.item(), input.size(0)) + if eval_score is not None: + score.update(eval_score(output, target_var), input.size(0)) + + # measure elapsed time + if i > 5: + batch_time.update(time.time() - end) + end = time.time() + + if i % print_freq == 0: + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + logger.info('Test: [{0}/{1}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format( + i, len(val_loader), batch_time=batch_time, loss=losses)) + + + # logger.info(' * Score {top1.avg:.3f}'.format(top1=score)) + + return score.avg + + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def accuracy(output, target): + """Computes the precision@k for the specified values of k""" + # batch_size = target.size(0) * target.size(1) * target.size(2) + _, pred = output.max(1) + pred = pred.view(1, -1) + target = target.view(1, -1) + correct = pred.eq(target) + correct = correct[target != 255] + correct = correct.view(-1) + score = correct.float().sum(0).mul(100.0 / correct.size(0)) + return score.data[0] + + +def train(train_loader, model, criterion, optimizer, epoch, + args, ngpus_per_node, eval_score=None, print_freq=10): + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + scores = AverageMeter() + + # switch to train mode + model.train() + + end = time.time() + for i, (input, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + if type(criterion) in [torch.nn.modules.loss.L1Loss, + 
torch.nn.modules.loss.MSELoss]: + target = target.float() + + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + input = input.to(loc, non_blocking=True).to(torch.float) + target = target.to(torch.int32).to(loc, non_blocking=True) + else: + input = input.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + input_var = torch.autograd.Variable(input) + target_var = torch.autograd.Variable(target) + # compute output + output = model(input_var)[0] + loss = criterion(output, target_var) + + # measure accuracy and record loss + # prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) + losses.update(loss.item(), input.size(0)) + if eval_score is not None: + scores.update(eval_score(output, target_var), input.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + if args.amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + optimizer.step() + # measure elapsed time + + if args.device == 'npu': + torch.npu.synchronize() + if i > 5: + batch_time.update(time.time() - end) + end = time.time() + + if i % print_freq == 0: + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + logger.info('Epoch: [{0}][{1}/{2}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format( + epoch, i, len(train_loader), batch_time=batch_time, + data_time=data_time, loss=losses)) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + if batch_time.avg: + print("[npu id:", args.gpu, "]", "batch_size:", args.batch_size*args.world_size, + 'Time: {:.3f}'.format(batch_time.avg), '* FPS@all {:.3f}'.format( + args.batch_size * args.world_size / batch_time.avg)) + + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + + +def train_seg(gpu, ngpus_per_node, args): + num_workers = args.workers + batch_size = args.batch_size + crop_size = args.crop_size + + print(' '.join(sys.argv)) + + for k, v in args.__dict__.items(): + print(k, ':', v) + + args.gpu = args.process_device_map[gpu] + + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + args.rank = args.rank * ngpus_per_node + gpu + + if args.device == 'npu': + dist.init_process_group(backend=args.dist_backend, # init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + else: + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + + single_model = DRNSeg(args.arch, args.classes, None, pretrained=True) + if args.pretrained: + # single_model.load_state_dict(torch.load(args.pretrained)["state_dict"]) + pretrained_dict = torch.load(args.pretrained, map_location="cpu")["state_dict"] + if "fc.weight" in pretrained_dict: + print("pop fc layer weight") + pretrained_dict.pop('fc.weight') + pretrained_dict.pop('fc.bias') + single_model.load_state_dict(pretrained_dict, strict=False) + model = single_model + else: + model = single_model + + if args.distributed: + if args.gpu is not None: + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + 
torch.npu.set_device(loc)
+                model = model.to(loc)
+            else:
+                torch.cuda.set_device(args.gpu)
+                model.cuda(args.gpu)
+
+            args.batch_size = int(args.batch_size / args.world_size)
+            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
+        else:
+            if args.device == 'npu':
+                loc = 'npu:{}'.format(args.gpu)
+                model = model.to(loc)
+            else:
+                model.cuda()
+
+    elif args.gpu is not None:
+        if args.device == 'npu':
+            loc = 'npu:{}'.format(args.gpu)
+            torch.npu.set_device(args.gpu)
+            model = model.to(loc)
+        else:
+            torch.cuda.set_device(args.gpu)
+            model = model.cuda(args.gpu)
+
+    else:
+        if args.device == 'npu':
+            loc = 'npu:{}'.format(args.gpu)
+        else:
+            print("before : model = torch.nn.DataParallel(model).cuda()")
+
+    # define loss function (criterion) and optimizer
+    optimizer = torch.optim.SGD(single_model.optim_parameters(),
+                                args.lr,
+                                momentum=args.momentum,
+                                weight_decay=args.weight_decay)
+
+    if args.amp:
+        model, optimizer = amp.initialize(
+            model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale)
+
+    if args.distributed:
+        if args.gpu is not None:
+            if args.pretrained:
+                model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False,
+                                                                  find_unused_parameters=True)
+            else:
+                model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False)
+        else:
+            model = torch.nn.parallel.DistributedDataParallel(model)
+    elif args.gpu is not None:
+        print("[gpu id:", args.gpu, "]",
+              "============================test elif args.gpu is not None:==========================")
+    else:
+        if args.device == 'npu':
+            loc = 'npu:{}'.format(args.gpu)
+            model = torch.nn.DataParallel(model).to(loc)
+        else:
+            model = torch.nn.DataParallel(model).cuda()
+
+    if args.device == 'npu':
+        loc = 'npu:{}'.format(args.gpu)
+        # criterion = nn.NLLLoss(ignore_index=255).to(loc)
+        criterion = nn.NLLLoss().to(loc)
+    else:
+        criterion = nn.NLLLoss(ignore_index=255).cuda(args.gpu)
+
+    # Data loading code
+    data_dir = args.data_dir
+    info = json.load(open(join(data_dir, 'info.json'), 'r'))
+    normalize = transforms.Normalize(mean=info['mean'],
+                                     std=info['std'])
+    t = []
+    if args.random_rotate > 0:
+        t.append(transforms.RandomRotate(args.random_rotate))
+    if args.random_scale > 0:
+        t.append(transforms.RandomScale(args.random_scale))
+    t.extend([transforms.RandomCrop(args.crop_size),
+              transforms.RandomHorizontalFlip(),
+              transforms.ToTensor(),
+              normalize])
+
+    if args.distributed:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(
+            SegList(data_dir, 'train', transforms.Compose(t), list_dir=args.list_dir))
+    else:
+        train_sampler = None
+
+    train_loader = torch.utils.data.DataLoader(
+        SegList(data_dir, 'train', transforms.Compose(t),
+                list_dir=args.list_dir),
+        batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers,
+        pin_memory=True, drop_last=True, sampler=train_sampler
+    )
+    val_loader = torch.utils.data.DataLoader(
+        SegList(data_dir, 'val', transforms.Compose([
+            transforms.RandomCrop(args.crop_size),
+            transforms.ToTensor(),
+            normalize,
+        ]), list_dir=args.list_dir),
+        batch_size=args.batch_size, shuffle=False, num_workers=args.workers,
+        pin_memory=True, drop_last=True,
+    )
+
+    cudnn.benchmark = True
+    best_prec1 = 0
+    start_epoch = 0
+
+    # optionally resume from a checkpoint
+    if args.resume:
+        if os.path.isfile(args.resume):
+            print("=> loading checkpoint '{}'".format(args.resume))
+            if args.gpu is None:
+                checkpoint = torch.load(args.resume)
+            else:
+                # Map model 
to be loaded to specified single gpu. + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + else: + loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + start_epoch = checkpoint['epoch'] + best_prec1 = checkpoint['best_prec1'] + if args.gpu is not None: + # best_prec1 may be from a checkpoint from a different GPU + best_prec1 = best_prec1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + if args.amp: + amp.load_state_dict(checkpoint['amp']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + if args.evaluate: + # validate(val_loader, model, criterion, args, ngpus_per_node, eval_score=accuracy) + validate(val_loader, model, criterion, args, ngpus_per_node) + return + + if args.prof: + profiling(train_loader, model, criterion, optimizer, args) + return + + start_time = time.time() + for epoch in range(start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + + lr = adjust_learning_rate(args, optimizer, epoch) + # train for one epoch + # train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node, eval_score=accuracy) + train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node) + + # evaluate on validation set + # prec1 = validate(val_loader, model, criterion, args, ngpus_per_node, eval_score=accuracy) + prec1 = validate(val_loader, model, criterion, args, ngpus_per_node) + + is_best = prec1 > best_prec1 + best_prec1 = max(prec1, best_prec1) + + if args.device == 'npu' and args.gpu == 0 and epoch == args.epochs - 1: + print("Complete 250 epoch training, take time:{}h".format(round((time.time() - start_time) / 3600.0, 2))) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0): + checkpoint_path = os.path.join(args.save_path, 'checkpoint_latest.pth.tar') + if args.amp: + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': 'args.arch', + 'state_dict': model.state_dict(), + 'best_prec1': best_prec1, + 'optimizer': optimizer.state_dict(), + 'amp': amp.state_dict(), + }, is_best, filename=checkpoint_path) + else: + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': 'args.arch', + 'state_dict': model.state_dict(), + 'best_prec1': best_prec1, + 'optimizer': optimizer.state_dict(), + }, is_best, filename=checkpoint_path) + if (epoch + 1) % args.save_iter == 0: + history_path = os.path.join(args.save_path, 'checkpoint_{:03d}.pth.tar'.format(epoch + 1)) + shutil.copyfile(checkpoint_path, history_path) + + +def profiling(data_loader, model, criterion, optimizer, args): + # switch to train mode + model.train() + + def update(model, input, target, optimizer): + output = model(input)[0] + loss = criterion(output, target) + if args.amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + optimizer.zero_grad() + optimizer.step() + + for step, (input, target) in enumerate(data_loader): + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + # input = input.to(loc, non_blocking=True).to(torch.float) + input = input.to(loc, non_blocking=True).to() + target = target.to(torch.int32).to(loc, non_blocking=True) + else: + input = input.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + if step < 5: + update(model, input, target, optimizer) + else: + if args.device == 
'npu': + with torch.autograd.profiler.profile(use_npu=True) as prof: + update(model, input, target, optimizer) + else: + with torch.autograd.profiler.profile(use_cuda=True) as prof: + update(model, input, target, optimizer) + break + + prof.export_chrome_trace("output.prof") + + +def adjust_learning_rate(args, optimizer, epoch): + """ + Sets the learning rate to the initial LR decayed by 10 every 30 epochs + """ + if args.lr_mode == 'step': + lr = args.lr * (0.1 ** (epoch // args.step)) + elif args.lr_mode == 'poly': + lr = args.lr * (1 - epoch / args.epochs) ** 0.9 + else: + raise ValueError('Unknown lr mode {}'.format(args.lr_mode)) + + for param_group in optimizer.param_groups: + param_group['lr'] = lr + return lr + + +def fast_hist(pred, label, n): + k = (label >= 0) & (label < n) + return np.bincount( + n * label[k].astype(int) + pred[k], minlength=n ** 2).reshape(n, n) + + +def per_class_iu(hist): + return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) + + +def save_output_images(predictions, filenames, output_dir): + """ + Saves a given (B x C x H x W) into an image file. + If given a mini-batch tensor, will save the tensor as a grid of images. + """ + # pdb.set_trace() + for ind in range(len(filenames)): + im = Image.fromarray(predictions[ind].astype(np.uint8)) + fn = os.path.join(output_dir, filenames[ind][:-4] + '.png') + out_dir = split(fn)[0] + if not exists(out_dir): + os.makedirs(out_dir) + im.save(fn) + + +def save_colorful_images(predictions, filenames, output_dir, palettes): + """ + Saves a given (B x C x H x W) into an image file. + If given a mini-batch tensor, will save the tensor as a grid of images. + """ + for ind in range(len(filenames)): + im = Image.fromarray(palettes[predictions[ind].squeeze()]) + fn = os.path.join(output_dir, filenames[ind][:-4] + '.png') + out_dir = split(fn)[0] + if not exists(out_dir): + os.makedirs(out_dir) + im.save(fn) + + +def test(eval_data_loader, model, num_classes, + output_dir='pred', has_gt=True, save_vis=False): + model.eval() + batch_time = AverageMeter() + data_time = AverageMeter() + end = time.time() + hist = np.zeros((num_classes, num_classes)) + for iter, (image, label, name) in enumerate(eval_data_loader): + data_time.update(time.time() - end) + image_var = Variable(image, requires_grad=False, volatile=True).npu() + final = model(image_var)[0] + _, pred = torch.max(final, 1) + pred = pred.cpu().data.numpy() + batch_time.update(time.time() - end) + if save_vis: + save_output_images(pred, name, output_dir) + save_colorful_images( + pred, name, output_dir + '_color', + TRIPLET_PALETTE if num_classes == 3 else CITYSCAPE_PALETTE) + if has_gt: + label = label.numpy() + hist += fast_hist(pred.flatten(), label.flatten(), num_classes) + logger.info('===> mAP {mAP:.3f}'.format( + mAP=round(np.nanmean(per_class_iu(hist)) * 100, 2))) + end = time.time() + logger.info('Eval: [{0}/{1}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + .format(iter, len(eval_data_loader), batch_time=batch_time, + data_time=data_time)) + if has_gt: #val + ious = per_class_iu(hist) * 100 + logger.info(' '.join('{:.03f}'.format(i) for i in ious)) + return round(np.nanmean(ious), 2) + + +def resize_4d_tensor(tensor, width, height): + tensor_cpu = tensor.cpu().numpy() + if tensor.size(2) == height and tensor.size(3) == width: + return tensor_cpu + out_size = (tensor.size(0), tensor.size(1), height, width) + out = np.empty(out_size, dtype=np.float32) + + def resize_one(i, j): + out[i, j] 
= np.array( + Image.fromarray(tensor_cpu[i, j]).resize( + (width, height), Image.BILINEAR)) + + def resize_channel(j): + for i in range(tensor.size(0)): + out[i, j] = np.array( + Image.fromarray(tensor_cpu[i, j]).resize( + (width, height), Image.BILINEAR)) + + # workers = [threading.Thread(target=resize_one, args=(i, j)) + # for i in range(tensor.size(0)) for j in range(tensor.size(1))] + + workers = [threading.Thread(target=resize_channel, args=(j,)) + for j in range(tensor.size(1))] + for w in workers: + w.start() + for w in workers: + w.join() + # for i in range(tensor.size(0)): + # for j in range(tensor.size(1)): + # out[i, j] = np.array( + # Image.fromarray(tensor_cpu[i, j]).resize( + # (w, h), Image.BILINEAR)) + # out = tensor.new().resize_(*out.shape).copy_(torch.from_numpy(out)) + return out + + +def test_ms(eval_data_loader, model, num_classes, scales, + output_dir='pred', has_gt=True, save_vis=False): + model.eval() + batch_time = AverageMeter() + data_time = AverageMeter() + end = time.time() + hist = np.zeros((num_classes, num_classes)) + num_scales = len(scales) + for iter, input_data in enumerate(eval_data_loader): + data_time.update(time.time() - end) + if has_gt: + name = input_data[2] + label = input_data[1] + else: + name = input_data[1] + h, w = input_data[0].size()[2:4] + images = [input_data[0]] + images.extend(input_data[-num_scales:]) + # pdb.set_trace() + outputs = [] + for image in images: + image_var = Variable(image, requires_grad=False, volatile=True).npu() + final = model(image_var)[0] + outputs.append(final.data) + final = sum([resize_4d_tensor(out, w, h) for out in outputs]) + # _, pred = torch.max(torch.from_numpy(final), 1) + # pred = pred.cpu().numpy() + pred = final.argmax(axis=1) + batch_time.update(time.time() - end) + if save_vis: + save_output_images(pred, name, output_dir) + save_colorful_images(pred, name, output_dir + '_color', + CITYSCAPE_PALETTE) + if has_gt: + label = label.numpy() + hist += fast_hist(pred.flatten(), label.flatten(), num_classes) + logger.info('===> mAP {mAP:.3f}'.format( + mAP=round(np.nanmean(per_class_iu(hist)) * 100, 2))) + end = time.time() + logger.info('Eval: [{0}/{1}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + .format(iter, len(eval_data_loader), batch_time=batch_time, + data_time=data_time)) + if has_gt: #val + ious = per_class_iu(hist) * 100 + logger.info(' '.join('{:.03f}'.format(i) for i in ious)) + return round(np.nanmean(ious), 2) + + +def test_seg(gpu, ngpus_per_node, args): + batch_size = args.batch_size + num_workers = args.workers + phase = args.phase + + for k, v in args.__dict__.items(): + print(k, ':', v) + + single_model = DRNSeg(args.arch, args.classes, pretrained_model=None, + pretrained=False) + if args.pretrained: + single_model.load_state_dict(torch.load(args.pretrained)) + + model = torch.nn.DataParallel(single_model).npu() + + data_dir = args.data_dir + info = json.load(open(join(data_dir, 'info.json'), 'r')) + normalize = transforms.Normalize(mean=info['mean'], std=info['std']) + scales = [0.5, 0.75, 1.25, 1.5, 1.75] + if args.ms: + dataset = SegListMS(data_dir, phase, transforms.Compose([ + transforms.ToTensor(), + normalize, + ]), scales, list_dir=args.list_dir) + else: + dataset = SegList(data_dir, phase, transforms.Compose([ + transforms.ToTensor(), + normalize, + ]), list_dir=args.list_dir, out_name=True) + test_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size, shuffle=False, num_workers=num_workers, + 
pin_memory=False
+    )
+
+    cudnn.benchmark = True
+
+    # optionally resume from a checkpoint
+    start_epoch = 0
+    if args.resume:
+        if os.path.isfile(args.resume):
+            logger.info("=> loading checkpoint '{}'".format(args.resume))
+            checkpoint = torch.load(args.resume)
+            start_epoch = checkpoint['epoch']
+            best_prec1 = checkpoint['best_prec1']
+            model.load_state_dict(checkpoint['state_dict'])
+            logger.info("=> loaded checkpoint '{}' (epoch {})"
+                        .format(args.resume, checkpoint['epoch']))
+        else:
+            logger.info("=> no checkpoint found at '{}'".format(args.resume))
+
+    out_dir = '{}_{:03d}_{}'.format(args.arch, start_epoch, phase)
+    if len(args.test_suffix) > 0:
+        out_dir += '_' + args.test_suffix
+    if args.ms:
+        out_dir += '_ms'
+
+    if args.ms:
+        mAP = test_ms(test_loader, model, args.classes, save_vis=True,
+                      has_gt=phase != 'test' or args.with_gt,
+                      output_dir=out_dir,
+                      scales=scales)
+    else:
+        mAP = test(test_loader, model, args.classes, save_vis=True,
+                   has_gt=phase != 'test' or args.with_gt, output_dir=out_dir)
+    logger.info('mAP: %f', mAP)
+
+
+def parse_args():
+    # Training settings
+    parser = argparse.ArgumentParser(description='')
+    parser.add_argument('cmd', choices=['train', 'test'])
+    parser.add_argument('--data-dir', default=None, required=True)
+    parser.add_argument('--list-dir', default=None,
+                        help='List dir to look for train_images.txt etc. '
+                             'It is the same with --data-dir if not set.')
+    parser.add_argument('--classes', default=0, type=int)
+    parser.add_argument('--crop-size', default=0, type=int)
+    parser.add_argument('--step', type=int, default=200)
+    parser.add_argument('--arch')
+    parser.add_argument('--batch-size', type=int, default=8, metavar='N',
+                        help='input batch size for training (default: 8)')
+    parser.add_argument('--epochs', type=int, default=10, metavar='N',
+                        help='number of epochs to train (default: 10)')
+    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
+                        help='learning rate (default: 0.01)')
+    parser.add_argument('--lr-mode', type=str, default='step')
+    parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
+                        help='SGD momentum (default: 0.9)')
+    parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
+                        metavar='W', help='weight decay (default: 1e-4)')
+    parser.add_argument('--evaluate', dest='evaluate',
+                        action='store_true',
+                        help='evaluate model on validation set')
+    parser.add_argument('--resume', default='', type=str, metavar='PATH',
+                        help='path to latest checkpoint (default: none)')
+    parser.add_argument('--pretrained', dest='pretrained',
+                        default='', type=str, metavar='PATH',
+                        help='use pre-trained model')
+    parser.add_argument('--save_path', default='', type=str, metavar='PATH',
+                        help='output path for training checkpoints')
+    parser.add_argument('--save_iter', default=1, type=int,
+                        help='number of epochs between '
+                             'checkpoint history saves')
+    parser.add_argument('--workers', type=int, default=8)
+    parser.add_argument('--load-release', dest='load_rel', default=None)
+    parser.add_argument('--phase', default='val')
+    parser.add_argument('--random-scale', default=0, type=float)
+    parser.add_argument('--random-rotate', default=0, type=int)
+    parser.add_argument('--bn-sync', action='store_true')
+    parser.add_argument('--ms', action='store_true',
+                        help='Turn on multi-scale testing')
+    parser.add_argument('--with-gt', action='store_true')
+    parser.add_argument('--test-suffix', default='', type=str)
+    ## distributed
+    parser.add_argument('--rank', default=0, type=int,
+                        help='node rank for distributed training')
+    parser.add_argument('--world-size', default=1, type=int,
+                        help='number of nodes for distributed training')
+    parser.add_argument('--dist-url', default='tcp://127.0.0.1:23456', type=str,
+                        help='url used to set up distributed training')
+    parser.add_argument('--dist-backend', default='hccl', type=str,
+                        help='distributed backend')
+    parser.add_argument('--seed', default=None, type=int,
+                        help='seed for initializing training. ')
+    parser.add_argument('--gpu', default=0, type=int,
+                        help='GPU id to use.')
+    parser.add_argument('--multiprocessing-distributed', action='store_true',
+                        help='Use multi-processing distributed training to launch '
+                             'N processes per node, which has N GPUs. This is the '
+                             'fastest way to use PyTorch for either single node or '
+                             'multi node data parallel training')
+    ## for ascend 910
+    parser.add_argument('--device', default='npu', type=str, help='npu or gpu')
+    parser.add_argument('--device_list', default='0,1,2,3,4,5,6,7',
+                        type=str, help='device id list')
+    parser.add_argument('--amp', default=False, action='store_true',
+                        help='use amp to train the model')
+    parser.add_argument('--loss-scale', default=128, type=str,
+                        help='static loss scale used in amp (default: 128)')
+    parser.add_argument('--opt-level', default='O2', type=str,
+                        help='opt level used in amp (default: O2)')
+    parser.add_argument('--prof', default=False, action='store_true',
+                        help='use profiling to evaluate the performance of model')
+
+    args = parser.parse_args()
+
+    assert args.classes > 0
+
+    print(' '.join(sys.argv))
+    print(args)
+
+    if args.bn_sync:
+        drn.BatchNorm = batchnormsync.BatchNormSync
+
+    return args
+
+def device_id_to_process_device_map(device_list):
+    devices = device_list.split(",")
+    devices = [int(x) for x in devices]
+    devices.sort()
+
+    process_device_map = dict()
+    for process_id, device_id in enumerate(devices):
+        process_device_map[process_id] = device_id
+
+    return process_device_map
+
+
+def main():
+    args = parse_args()
+
+    os.environ['MASTER_ADDR'] = '127.0.0.1'
+    os.environ['MASTER_PORT'] = '29688'
+    os.environ['RANK_SIZE'] = '8'
+
+    if args.seed is not None:
+        random.seed(args.seed)
+        torch.manual_seed(args.seed)
+        cudnn.deterministic = True
+
+
+    if args.gpu is not None:
+        print('You have chosen a specific GPU. 
This will completely disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + args.process_device_map = device_id_to_process_device_map(args.device_list) + + if args.device == 'npu': + ngpus_per_node = len(args.process_device_map) + else: + if args.distributed: + ngpus_per_node = torch.cuda.device_count() + else: + ngpus_per_node = 1 + print('ngpus_per_node:', ngpus_per_node) + + if args.multiprocessing_distributed: + args.world_size = ngpus_per_node * args.world_size + if args.cmd == 'train': + mp.spawn(train_seg, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + elif args.cmd == 'test': + mp.spawn(test_seg, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + if args.cmd == 'train': + train_seg(args.gpu, ngpus_per_node, args) + elif args.cmd == 'test': + test_seg(args.gpu, ngpus_per_node, args) + + +if __name__ == '__main__': + main() diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/env_npu.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/env_npu.sh new file mode 100644 index 0000000000..31d792ea9b --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/env_npu.sh @@ -0,0 +1,81 @@ +#!/bin/bash +export install_path=/usr/local/Ascend + +# export PATH=/usr/local/hdf5/bin:$PATH +# export LD_LIBRARY_PATH=/usr/local/hdf5/lib:$LD_LIBRARY_PATH +# export LIBRARY_PATH=/usr/local/hdf5/lib:$LIBRARY_PATH +# export CPATH=/usr/local/hdf5/include:$CPATH + +if [ -d ${install_path}/toolkit ]; then + export LD_LIBRARY_PATH=${install_path}/fwkacllib/lib64/:/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} + export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH + export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH + export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=${install_path}/opp +else + if [ -d ${install_path}/nnae/latest ];then + export LD_LIBRARY_PATH=${install_path}/nnae/latest/fwkacllib/lib64/:/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/nnae/latest + else + export 
LD_LIBRARY_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest + fi +fi + +${install_path}/driver/tools/msnpureport -g error -d 0 +${install_path}/driver/tools/msnpureport -g error -d 1 +${install_path}/driver/tools/msnpureport -g error -d 2 +${install_path}/driver/tools/msnpureport -g error -d 3 +${install_path}/driver/tools/msnpureport -g error -d 4 +${install_path}/driver/tools/msnpureport -g error -d 5 +${install_path}/driver/tools/msnpureport -g error -d 6 +${install_path}/driver/tools/msnpureport -g error -d 7 + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#设置Event日志开启标志,0-关闭/1-开启 +export ASCEND_GLOBAL_EVENT_ENABLE=0 +#设置是否开启taskque,0-关闭/1-开启 +export TASK_QUEUE_ENABLE=0 +#设置是否开启PTCopy,0-关闭/1-开启 +export PTCOPY_ENABLE=1 +#设置是否开启combined标志,0-关闭/1-开启 +export COMBINED_ENABLE=0 +#设置特殊场景是否需要重新编译,不需要修改 +export DYNAMIC_OP="ADD#MUL" +#HCCL白名单开关,1-关闭/0-开启 +export HCCL_WHITELIST_DISABLE=1 +export HCCL_IF_IP=$(hostname -I |awk '{print $1}') + +ulimit -SHn 512000 + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_eval_8p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_eval_8p.sh new file mode 100644 index 0000000000..c7ead5ac95 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_eval_8p.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size resume RANK_SIZE +# 网络名称,同目录名称 +Network="DRN_C_26" +# 训练batch_size +batch_size=64 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="" +# checkpoint文件路径,以实际路径为准 +pth_path="" +# 训练epoch +train_epochs=250 +# 学习率 +learning_rate=0.01 +# 加载数据进程数 +workers=64 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + 
workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --pth_path* ]];then + pth_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 校验是否传入 pth_path , 验证脚本需要传入此参数 +if [[ $pth_path == "" ]];then + echo "[Error] para \"pth_path\" must be confing" + exit 1 +fi + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +python3.7 segment.py test \ + --data-dir ${data_path} \ + --classes 19 \ + --arch drn_c_26 \ + --resume ${pth_path} \ + --phase val \ + --batch-size ${batch_size} \ + --device npu \ + > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +wait + + +##################获取训练数据################ +# 训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" + +# 输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +# 训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + + +# 最后一个迭代loss值,不需要修改 +ActualLoss=`grep Test ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${ASCEND_DEVICE_ID}.log | awk '{print $8}' | awk 'END {print}'` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_finetune_1p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_finetune_1p.sh new file mode 100644 index 0000000000..12e5632ad2 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_finetune_1p.sh 
@@ -0,0 +1,160 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="DRN_C_26" +# 训练batch_size +batch_size=8 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="" +# checkpoint文件路径,以实际路径为准 +pth_path="" +# 训练epoch +train_epochs=250 +# 指定训练所使用的npu device卡id +device_id=0 +# 加载数据进程数 +workers=8 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --pth_path* ]];then + pth_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 校验是否传入 pth_path , 验证脚本需要传入此参数 +if [[ $pth_path == "" ]];then + echo "[Error] para \"pth_path\" must be confing" + exit 1 +fi + +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +nohup python3.7 segment.py train \ + --data-dir ${data_path} \ + --classes 19 \ + --workers 8 \ + --crop-size 896 \ + --arch drn_c_26 \ + --batch-size ${batch_size} \ + --epochs 250 \ + --lr 0.01 \ + --momentum 0.9 \ + --save_iter 10 \ + --save_path './checkpoints' \ + --device 'npu' \ + --dist-backend 'hccl' \ + --amp \ + --loss-scale '128' \ + --opt-level O2 \ + --gpu 0 \ + --pretrained ${pth_path} \ + > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", 
'${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_1p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_1p.sh new file mode 100644 index 0000000000..39e2d2d0a1 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_1p.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="DRN_C_26" +# 训练batch_size +batch_size=8 +# 训练使用的npu卡数 +export RANK_SIZE=1 +export WORLD_SIZE=1 +data_path_info=$1 +data_path=`echo ${data_path_info#*=}` + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
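+    # At this point the script has moved one level above test/ (the model
+    # root), so relative paths in the launch command below, e.g.
+    # --save_path './checkpoints', resolve against the model root directory.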
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +nohup python3.7 segment.py train \ + --data-dir ${data_path} \ + --classes 19 \ + --workers 8 \ + --crop-size 896 \ + --arch drn_c_26 \ + --batch-size ${batch_size} \ + --epochs 250 \ + --lr 0.01 \ + --momentum 0.9 \ + --save_iter 10 \ + --save_path './checkpoints' \ + --device 'npu' \ + --dist-backend 'hccl' \ + --amp \ + --loss-scale '128' \ + --opt-level O2 \ + --gpu 0 \ + > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +##################获取训练数据################ + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Train: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git 
a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh new file mode 100644 index 0000000000..5146250aea --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="DRN_C_26" +# 训练batch_size +batch_size=64 +# 训练使用的npu卡数 +export RANK_SIZE=8 +export WORLD_SIZE=8 +data_path_info=$1 +data_path=`echo ${data_path_info#*=}` + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +RANK_ID_START=0 + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)) +do + echo ${RANK_ID} + KERNEL_NUM=$(($(nproc)/8)) + PID_START=$((KERNEL_NUM * RANK_ID)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + nohup taskset -c $PID_START-$PID_END python3.7 segment.py train \ + --data-dir ${data_path} \ + --classes 19 \ + --workers 64 \ + --crop-size 896 \ + --arch drn_c_26 \ + --batch-size ${batch_size} \ + --epochs 250 \ + --lr 0.01 \ + --momentum 0.9 \ + --save_iter 10 \ + --save_path './checkpoints' \ + --device 'npu' \ + --dist-backend 'hccl' \ + --amp \ + --loss-scale '128' \ + --opt-level O2 \ + --gpu 0 \ + --multiprocessing-distributed \ + --device_list '0,1,2,3,4,5,6,7' \ + > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + +wait + +##################获取训练数据################ + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 
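+# Worked example with hypothetical numbers: at batch_size=64, a logged FPS of
+# 512 images/sec across the 8 devices gives TrainingTime = 64*1000/512
+# = 125.00 ms per training iteration.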
+#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Train: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_1p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_1p.sh new file mode 100644 index 0000000000..267d9dab84 --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_1p.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="DRN_C_26" +# 训练batch_size +batch_size=8 +# 训练使用的npu卡数 +export RANK_SIZE=1 +export WORLD_SIZE=1 +data_path_info=$1 +data_path=`echo ${data_path_info#*=}` + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
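+    # test_path_dir was captured before this cd, so it still points at test/
+    # and logs land in test/output/${ASCEND_DEVICE_ID} as expected.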
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +nohup python3.7 segment.py train \ + --data-dir ${data_path} \ + --classes 19 \ + --workers 8 \ + --crop-size 896 \ + --arch drn_c_26 \ + --batch-size ${batch_size} \ + --epochs 2 \ + --lr 0.01 \ + --momentum 0.9 \ + --save_iter 10 \ + --save_path './checkpoints' \ + --device 'npu' \ + --dist-backend 'hccl' \ + --amp \ + --loss-scale '128' \ + --opt-level O2 \ + --gpu 0 \ + > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +##################获取训练数据################ + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Train: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git 
a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh new file mode 100644 index 0000000000..62d0ba90ba --- /dev/null +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="DRN_C_26" +# 训练batch_size +batch_size=64 +# 训练使用的npu卡数 +export RANK_SIZE=8 +export WORLD_SIZE=8 +data_path_info=$1 +data_path=`echo ${data_path_info#*=}` + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +RANK_ID_START=0 + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)) +do + echo ${RANK_ID} + KERNEL_NUM=$(($(nproc)/8)) + PID_START=$((KERNEL_NUM * RANK_ID)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + nohup taskset -c $PID_START-$PID_END python3.7 segment.py train \ + --data-dir ${data_path} \ + --classes 19 \ + --workers 64 \ + --crop-size 896 \ + --arch drn_c_26 \ + --batch-size ${batch_size} \ + --epochs 2 \ + --lr 0.01 \ + --momentum 0.9 \ + --save_iter 10 \ + --save_path './checkpoints' \ + --device 'npu' \ + --dist-backend 'hccl' \ + --amp \ + --loss-scale '128' \ + --opt-level O2 \ + --gpu 0 \ + --multiprocessing-distributed \ + --device_list '0,1,2,3,4,5,6,7' \ + > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + +wait + +##################获取训练数据################ + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + 
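+# With the defaults above, CaseName expands to DRN_C_26_bs64_8p_acc; it names
+# the ${CaseName}.log summary written at the end of this script.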
+##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Train: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From dbf30c3d29ff0d6af42c76d23ef8b645d4e0c57c Mon Sep 17 00:00:00 2001 From: lihang Date: Tue, 13 Sep 2022 15:02:23 +0800 Subject: [PATCH 02/10] =?UTF-8?q?[=E8=A5=BF=E5=AE=89=E4=BA=A4=E9=80=9A?= =?UTF-8?q?=E5=A4=A7=E5=AD=A6][=E9=AB=98=E6=A0=A1=E8=B4=A1=E7=8C=AE][Pytor?= =?UTF-8?q?ch][DilatedResidualNetworks]--=E5=88=9D=E6=AC=A1=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cv/semantic_segmentation/DilatedResidualNetworks/LICENSE | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/LICENSE diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/LICENSE b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/LICENSE old mode 100644 new mode 100755 index fa92d33e58..7e273dc765 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/LICENSE +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/LICENSE @@ -2,6 +2,7 @@ BSD 3-Clause License Copyright (c) 2017, Fisher Yu All rights reserved. 
+Copyright 2021 Huawei Technologies Co., Ltd Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -- Gitee From c1236b2210fbc14c16caa23457e15cc0309d7c9a Mon Sep 17 00:00:00 2001 From: lihang Date: Wed, 14 Sep 2022 14:30:45 +0800 Subject: [PATCH 03/10] =?UTF-8?q?[=E8=A5=BF=E5=AE=89=E4=BA=A4=E9=80=9A?= =?UTF-8?q?=E5=A4=A7=E5=AD=A6][=E9=AB=98=E6=A0=A1=E8=B4=A1=E7=8C=AE][Pytor?= =?UTF-8?q?ch][DilatedResidualNetworks]--=E5=88=9D=E6=AC=A1=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../DilatedResidualNetworks/LICENSE | 2 +- .../DilatedResidualNetworks/README.md | 2 +- .../DilatedResidualNetworks/classify.py | 10 ++++++++++ .../DilatedResidualNetworks/data_transforms.py | 10 ++++++++++ .../datasets/compute_mean_std.py | 10 ++++++++++ .../DilatedResidualNetworks/drn.py | 10 ++++++++++ .../DilatedResidualNetworks/lib/build.py | 10 ++++++++++ .../lib/functions/batchnormp.py | 10 ++++++++++ .../lib/modules/batchnormsync.py | 10 ++++++++++ .../DilatedResidualNetworks/lib/test.py | 10 ++++++++++ .../DilatedResidualNetworks/segment.py | 10 ++++++++++ .../DilatedResidualNetworks/test/train_eval_8p.sh | 4 ++-- .../DilatedResidualNetworks/test/train_full_8p.sh | 5 +++-- .../test/train_performance_8p.sh | 4 ++-- 14 files changed, 99 insertions(+), 8 deletions(-) mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/classify.py mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/data_transforms.py mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/compute_mean_std.py mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/drn.py mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/build.py mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/functions/batchnormp.py mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/modules/batchnormsync.py mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/test.py mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_eval_8p.sh mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/LICENSE b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/LICENSE index 7e273dc765..53c06bb07f 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/LICENSE +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/LICENSE @@ -2,7 +2,7 @@ BSD 3-Clause License Copyright (c) 2017, Fisher Yu All rights reserved. 
-Copyright 2021 Huawei Technologies Co., Ltd +Copyright 2022 Huawei Technologies Co., Ltd Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md old mode 100644 new mode 100755 index 4a393562f4..eaf4cc9457 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md @@ -243,7 +243,7 @@ DRN是在卷积网络中保持空间分辨率以进行图像分类和分割的 | 1p-竞品 | - | 12.536 | 1 | - | | 1p-NPU | - | 19.868 | 1 | O2 | | 8p-竞品 | 68.67 | 100.221 | 250 | - | -| 8p-NPU | 68.64 | 125.545 | 250 | O2 | +| 8p-NPU | 68.42 | 95.236 | 250 | O2 | # 版本说明 diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/classify.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/classify.py old mode 100644 new mode 100755 index dd123d0052..b450897938 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/classify.py +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/classify.py @@ -1,3 +1,13 @@ +# Copyright (c) 2017, Fisher Yu +# All rights reserved. +# Copyright 2022 Huawei Technologies Co., Ltd + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# DilatedResidualNetworks:https://github.com/fyu/drn +# -------------------------------------------------------- import argparse import shutil import time diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/data_transforms.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/data_transforms.py old mode 100644 new mode 100755 index 616881c594..03d6b1f14d --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/data_transforms.py +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/data_transforms.py @@ -1,3 +1,13 @@ +# Copyright (c) 2017, Fisher Yu +# All rights reserved. +# Copyright 2022 Huawei Technologies Co., Ltd + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# DilatedResidualNetworks:https://github.com/fyu/drn +# -------------------------------------------------------- import numbers import random diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/compute_mean_std.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/compute_mean_std.py old mode 100644 new mode 100755 index df5ed75118..647e68d46e --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/compute_mean_std.py +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/datasets/compute_mean_std.py @@ -1,3 +1,13 @@ +# Copyright (c) 2017, Fisher Yu +# All rights reserved. +# Copyright 2022 Huawei Technologies Co., Ltd + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DilatedResidualNetworks:https://github.com/fyu/drn +# -------------------------------------------------------- import argparse import json import numpy as np diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/drn.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/drn.py old mode 100644 new mode 100755 index 7201d0bc43..0f904d28ff --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/drn.py +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/drn.py @@ -1,3 +1,13 @@ +# Copyright (c) 2017, Fisher Yu +# All rights reserved. +# Copyright 2022 Huawei Technologies Co., Ltd + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# DilatedResidualNetworks:https://github.com/fyu/drn +# -------------------------------------------------------- import pdb import torch.nn as nn diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/build.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/build.py old mode 100644 new mode 100755 index a6d7926117..458996ca47 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/build.py +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/build.py @@ -1,3 +1,13 @@ +# Copyright (c) 2017, Fisher Yu +# All rights reserved. +# Copyright 2022 Huawei Technologies Co., Ltd + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# DilatedResidualNetworks:https://github.com/fyu/drn +# -------------------------------------------------------- import glob import os import torch diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/functions/batchnormp.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/functions/batchnormp.py old mode 100644 new mode 100755 index cce2321ca7..2b64a56ddb --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/functions/batchnormp.py +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/functions/batchnormp.py @@ -1,3 +1,13 @@ +# Copyright (c) 2017, Fisher Yu +# All rights reserved. +# Copyright 2022 Huawei Technologies Co., Ltd + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# DilatedResidualNetworks:https://github.com/fyu/drn +# -------------------------------------------------------- import pdb import numpy as np diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/modules/batchnormsync.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/modules/batchnormsync.py old mode 100644 new mode 100755 index 644249feca..b200313cd7 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/modules/batchnormsync.py +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/modules/batchnormsync.py @@ -1,3 +1,13 @@ +# Copyright (c) 2017, Fisher Yu +# All rights reserved. 
+# Copyright 2022 Huawei Technologies Co., Ltd + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# DilatedResidualNetworks:https://github.com/fyu/drn +# -------------------------------------------------------- from queue import Queue import torch diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/test.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/test.py old mode 100644 new mode 100755 index 9b74bc821b..0b3af15d54 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/test.py +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/lib/test.py @@ -1,3 +1,13 @@ +# Copyright (c) 2017, Fisher Yu +# All rights reserved. +# Copyright 2022 Huawei Technologies Co., Ltd + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# DilatedResidualNetworks:https://github.com/fyu/drn +# -------------------------------------------------------- import pdb import time import logging diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py old mode 100644 new mode 100755 index 88d5dc94d7..cf99d45a9f --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py @@ -1,5 +1,15 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +# Copyright (c) 2017, Fisher Yu +# All rights reserved. +# Copyright 2022 Huawei Technologies Co., Ltd + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------- +# References: +# DilatedResidualNetworks:https://github.com/fyu/drn +# -------------------------------------------------------- import random import argparse import json diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_eval_8p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_eval_8p.sh old mode 100644 new mode 100755 index c7ead5ac95..e190d38528 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_eval_8p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_eval_8p.sh @@ -5,7 +5,7 @@ # 网络名称,同目录名称 Network="DRN_C_26" # 训练batch_size -batch_size=64 +batch_size=1 # 训练使用的npu卡数 export RANK_SIZE=8 # 数据集路径,保持为空,不需要修改 @@ -17,7 +17,7 @@ train_epochs=250 # 学习率 learning_rate=0.01 # 加载数据进程数 -workers=64 +workers=8 # 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh old mode 100644 new mode 100755 index 5146250aea..1124398895 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh @@ -5,7 +5,7 @@ # 网络名称,同目录名称 Network="DRN_C_26" # 训练batch_size -batch_size=64 +batch_size=32 # 训练使用的npu卡数 export RANK_SIZE=8 export WORLD_SIZE=8 @@ -73,7 +73,7 @@ do nohup taskset -c $PID_START-$PID_END python3.7 segment.py train \ --data-dir ${data_path} \ --classes 19 \ - --workers 64 \ + --workers 16 \ --crop-size 896 \ --arch drn_c_26 \ --batch-size ${batch_size} \ @@ -85,6 +85,7 @@ do --device 'npu' \ --dist-backend 'hccl' \ --amp \ + --seed 1334 \ --loss-scale '128' \ --opt-level O2 \ --gpu 0 \ diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh old mode 100644 new mode 100755 index 62d0ba90ba..1c6cc724dc --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh @@ -5,7 +5,7 @@ # 网络名称,同目录名称 Network="DRN_C_26" # 训练batch_size -batch_size=64 +batch_size=32 # 训练使用的npu卡数 export RANK_SIZE=8 export WORLD_SIZE=8 @@ -73,7 +73,7 @@ do nohup taskset -c $PID_START-$PID_END python3.7 segment.py train \ --data-dir ${data_path} \ --classes 19 \ - --workers 64 \ + --workers 16 \ --crop-size 896 \ --arch drn_c_26 \ --batch-size ${batch_size} \ -- Gitee From 0031b3f31cb661d17f56f7edd8e9187feab52b3c Mon Sep 17 00:00:00 2001 From: lihang Date: Wed, 14 Sep 2022 21:57:25 +0800 Subject: [PATCH 04/10] =?UTF-8?q?[=E8=A5=BF=E5=AE=89=E4=BA=A4=E9=80=9A?= =?UTF-8?q?=E5=A4=A7=E5=AD=A6][=E9=AB=98=E6=A0=A1=E8=B4=A1=E7=8C=AE][Pytor?= =?UTF-8?q?ch][DilatedResidualNetworks]--=E5=88=9D=E6=AC=A1=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../semantic_segmentation/DilatedResidualNetworks/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md index eaf4cc9457..268cc3beae 100755 --- 
a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md @@ -167,7 +167,7 @@ DRN是在卷积网络中保持空间分辨率以进行图像分类和分割的 ## 获取预训练模型 -可以在此处下载预训练 DRN 模型:http://dl.yf.io/drn/drn_c_26-ddedf421.pth,放入/root/.cache/torch/hub/ checkpoints/ +运行代码会自动下载与训练模型,如果无法下载,可以在此处下载预训练 DRN 模型:http://dl.yf.io/drn/drn_c_26-ddedf421.pth,放入/root/.cache/torch/hub/ checkpoints/ # 开始训练 @@ -190,7 +190,7 @@ DRN是在卷积网络中保持空间分辨率以进行图像分类和分割的 ``` bash ./test/train_performance_1p.sh --data_path='./datasets/cityscapes' # 1p性能 bash ./test/train_full_1p.sh --data_path='./datasets/cityscapes' # 1p精度 - bash ./test/train_finetune_1p.sh --data_path='./datasets/cityscapes' --pth_path='./checkpoints/xxx.pth.tar' # 1p模型微调 + bash ./test/train_finetune_1p.sh --data_path='./datasets/cityscapes' --pth_path='./checkpoints/xxx.pth.tar' # 1p模型迁移 ``` - 单机8卡训练 @@ -242,7 +242,7 @@ DRN是在卷积网络中保持空间分辨率以进行图像分类和分割的 | :-------: | :-----: | :------: | :------: | :-------: | | 1p-竞品 | - | 12.536 | 1 | - | | 1p-NPU | - | 19.868 | 1 | O2 | -| 8p-竞品 | 68.67 | 100.221 | 250 | - | +| 8p-竞品 | 68.45 | 95.107 | 250 | - | | 8p-NPU | 68.42 | 95.236 | 250 | O2 | -- Gitee From 6c5042b27f4eef170bc23ff711b02debf88f0475 Mon Sep 17 00:00:00 2001 From: lihang Date: Thu, 15 Sep 2022 13:31:24 +0800 Subject: [PATCH 05/10] =?UTF-8?q?README=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cv/semantic_segmentation/DilatedResidualNetworks/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md index 268cc3beae..f525624415 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md @@ -242,7 +242,7 @@ DRN是在卷积网络中保持空间分辨率以进行图像分类和分割的 | :-------: | :-----: | :------: | :------: | :-------: | | 1p-竞品 | - | 12.536 | 1 | - | | 1p-NPU | - | 19.868 | 1 | O2 | -| 8p-竞品 | 68.45 | 95.107 | 250 | - | +| 8p-竞品 | 67.95 | 95.107 | 250 | - | | 8p-NPU | 68.42 | 95.236 | 250 | O2 | -- Gitee From 19559b1e7daeee2b637f84c3b9eb06ec39786d48 Mon Sep 17 00:00:00 2001 From: woey <876362620@qq.com> Date: Tue, 29 Nov 2022 17:16:59 +0800 Subject: [PATCH 06/10] =?UTF-8?q?=E8=84=9A=E6=9C=AC=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../DilatedResidualNetworks/README.md | 5 ++++- .../DilatedResidualNetworks/test/train_eval_8p.sh | 2 +- .../DilatedResidualNetworks/test/train_finetune_1p.sh | 3 ++- .../DilatedResidualNetworks/test/train_full_1p.sh | 7 ++++--- .../DilatedResidualNetworks/test/train_full_8p.sh | 7 ++++--- .../DilatedResidualNetworks/test/train_performance_1p.sh | 7 ++++--- .../DilatedResidualNetworks/test/train_performance_8p.sh | 7 ++++--- 7 files changed, 23 insertions(+), 15 deletions(-) mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_finetune_1p.sh mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_1p.sh mode change 100644 => 100755 PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_1p.sh diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md 
b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md index f525624415..4e741ae06f 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md @@ -167,7 +167,7 @@ DRN是在卷积网络中保持空间分辨率以进行图像分类和分割的 ## 获取预训练模型 -运行代码会自动下载与训练模型,如果无法下载,可以在此处下载预训练 DRN 模型:http://dl.yf.io/drn/drn_c_26-ddedf421.pth,放入/root/.cache/torch/hub/ checkpoints/ +运行代码会自动下载与训练模型,如果无法下载,可以在此处下载预训练 DRN 模型:http://dl.yf.io/drn/drn_c_26-ddedf421.pth,放入/root/.cache/torch/checkpoints/ # 开始训练 @@ -181,6 +181,9 @@ DRN是在卷积网络中保持空间分辨率以进行图像分类和分割的 2. 运行训练脚本。 注:需要在代码根目录路径下新建一个名为checkpoints的文件夹来存放pth模型文件。 + ``` + mkdir checkpoints + ``` 该模型支持单机单卡训练和单机8卡训练。 - 单机单卡训练 diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_eval_8p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_eval_8p.sh index e190d38528..9af36ab08f 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_eval_8p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_eval_8p.sh @@ -97,7 +97,7 @@ e2e_time=$(( $end_time - $start_time )) echo "------------------ Final result ------------------" # 输出训练精度,需要模型审视修改 -train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +train_accuracy=`grep -a 'mAP:' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "mAP:" '{print $NF}'|awk -F " " '{print $1}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_finetune_1p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_finetune_1p.sh old mode 100644 new mode 100755 index 12e5632ad2..64b1dd75b1 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_finetune_1p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_finetune_1p.sh @@ -126,7 +126,8 @@ echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` #打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" +#echo "Final Train Accuracy : ${train_accuracy}" +echo "Final Train Accuracy : Execute the eval script to get precision" echo "E2E Training Duration sec : $e2e_time" #性能看护结果汇总 diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_1p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_1p.sh old mode 100644 new mode 100755 index 39e2d2d0a1..102db34dd6 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_1p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_1p.sh @@ -92,14 +92,15 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'` +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END 
{print}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` #打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" +#echo "Final Train Accuracy : ${train_accuracy}" +echo "Final Train Accuracy : Execute the eval script to get precision" echo "E2E Training Duration sec : $e2e_time" #性能看护结果汇总 @@ -115,7 +116,7 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep Train: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh index 1124398895..92ca971dcc 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh @@ -105,14 +105,15 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'` +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` #打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" +#echo "Final Train Accuracy : ${train_accuracy}" +echo "Final Train Accuracy : Execute the eval script to get precision" echo "E2E Training Duration sec : $e2e_time" #性能看护结果汇总 @@ -128,7 +129,7 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep Train: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_1p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_1p.sh old mode 100644 new mode 100755 index 
267d9dab84..29286d886d --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_1p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_1p.sh @@ -92,14 +92,15 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'` +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` #打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" +#echo "Final Train Accuracy : ${train_accuracy}" +echo "Final Train Accuracy : Execute the eval script to get precision" echo "E2E Training Duration sec : $e2e_time" #性能看护结果汇总 @@ -115,7 +116,7 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep Train: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh index 1c6cc724dc..8e41166189 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh @@ -104,14 +104,15 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'` +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` #打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" +#echo "Final Train Accuracy : ${train_accuracy}" +echo "Final Train Accuracy : Execute the eval script to get precision" echo "E2E Training Duration sec : $e2e_time" #性能看护结果汇总 @@ -127,7 +128,7 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep Train: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "loss" '{print $NF}' | awk -F " " 
'{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` -- Gitee From d17e099b7b1316b37f88eccf0e047060ec5e221f Mon Sep 17 00:00:00 2001 From: woey <876362620@qq.com> Date: Wed, 30 Nov 2022 20:33:56 +0800 Subject: [PATCH 07/10] =?UTF-8?q?=E2=80=98=E8=84=9A=E6=9C=AC=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E2=80=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../DilatedResidualNetworks/segment.py | 17 +++++------------ .../test/train_full_8p.sh | 2 +- .../test/train_performance_8p.sh | 2 +- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py index cf99d45a9f..a1c11a1c67 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py @@ -394,7 +394,7 @@ def train_seg(gpu, ngpus_per_node, args): if args.gpu is not None: print("Use GPU: {} for training".format(args.gpu)) - + args.world_size = int(os.environ["WORLD_SIZE"]) if args.distributed: if args.dist_url == "env://" and args.rank == -1: args.rank = int(os.environ["RANK"]) @@ -1029,17 +1029,10 @@ def main(): ngpus_per_node = 1 print('ngpus_per_node:', ngpus_per_node) - if args.multiprocessing_distributed: - args.world_size = ngpus_per_node * args.world_size - if args.cmd == 'train': - mp.spawn(train_seg, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) - elif args.cmd == 'test': - mp.spawn(test_seg, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) - else: - if args.cmd == 'train': - train_seg(args.gpu, ngpus_per_node, args) - elif args.cmd == 'test': - test_seg(args.gpu, ngpus_per_node, args) + if args.cmd == 'train': + train_seg(args.gpu, ngpus_per_node, args) + elif args.cmd == 'test': + test_seg(args.gpu, ngpus_per_node, args) if __name__ == '__main__': diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh index 92ca971dcc..d1e9cffaf9 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh @@ -88,7 +88,7 @@ do --seed 1334 \ --loss-scale '128' \ --opt-level O2 \ - --gpu 0 \ + --gpu ${RANK_ID} \ --multiprocessing-distributed \ --device_list '0,1,2,3,4,5,6,7' \ > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh index 8e41166189..11eefc6e4c 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh @@ -87,7 +87,7 @@ do --amp \ --loss-scale '128' \ --opt-level O2 \ - --gpu 0 \ + --gpu ${RANK_ID} \ 
--multiprocessing-distributed \ --device_list '0,1,2,3,4,5,6,7' \ > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & -- Gitee From b064af761ac159f3185e084f5a1807372e5c9534 Mon Sep 17 00:00:00 2001 From: woey <876362620@qq.com> Date: Wed, 30 Nov 2022 21:00:09 +0800 Subject: [PATCH 08/10] =?UTF-8?q?readme=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cv/semantic_segmentation/DilatedResidualNetworks/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md index 4e741ae06f..353569118a 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/README.md @@ -167,7 +167,7 @@ DRN是在卷积网络中保持空间分辨率以进行图像分类和分割的 ## 获取预训练模型 -运行代码会自动下载与训练模型,如果无法下载,可以在此处下载预训练 DRN 模型:http://dl.yf.io/drn/drn_c_26-ddedf421.pth,放入/root/.cache/torch/checkpoints/ +运行代码会自动下载与训练模型,如果无法下载,可以在此处下载预训练 DRN 模型:http://dl.yf.io/drn/drn_c_26-ddedf421.pth,放入默认路径下,通常为:/root/.cache/torch/hub/checkpoints/ # 开始训练 -- Gitee From c41c3d4f30265c26845420ff29241299c4154fc7 Mon Sep 17 00:00:00 2001 From: woey <876362620@qq.com> Date: Thu, 1 Dec 2022 19:03:33 +0800 Subject: [PATCH 09/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../DilatedResidualNetworks/segment.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py index a1c11a1c67..41d9f45b89 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/segment.py @@ -394,8 +394,9 @@ def train_seg(gpu, ngpus_per_node, args): if args.gpu is not None: print("Use GPU: {} for training".format(args.gpu)) - args.world_size = int(os.environ["WORLD_SIZE"]) + if args.distributed: + args.world_size = int(os.environ["WORLD_SIZE"]) if args.dist_url == "env://" and args.rank == -1: args.rank = int(os.environ["RANK"]) if args.multiprocessing_distributed: @@ -456,14 +457,14 @@ def train_seg(gpu, ngpus_per_node, args): print("before : model = torch.nn.DataParallel(model).cuda()") # define loss function (criterion) and pptimizer - optimizer = torch.optim.SGD(single_model.optim_parameters(), + optimizer = apex.optimizers.NpuFusedSGD(single_model.optim_parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) if args.amp: model, optimizer = amp.initialize( - model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale) + model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale, combine_grad=True) if args.distributed: if args.gpu is not None: @@ -475,8 +476,7 @@ def train_seg(gpu, ngpus_per_node, args): else: model = torch.nn.parallel.DistributedDataParallel(model) elif args.gpu is not None: - print("[gpu id:", args.gpu, "]", - "============================test elif args.gpu is not None:==========================") + print("[gpu id:", args.gpu, "]") else: if args.device == 'npu': loc = 'npu:{}'.format(args.gpu) @@ -586,7 +586,7 @@ def train_seg(gpu, ngpus_per_node, args): best_prec1 = max(prec1, best_prec1) if args.device == 'npu' and args.gpu == 0 and 
epoch == args.epochs - 1: - print("Complete 250 epoch training, take time:{}h".format(round((time.time() - start_time) / 3600.0, 2))) + print("Complete training, take time:{}h".format(round((time.time() - start_time) / 3600.0, 2))) if not args.multiprocessing_distributed or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0): checkpoint_path = os.path.join(args.save_path, 'checkpoint_latest.pth.tar') -- Gitee From 29b4967f981698cfa46ecf3aa638d212a6d71430 Mon Sep 17 00:00:00 2001 From: woey <876362620@qq.com> Date: Fri, 9 Dec 2022 19:12:59 +0800 Subject: [PATCH 10/10] =?UTF-8?q?=E8=84=9A=E6=9C=AC=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E5=8F=82=E6=95=B0step?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../DilatedResidualNetworks/test/train_finetune_1p.sh | 1 + .../DilatedResidualNetworks/test/train_full_1p.sh | 1 + .../DilatedResidualNetworks/test/train_full_8p.sh | 1 + .../DilatedResidualNetworks/test/train_performance_1p.sh | 1 + .../DilatedResidualNetworks/test/train_performance_8p.sh | 1 + 5 files changed, 5 insertions(+) diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_finetune_1p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_finetune_1p.sh index 64b1dd75b1..ed5ef44e17 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_finetune_1p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_finetune_1p.sh @@ -97,6 +97,7 @@ nohup python3.7 segment.py train \ --epochs 250 \ --lr 0.01 \ --momentum 0.9 \ + --step 100 \ --save_iter 10 \ --save_path './checkpoints' \ --device 'npu' \ diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_1p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_1p.sh index 102db34dd6..19c16732d2 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_1p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_1p.sh @@ -71,6 +71,7 @@ nohup python3.7 segment.py train \ --epochs 250 \ --lr 0.01 \ --momentum 0.9 \ + --step 100 \ --save_iter 10 \ --save_path './checkpoints' \ --device 'npu' \ diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh index d1e9cffaf9..2596c3c7b4 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_full_8p.sh @@ -80,6 +80,7 @@ do --epochs 250 \ --lr 0.01 \ --momentum 0.9 \ + --step 100 \ --save_iter 10 \ --save_path './checkpoints' \ --device 'npu' \ diff --git a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_1p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_1p.sh index 29286d886d..098c436c04 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_1p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_1p.sh @@ -71,6 +71,7 @@ nohup python3.7 segment.py train \ --epochs 2 \ --lr 0.01 \ --momentum 0.9 \ + --step 100 \ --save_iter 10 \ --save_path './checkpoints' \ --device 'npu' \ diff --git 
a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh index 11eefc6e4c..fc6d1a13ee 100755 --- a/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh +++ b/PyTorch/contrib/cv/semantic_segmentation/DilatedResidualNetworks/test/train_performance_8p.sh @@ -80,6 +80,7 @@ do --epochs 2 \ --lr 0.01 \ --momentum 0.9 \ + --step 100 \ --save_iter 10 \ --save_path './checkpoints' \ --device 'npu' \ -- Gitee
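A few notes on the techniques the test scripts in this series rely on. The 8p launchers bind each training process to a disjoint slice of host cores with taskset before backgrounding it. Below is a minimal sketch of that binding arithmetic, assuming an 8-rank job on a host whose core count divides evenly by 8; the echo stands in for the real python3.7 segment.py command.

```bash
#!/bin/bash
# Per-rank CPU binding as used in train_performance_8p.sh / train_full_8p.sh.
RANK_SIZE=8
for ((RANK_ID = 0; RANK_ID < RANK_SIZE; RANK_ID++)); do
    KERNEL_NUM=$(( $(nproc) / RANK_SIZE ))     # host cores available per rank
    PID_START=$(( KERNEL_NUM * RANK_ID ))      # first core of this rank's slice
    PID_END=$(( PID_START + KERNEL_NUM - 1 ))  # last core of this rank's slice
    # The committed scripts wrap the trainer like:
    #   taskset -c $PID_START-$PID_END python3.7 segment.py train ... &
    echo "rank ${RANK_ID} pinned to cores ${PID_START}-${PID_END}"
done
```

Pinning each rank to its own core range keeps the eight data-loader worker pools from contending for the same cores.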
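Patch 06 changes the throughput parsing from field 3 to field 11 of the FPS log line, which implies the print format in segment.py changed between revisions. Here is a sketch of the pipeline, using a hypothetical log line laid out so that the throughput lands in the eleventh whitespace-separated field; the real field positions must be checked against segment.py's actual output.

```bash
# Hypothetical training-log line; only its field layout matters here.
log='Epoch: [250][370/371] Time 0.710 Data 0.002 Loss 0.8432 (0.9011) FPS 95.236'
# Same pipeline as the test scripts: keep FPS lines, take field 11 of the last one.
FPS=$(echo "$log" | grep -a 'FPS' | awk -F " " '{print $11}' | awk 'END {print}')
echo "Final Performance images/sec : $FPS"   # prints 95.236
```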
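All of the test scripts derive per-iteration latency from throughput as batch_size * 1000 / FPS, i.e. milliseconds per training step at the given batch size. A standalone sketch of that arithmetic with illustrative numbers:

```bash
batch_size=32
FPS=95.236   # images/sec, as parsed from the training log
# Same awk one-liner as in the scripts: ms consumed per training iteration.
TrainingTime=$(awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}')
echo "TrainingTime = ${TrainingTime}"   # prints 336.01
```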
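Patch 06 also reworks loss extraction: training lines are now matched on Epoch: rather than Train:, and the running loss is taken as the first token after the Loss marker. A sketch with a hypothetical log line of that shape:

```bash
log='Epoch: [1][100/371] Time 0.710 Data 0.002 Loss 0.8432 (0.9011) FPS 95.236'
# Keep training lines, drop Test lines, take the first token after "Loss".
echo "$log" | grep Epoch: | grep -v Test \
    | awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}'
# prints 0.8432; the scripts append each value to train_${CaseName}_loss.txt
# and read the final iteration's loss back with: awk 'END {print}'
```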
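Finally, patch 07 swaps the fixed --gpu 0 for --gpu ${RANK_ID}, so each of the eight shell-launched processes claims its own device. This matches the removal of mp.spawn from segment.py: the shell loop, not Python, now fans out one process per rank, and segment.py reads WORLD_SIZE from the environment during DDP setup. An abbreviated sketch of that launch pattern; most training flags are elided, and the per-rank log name is an illustrative choice (the committed scripts write every rank to one shared log).

```bash
export RANK_SIZE=8
export WORLD_SIZE=8
for ((RANK_ID = 0; RANK_ID < RANK_SIZE; RANK_ID++)); do
    # One process per device; segment.py picks up WORLD_SIZE from the environment.
    nohup python3.7 segment.py train \
        --gpu ${RANK_ID} \
        --multiprocessing-distributed \
        --device_list '0,1,2,3,4,5,6,7' \
        > train_rank_${RANK_ID}.log 2>&1 &
done
wait
```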