From da2bdac7020ecb534b1db131c7a1c135dc1142dd Mon Sep 17 00:00:00 2001 From: xli <1184313039@qq.com> Date: Tue, 27 Dec 2022 13:10:17 +0800 Subject: [PATCH] update mask rcnn 1.8 --- .../Faster_Mask_RCNN_for_PyTorch/README.md | 64 ++++--- .../detectron2/layers/mask_ops.py | 6 +- .../detectron2/utils/memory.py | 5 +- .../test/train_full_8p.sh | 4 +- .../test/train_mask_rcnn_performance_1p.sh | 146 +++++++++++++++ .../test/train_mask_rcnn_performance_8p.sh | 167 ++++++++++++++++++ 6 files changed, 363 insertions(+), 29 deletions(-) create mode 100644 PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/test/train_mask_rcnn_performance_1p.sh create mode 100644 PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/test/train_mask_rcnn_performance_8p.sh diff --git a/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/README.md b/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/README.md index a51b47c93f..0d1a33d801 100644 --- a/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/README.md +++ b/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/README.md @@ -47,9 +47,9 @@ FasterRCNN是一个业界领先的目标检测网络,他继承了FastRCNN的 | 配套 | 版本 | | ---------- | ------------------------------------------------------------ | - | 固件与驱动 | [5.1.RC2](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) | - | CANN | [5.1.RC2](https://www.hiascend.com/software/cann/commercial?version=5.1.RC2) | - | PyTorch | [1.8.1](https://gitee.com/ascend/pytorch/tree/master/)| + | 固件与驱动 | [6.0.0.alpha002](https://www.hiascend.com/zh/hardware/firmware-drivers?tag=community&pId=5&mId=6&cann=23e6d0e47a1911edade3fa163edea0c5&ver=1.0.13.alpha) | + | CANN | [6.0.0.alpha002](https://www.hiascend.com/software/cann/community) | + | PyTorch | [1.8.1](https://gitee.com/ascend/pytorch/tree/master/) | - 环境准备指导。 @@ -115,24 +115,33 @@ python3.7 -m pip install -e Faster_Mask_RCNN_for_PyTorch 该模型支持单机单卡训练和单机8卡训练。 mask_rcnn启动训练 - - 单机单卡训练 - - 启动单卡训练。 - + + - 单机单卡性能 + + 启动1卡性能。 +
``` - bash ./test/train_full_1p.sh --data_path=数据集路径 + bash ./test/train_mask_rcnn_performance_1p.sh --data_path=数据集路径 ``` - + - 单机8卡训练 - + 启动8卡训练。 - + ``` bash ./test/train_full_8p.sh --data_path=数据集路径 ``` - - - 多机多卡性能数据获取流程 + - 单机8卡性能 + + 启动8卡性能。 + + ``` + bash ./test/train_mask_rcnn_performance_8p.sh --data_path=数据集路径 + ``` + + - 多机多卡性能数据获取流程 + ```shell 1. 安装环境 2. 开始训练,每个机器所请按下面提示进行配置 @@ -141,21 +150,21 @@ python3.7 -m pip install -e Faster_Mask_RCNN_for_PyTorch faster_rcnn启动训练 - 单机单卡训练 - + 启动单卡训练。 - + ``` bash ./test/train_faster_rcnn_full_1p.sh --data_path=数据集路径 ``` - + - 单机8卡训练 - + 启动8卡训练。 - + ``` bash ./test/train_faster_rcnn_full_8p.sh --data_path=数据集路径 ``` - + - 多机多卡性能数据获取流程 ```shell @@ -163,11 +172,11 @@ python3.7 -m pip install -e Faster_Mask_RCNN_for_PyTorch 2. 开始训练,每个机器所请按下面提示进行配置 bash ./test/train_faster_rcnn_performance_multinodes.sh --data_path=数据集路径 --batch_size=单卡batch_size*所有卡数 --nnodes=机器总数量 --node_rank=当前机器rank(0,1,2..) --local_addr=当前机器IP(需要和master_addr处于同一网段) --master_addr=主节点IP ``` - + --data\_path参数填写数据集路径。 - + 模型训练脚本参数说明如下。 - + ``` 公共参数: AMP # 开启混合精度 @@ -187,7 +196,7 @@ python3.7 -m pip install -e Faster_Mask_RCNN_for_PyTorch # 训练结果展示 -**表 2** 训练结果展示表 +**表 2** faster_rcnn训练结果展示表 | NAME | Acc@1 | FPS | Epochs | AMP_Type | | ------- | ----- | ---: | ------ | -------: | @@ -196,6 +205,15 @@ python3.7 -m pip install -e Faster_Mask_RCNN_for_PyTorch | 8p-NPU1.5 | 26.773 | 76.5 | - | O2 | | 8p-NPU1.8 | 27 | 86.3 | - | O2 | +**表 3** mask rcnn 训练结果展示表 + +| NAME | Acc@1 | FPS | Epochs | AMP_Type | +| --------- | ----- | ------: | ------ | -------: | +| 1p-NPU1.5 | - | 6.531 | - | O2 | +| 1p-NPU1.8 | - | 6.538 | - | O2 | +| 8p-NPU1.5 | 26.6 | 31.8831 | - | O2 | +| 8p-NPU1.8 | 27.1 | 32.9152 | - | O2 | + # 版本说明 ## 变更 diff --git a/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/detectron2/layers/mask_ops.py b/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/detectron2/layers/mask_ops.py index f6ad215987..dfd1212937 100755 ---
a/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/detectron2/layers/mask_ops.py +++ b/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/detectron2/layers/mask_ops.py @@ -71,9 +71,9 @@ def _do_paste_mask(masks, boxes, img_h, img_w, skip_empty=True): gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1)) gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1)) grid = torch.stack([gx, gy], dim=3) - - img_masks = F.grid_sample(masks.to(dtype=torch.float32), grid, align_corners=False) - + + img_masks = F.grid_sample(masks.to(dtype=torch.float32), grid, padding_mode='border', align_corners=False) + if skip_empty: return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int)) else: diff --git a/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/detectron2/utils/memory.py b/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/detectron2/utils/memory.py index ca782a292e..0838c00a7a 100755 --- a/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/detectron2/utils/memory.py +++ b/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/detectron2/utils/memory.py @@ -79,6 +79,7 @@ def retry_if_cuda_oom(func): @wraps(func) def wrapped(*args, **kwargs): + logger = logging.getLogger(__name__) with _ignore_torch_cuda_oom(): return func(*args, **kwargs) @@ -86,9 +87,9 @@ def retry_if_cuda_oom(func): torch.cuda.empty_cache() with _ignore_torch_cuda_oom(): return func(*args, **kwargs) - + # Try on CPU. This slows down the code significantly, therefore print a notice. 
- logger = logging.getLogger(__name__) + logger.info("Attempting to copy inputs of {} to CPU due to CUDA OOM".format(str(func))) new_args = (maybe_to_cpu(x) for x in args) new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} diff --git a/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/test/train_full_8p.sh index 6cf024f8ab..2394b4c75f 100644 --- a/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/test/train_full_8p.sh @@ -12,6 +12,8 @@ else fi # 指定训练所使用的npu device卡id +Network="Mask_RCNN_for_PyTorch" +export RANK_SIZE=8 device_id=0 batch_size=64 #参数校验,不需要修改 @@ -70,7 +72,7 @@ nohup python3.7 tools/train_net.py \ OPT_LEVEL O2 \ LOSS_SCALE_VALUE 64 \ SOLVER.IMS_PER_BATCH 64 \ - SOLVER.MAX_ITER 10250 \ + SOLVER.MAX_ITER 12250 \ SEED 1234 \ MODEL.RPN.NMS_THRESH 0.8 \ MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO 2 \ diff --git a/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/test/train_mask_rcnn_performance_1p.sh b/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/test/train_mask_rcnn_performance_1p.sh new file mode 100644 index 0000000000..f781e365fa --- /dev/null +++ b/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/test/train_mask_rcnn_performance_1p.sh @@ -0,0 +1,146 @@ +#!/bin/bash + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi +#集合通信参数,不需要修改 +export RANK_SIZE=1 + + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="Mask_RCNN_for_PyTorch" +#训练batch_size +batch_size=8 +#训练step +train_steps=4000 + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi +# 指定训练所使用的npu device卡id +device_id=0 + +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + echo "[Error] device id must be config" + exit 1 +fi +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#修改参数 +sed -i "s|\"coco_2017_train\": (\"coco/train2017\", \"coco/annotations/instances_train2017.json\")|\"coco_2017_train\": (\"$data_path/coco/train2017\", \"$data_path/coco/annotations/instances_train2017.json\")|g" $cur_path/detectron2/data/datasets/builtin.py +sed -i "s|\"coco_2017_val\": (\"coco/val2017\", \"coco/annotations/instances_val2017.json\")|\"coco_2017_val\": (\"$data_path/coco/val2017\", \"$data_path/coco/annotations/instances_val2017.json\")|g" $cur_path/detectron2/data/datasets/builtin.py +sed -i "s|WEIGHTS: \"detectron2://ImageNetPretrained/MSRA/R-101.pkl\"|WEIGHTS: \"$data_path/R-101.pkl\"|g" $cur_path/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml +wait + +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +
+#训练开始时间,不需要修改 +start_time=$(date +%s) +nohup python3.7 tools/train_net.py \ + --config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml \ + AMP 1 \ + OPT_LEVEL O2 \ + LOSS_SCALE_VALUE 64 \ + MODEL.DEVICE npu:$ASCEND_DEVICE_ID \ + SOLVER.IMS_PER_BATCH $batch_size \ + SOLVER.MAX_ITER $train_steps \ + SEED 1234 \ + MODEL.RPN.NMS_THRESH 0.8 \ + MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO 2 \ + MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO 2 \ + DATALOADER.NUM_WORKERS 8 \ + SOLVER.BASE_LR 0.0025 > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +wait + +#修改参数 +sed -i "s|\"coco_2017_train\": (\"$data_path/coco/train2017\", \"$data_path/coco/annotations/instances_train2017.json\")|\"coco_2017_train\": (\"coco/train2017\", \"coco/annotations/instances_train2017.json\")|g" $cur_path/detectron2/data/datasets/builtin.py +sed -i "s|\"coco_2017_val\": (\"$data_path/coco/val2017\", \"$data_path/coco/annotations/instances_val2017.json\")|\"coco_2017_val\": (\"coco/val2017\", \"coco/annotations/instances_val2017.json\")|g" $cur_path/detectron2/data/datasets/builtin.py +sed -i "s|WEIGHTS: \"$data_path/R-101.pkl\"|WEIGHTS: \"detectron2://ImageNetPretrained/MSRA/R-101.pkl\"|g" $cur_path/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep FPS $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $NF}'|awk '{sum+=$1} END {print sum/NR}'` + +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep "Average Precision" $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "=" '{print $NF}'|head -1` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} 
+DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep total_loss $test_path_dir/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'total_loss: ' '{print $2}'|awk '{print $1}' > $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/test/train_mask_rcnn_performance_8p.sh b/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/test/train_mask_rcnn_performance_8p.sh new file mode 100644 index 0000000000..5b6e8a0a35 --- /dev/null +++ b/PyTorch/built-in/cv/detection/Faster_Mask_RCNN_for_PyTorch/test/train_mask_rcnn_performance_8p.sh @@
-0,0 +1,167 @@ +#!/bin/bash + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +#集合通信参数,不需要修改 +export RANK_SIZE=8 + +# 数据集路径,保持为空,不需要修改 +data_path="" + + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="Mask_RCNN_for_PyTorch" +#训练epoch +train_epochs=1 +#训练batch_size +batch_size=64 +#训练step +train_steps=200 + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/test/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/test/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/test/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 指定训练所使用的npu device卡id +device_id=0 + +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + echo "[Error] device id must be config" + exit 1 +fi + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir
-p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt +fi + + +#修改参数 +sed -i "s|\"coco_2017_train\": (\"coco/train2017\", \"coco/annotations/instances_train2017.json\")|\"coco_2017_train\": (\"$data_path/coco/train2017\", \"$data_path/coco/annotations/instances_train2017.json\")|g" $cur_path/detectron2/data/datasets/builtin.py +sed -i "s|\"coco_2017_val\": (\"coco/val2017\", \"coco/annotations/instances_val2017.json\")|\"coco_2017_val\": (\"$data_path/coco/val2017\", \"$data_path/coco/annotations/instances_val2017.json\")|g" $cur_path/detectron2/data/datasets/builtin.py +sed -i "s|WEIGHTS: \"detectron2://ImageNetPretrained/MSRA/R-101.pkl\"|WEIGHTS: \"$data_path/R-101.pkl\"|g" $cur_path/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml +wait + +cd $cur_path/ +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) +nohup python3.7 tools/train_net.py \ + --config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml \ + --device-ids 0 1 2 3 4 5 6 7 \ + --num-gpus 8 \ + AMP 1 \ + OPT_LEVEL O2 \ + LOSS_SCALE_VALUE 64 \ + SOLVER.IMS_PER_BATCH $batch_size \ + SOLVER.MAX_ITER $train_steps \ + SEED 1234 \ + MODEL.RPN.NMS_THRESH 0.8 \ + MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO 2 \ + MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO 2 \ + DATALOADER.NUM_WORKERS 8 \ + SOLVER.BASE_LR 0.02 > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +wait +#修改参数 +sed -i "s|\"coco_2017_train\": (\"$data_path/coco/train2017\", \"$data_path/coco/annotations/instances_train2017.json\")|\"coco_2017_train\": (\"coco/train2017\", \"coco/annotations/instances_train2017.json\")|g" $cur_path/detectron2/data/datasets/builtin.py +sed -i "s|\"coco_2017_val\": (\"$data_path/coco/val2017\", \"$data_path/coco/annotations/instances_val2017.json\")|\"coco_2017_val\": (\"coco/val2017\", 
\"coco/annotations/instances_val2017.json\")|g" $cur_path/detectron2/data/datasets/builtin.py +sed -i "s|WEIGHTS: \"$data_path/R-101.pkl\"|WEIGHTS: \"detectron2://ImageNetPretrained/MSRA/R-101.pkl\"|g" $cur_path/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep FPS $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $NF}'|awk '{sum+=$1} END {print sum/NR}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep "Average Precision" $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "=" '{print $NF}'|head -1` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep total_loss $test_path_dir/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'total_loss: ' '{print $2}'|awk '{print $1}' > $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo 
"CaseName = ${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee