From 7a33fea97a890c03f7c9c37d0f92564833641c93 Mon Sep 17 00:00:00 2001
From: 18118216380 <3224925783@qq.com>
Date: Mon, 25 Apr 2022 18:40:48 +0800
Subject: [PATCH 1/6] 4.25

---
 .../run_squad.py                                   |   6 +-
 .../test/train_performance_16p.sh                  | 226 ++++++++++++++++++
 .../test/train_performance_16p.sh                  | 225 +++++++++++++++++
 3 files changed, 456 insertions(+), 1 deletion(-)
 create mode 100644 PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_16p.sh
 create mode 100644 PyTorch/dev/cv/image_classification/Hourglass_ID1809_for_PyTorch/test/train_performance_16p.sh

diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py
index c245d2cb11..85b3c8280d 100644
--- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py
+++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py
@@ -916,7 +916,11 @@ def main():
             os.environ['MASTER_PORT'] = '29668'
             torch.npu.set_device("npu:%d" % args.local_rank)
             device = torch.device("npu:%d" % args.local_rank)
-            torch.distributed.init_process_group(backend='hccl', world_size=8, rank=args.local_rank)
+            if args.num_npu == 16:
+                global_rank = int(os.environ['RANK'])  # global rank exported per process by test/train_performance_16p.sh; KeyError if missing
+                torch.distributed.init_process_group(backend='hccl', world_size=16, rank=global_rank)
+            else:  # keep the pre-patch 8-device default so every other num_npu value still initialises the group
+                torch.distributed.init_process_group(backend='hccl', world_size=8, rank=args.local_rank)
             n_npu = 1
         else:
             torch.cuda.set_device(args.local_rank)
diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_16p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_16p.sh
new file mode 100644
index 0000000000..ec2c8f2543
--- /dev/null
+++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_16p.sh
@@ -0,0 +1,226 @@
+#!/bin/bash
+export PATH=/usr/local/hdf5/bin:$PATH
+export LD_LIBRARY_PATH=/usr/local/hdf5/lib:$LD_LIBRARY_PATH
+export LIBRARY_PATH=/usr/local/hdf5/lib:$LIBRARY_PATH
+export CPATH=/usr/local/hdf5/include:$CPATH
+export HDF5_DISABLE_VERSION_CHECK=1
+# Current path; no changes needed
+cur_path=`pwd`
+
+# Collective-communication parameters; no changes needed
+export BMMV2_ENABLE=1
+export RANK_SIZE=8  # number of processes the launch loop below starts on this node
+export JOB_ID=10087
+RANK_ID_START=0
+
+
+# Dataset path and launcher-supplied values; keep empty, they are filled from the command line
+data_path=""
+ckpt_path=""
+conf_path=""
+server_index=""
+fix_node_ip=""
+devicesnum=""
+
+
+# Basic parameters; review and adjust per model
+# Network name, same as the directory name
+Network="Bert-Squad_ID0470_for_PyTorch"
+# Training epochs
+train_epochs=1
+# Training batch_size
+batch_size=32
+# Training steps (left empty; the launch command passes --max_steps directly)
+train_steps=
+# Learning rate
+learning_rate=2e-4
+
+
+# Debug parameters; precision_mode should be reviewed per model
+precision_mode="allow_fp32_to_fp16"
+# Fixed parameters; the following need no changes
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+# Help message (a space is required after echo, otherwise bash looks for a command named echo"usage:...)
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_performance_16p.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump if or not over detection, default is False
+    --data_dump_flag data dump flag, default is False
+    --data_dump_step data dump step, default is 10
+    --profiling if or not profiling for performance debug, default is False
+    --data_path source data of training
+    -h/--help show help message
+    "
+    exit 1
+fi
+
+# Argument parsing: every --name=value argument overrides the matching default above
+for para in $*
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --ckpt_path* ]];then
+        ckpt_path=`echo ${para#*=}`
+    elif [[ $para == --fix_node_ip* ]];then
+        fix_node_ip=`echo ${para#*=}`
+    elif [[ $para == --devicesnum* ]];then
+        devicesnum=`echo ${para#*=}`
+    elif [[ $para == --conf_path* ]];then
+        conf_path=`echo ${para#*=}`
+    elif [[ $para == --server_index* ]];then
+        server_index=`echo ${para#*=}`
+    fi
+done
+
+one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'`
+linux_num=`find $conf_path -name "server_*.info" |wc -l`
+
+# Check that data_path was passed; no changes needed
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+
+# Training start time; no changes needed
+
+
+
+export HCCL_IF_IP=$fix_node_ip
+export MASTER_ADDR=$one_node_ip
+export MASTER_PORT=29688
+export HCCL_WHITELIST_DISABLE=1
+device_num=${#devicesnum}  # character count of devicesnum -- NOTE(review): presumably a string of device ids such as 01234567; confirm with the launcher
+devices_num=`awk 'BEGIN{printf "%.0f\n",'${device_num}'-1}'`
+
+start_time=$(date +%s)
+NPUS=($(seq 0 $devices_num))
+rank_server=`awk 'BEGIN{printf "%.0f\n",'${device_num}'*'${server_index}'}'`
+export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",'${device_num}'*'${linux_num}'}'`
+
+
+# Enter the training script directory; review per model
+cd $cur_path/../
+rank=0
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    # Set environment variables; no changes needed
+    export ASCEND_DEVICE_ID=$RANK_ID
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+    export RANK=`awk 'BEGIN{printf "%.0f\n",'${rank}'+'${rank_server}'}'`  # global rank = local index + device_num * server_index
+
+
+    # Create the per-device output directory (removed first if it already exists); no changes needed
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    else
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    fi
+
+
+    # Run the training script; the arguments listed below need no changes, others should be reviewed per model
+    #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path
+    nohup python3.7 run_squad.py \
+        --init_checkpoint ${ckpt_path}/bert_large_pretrained_amp.pt \
+        --bert_model bert-large-uncased \
+        --do_train \
+        --train_file ${data_path}/train-v1.1-min.json \
+        --train_batch_size ${batch_size} \
+        --do_predict \
+        --predict_batch_size ${batch_size} \
+        --predict_file ${data_path}/dev-v1.1.json \
+        --learning_rate ${learning_rate} \
+        --num_train_epochs ${train_epochs} \
+        --seed 1 \
+        --fp16 \
+        --max_steps 100 \
+        --use_npu \
+        --loss_scale 4096 \
+        --vocab_file "data/uncased_L-24_H-1024_A-16/vocab.txt" \
+        --do_eval \
+        --eval_script ${data_path}/evaluate-v1.1.py \
+        --npu_id ${ASCEND_DEVICE_ID} \
+        --do_lower_case \
+        --output_dir ${cur_path}/../results \
+        --config_file bert_config.json \
+        --num_npu 16 \
+        --local_rank=$RANK_ID \
+        --addr $one_node_ip \
+        --json-summary ${cur_path}/output/${ASCEND_DEVICE_ID}/dllogger.json> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+    let rank++
+done
+wait
+
+# Training end time; no changes needed
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no changes needed
+echo "------------------ Final result ------------------"
+# Compute performance FPS from the device-0 log; review per model
+step_time=`grep 'step_time : ' $cur_path/output/0/train_0.log| awk '{print$13}'| tail -n +3 |awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g`
+
+FPS=`awk 'BEGIN{printf "%.2f\n", '$batch_size'/'$step_time'*16}'`
+
+# Print; no changes needed
+echo "Final Performance images/sec : $FPS"
+
+# Extract training accuracy; review per model
+train_accuracy=`grep 'F1 : ' $cur_path/output/0/train_0.log|awk '{print $10}'`
+# Print; no changes needed
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring summary
+# Test-case information; no changes needed
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'  # NOTE(review): RANK_SIZE is 8, so the case name reads 8p although --num_npu is 16
+
+## Collect performance data; no changes needed
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep -r "step_loss :" $cur_path/output/0/train_0.log | awk '{print $19}' > $cur_path/output/0/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no changes needed
+ActualLoss=`awk 'END {print}' $cur_path/output/0/train_${CaseName}_loss.txt`
+
+# Write key information to ${CaseName}.log; no changes needed
+echo "Network = ${Network}" > $cur_path/output/0/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/0/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/0/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/0/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/0/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/0/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/0/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/0/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/0/${CaseName}.log
+rm -rf ${data_path}/train-v1.1-min.json_bert-large-uncased_384_128_64
+export BMMV2_ENABLE=0
\ No newline at end of file
diff --git a/PyTorch/dev/cv/image_classification/Hourglass_ID1809_for_PyTorch/test/train_performance_16p.sh b/PyTorch/dev/cv/image_classification/Hourglass_ID1809_for_PyTorch/test/train_performance_16p.sh
new file mode 100644
index 0000000000..ccc9ac9928
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/Hourglass_ID1809_for_PyTorch/test/train_performance_16p.sh
@@ -0,0 +1,225 @@
+#!/bin/bash
+
+# Current path; no changes needed
+cur_path=`pwd`
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+#export NPU_CALCULATE_DEVICE=$ASCEND_DEVICE_ID
+# Collective-communication parameters; no changes needed
+
+export RANK_SIZE=8
+export JOB_ID=10087
+RANK_ID_START=0
+
+
+# Dataset path and launcher-supplied values; keep empty, they are filled from the command line
+data_path=""
+conf_path=""
+server_index=""
+fix_node_ip=""
+devicesnum=""
+
+# Basic parameters; review and adjust per model
+# Network name, same as the directory name
+Network="Hourglass_ID1809_for_PyTorch"
+# Training epochs
+train_epochs=1
+# Training batch_size
+batch_size=16
+# Training steps
+train_steps=10
+# Learning rate
+learning_rate=1e-3
+
+# TF2.X only; no changes needed
+#export NPU_LOOP_SIZE=${train_steps}
+
+# Debug parameters; precision_mode should be reviewed per model
+precision_mode="allow_mix_precision"
+# Fixed parameters; the following need no changes
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+autotune=False
+
+# Help message (a space is required after echo, otherwise bash looks for a command named echo"usage:...)
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_performance_16p.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump if or not over detection, default is False
+    --data_dump_flag data dump flag, default is False
+    --data_dump_step data dump step, default is 10
+    --profiling if or not profiling for performance debug, default is False
+    --data_path source data of training
+    -h/--help show help message
+    "
+    exit 1
+fi
+
+# Argument parsing: every --name=value argument overrides the matching default above
+for para in $*
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --fix_node_ip* ]];then
+        fix_node_ip=`echo ${para#*=}`
+    elif [[ $para == --devicesnum* ]];then
+        devicesnum=`echo ${para#*=}`
+    elif [[ $para == --conf_path* ]];then
+        conf_path=`echo ${para#*=}`
+    elif [[ $para == --server_index* ]];then
+        server_index=`echo ${para#*=}`
+    fi
+done
+
+one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'`
+linux_num=`find $conf_path -name "server_*.info" |wc -l`
+
+# Check that data_path was passed; no changes needed
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+
+# Training start time; no changes needed
+start_time=$(date +%s)
+
+# Enter the training script directory and patch the run configuration in place; review per model
+cd $cur_path/../
+sed -i "s|'batchsize': 16|'batchsize': $batch_size|g" $cur_path/../task/pose.py
+sed -i "s|'learning_rate': 1e-3|'learning_rate': $learning_rate|g" $cur_path/../task/pose.py
+sed -i "s|'epoch_num': 200|'epoch_num': $train_epochs|g" $cur_path/../task/pose.py
+sed -i "s|'train_iters': 1000|'train_iters': $train_steps|g" $cur_path/../task/pose.py
+sed -i "s|annot_dir = 'data/MPII/annot'|annot_dir = '$data_path/data/MPII/annot'|g" $cur_path/../datat/MPII/ref.py
+sed -i "s|img_dir = 'data/MPII/images'|img_dir = '$data_path/data/MPII/images'|g" $cur_path/../datat/MPII/ref.py
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    # Set environment variables; ASCEND_DEVICE_ID must be set here, otherwise the rm -rf below targets the whole output/ directory
+    export ASCEND_DEVICE_ID=$RANK_ID
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+
+
+    # Create the per-device output directory (removed first if it already exists); no changes needed
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    else
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    fi
+
+    # CPU core binding; delete for models that do not need it, adjust for those that do
+    #cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'`
+    #cpustep=`expr $cpucount / 8`
+    #echo "taskset c steps:" $cpustep
+    #let a=RANK_ID*$cpustep
+    #let b=RANK_ID+1
+    #let c=b*$cpustep-1
+
+
+    # Training is launched by the second loop below; this loop only prepares the output directories
+done
+wait
+
+
+export HCCL_IF_IP=$fix_node_ip
+export MASTER_ADDR=$one_node_ip
+export MASTER_PORT=29688
+export HCCL_WHITELIST_DISABLE=1
+device_num=${#devicesnum}  # character count of devicesnum -- NOTE(review): presumably a string of device ids such as 01234567; confirm with the launcher
+devices_num=`awk 'BEGIN{printf "%.0f\n",'${device_num}'-1}'`
+
+start_time=$(date +%s)
+NPUS=($(seq 0 $devices_num))
+rank_server=`awk 'BEGIN{printf "%.0f\n",'${device_num}'*'${server_index}'}'`
+export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",'${device_num}'*'${linux_num}'}'`
+
+rank=0
+for i in ${NPUS[@]}
+do
+    mkdir -p $cur_path/output/${i}/
+    export NPU_CALCULATE_DEVICE=${i}
+    export ASCEND_DEVICE_ID=${i}
+    export RANK=`awk 'BEGIN{printf "%.0f\n",'${rank}'+'${rank_server}'}'`  # global rank = local index + device_num * server_index
+    echo run process ${rank}
+
+    nohup python3 train.py -e test_run_001 --ddp True > $cur_path/output/${i}/train_${i}.log 2>&1 &
+    let rank++
+done
+wait
+
+
+sed -i "s|'batchsize': $batch_size|'batchsize': 16|g" $cur_path/../task/pose.py
+sed -i "s|'learning_rate': $learning_rate|'learning_rate': 1e-3|g" $cur_path/../task/pose.py
+sed -i "s|'epoch_num': $train_epochs|'epoch_num': 200|g" $cur_path/../task/pose.py
+sed -i "s|'train_iters': $train_steps|'train_iters': 1000|g" $cur_path/../task/pose.py
+sed -i "s|annot_dir = '$data_path/data/MPII/annot'|annot_dir = 'data/MPII/annot'|g" $cur_path/../datat/MPII/ref.py
+sed -i "s|img_dir = '$data_path/data/MPII/images'|img_dir = 'data/MPII/images'|g" $cur_path/../datat/MPII/ref.py
+#conda deactivate
+# Training end time; no changes needed
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+sed -i "s|\r|\n|g" ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log
+# Print results; no changes needed
+echo "------------------ Final result ------------------"
+# Compute performance FPS from the log of the last launched device; review per model
+FPS=`grep "fps:" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "fps: " '{print $2}'|awk '{sum+=$1} END {print"",sum/NR}'|sed s/[[:space:]]//g`
+FPS=$(awk 'BEGIN{print '$FPS'*16}')
+
+# Print; no changes needed
+echo "Final Performance images/sec : $FPS"
+
+# Extract training accuracy; review per model
+#train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'| sed 's/,//g' |cut -c 1-5`
+# Print; no changes needed
+#echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Stability/accuracy monitoring summary
+# Test-case information; no changes needed
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+## Collect performance data
+# Throughput; no changes needed
+ActualFPS=${FPS}
+# Training time per iteration; no changes needed
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep "the loss is: " $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "the loss is: " '{print $2}'|sed s/[[:space:]]//g > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no changes needed
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Write key information to ${CaseName}.log; no changes needed
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+#echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+-- 
+Gitee
+
+
+From cbc3d60000a23779e995be0098b0f70f504b4182 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?=E7=8E=8B=E5=9B=BD=E5=BA=86?= <3224925783@qq.com>
+Date: Mon, 25 Apr 2022 10:48:01 +0000
+Subject: [PATCH 2/6] add PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh.
+
--- .../test/train_performance_16p.sh | 213 ++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh diff --git a/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh b/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh new file mode 100644 index 0000000000..ad5b68f395 --- /dev/null +++ b/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh @@ -0,0 +1,213 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` +#source env.sh +#集合通信参数,不需要修改 + +export RANK_SIZE=8 +export JOB_ID=10087 +RANK_ID_START=0 + + +# 数据集路径,保持为空,不需要修改 +data_path="" +conf_path="" +server_index="" +fix_node_ip="" +devicesnum="" + +#设置默认日志级别,不需要修改 +export ASCEND_GLOBAL_LOG_LEVEL=3 + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="ResNet152_ID0424_for_PyTorch" +#训练epoch +train_epochs=1 +#训练batch_size +batch_size=4096 +#训练step +#train_steps=`expr 1281167 / ${batch_size}` +#学习率 +learning_rate=0.5 + +#TF2.X独有,需要模型审视修改 +#export NPU_LOOP_SIZE=${train_steps} + +#维测参数,precision_mode需要模型审视修改 +#precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ 
$para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + elif [[ $para == --devicesnum* ]];then + devicesnum=`echo ${para#*=}` + elif [[ $para == --conf_path* ]];then + conf_path=`echo ${para#*=}` + elif [[ $para == --server_index* ]];then + server_index=`echo ${para#*=}` + fi +done + +one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +linux_num=`find $conf_path -name "server_*.info" |wc -l` + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +export HCCL_IF_IP=$fix_node_ip +export MASTER_ADDR=$one_node_ip +export MASTER_PORT=29688 +export HCCL_WHITELIST_DISABLE=1 +device_num=${#devicesnum} +devices_num=`awk 'BEGIN{printf "%.0f\n",'${device_num}'-1}'` + +NPUS=($(seq 0 $devices_num)) +rank_server=`awk 'BEGIN{printf "%.0f\n",'${device_num}'*'${server_index}'}'` +export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",'${device_num}'*'${linux_num}'}'` + + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/.. 
+rank=0 +for((RANK_ID=$RANK_ID_START;RANK_ID<1;RANK_ID++)); +do + #设置环境变量,不需要修改 + ASCEND_DEVICE_ID=$RANK_ID + echo "Device ID: $ASCEND_DEVICE_ID" + + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + #let a=RANK_ID*12 + #let b=RANK_ID+1 + #let c=b*12-1 + + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path + nohup python3 ${cur_path}/../main.py \ + ${data_path} \ + -a resnet152 \ + --addr=$one_node_ip \ + --seed=49 \ + --workers=$(nproc) \ + --learning-rate=${learning_rate} \ + --mom=0.9 \ + --weight-decay=1.0e-04 \ + --print-freq=1 \ + --dist-url='tcp://127.0.0.1:50000' \ + --multiprocessing-distributed \ + --world-size=2 \ + --rank=0 \ + --device='npu' \ + --dist-backend='hccl' \ + --epochs=${train_epochs} \ + --batch-size=${batch_size} \ + --amp \ + --device_list=0,1,2,3,4,5,6,7 \ + --FusedSGD \ + --stop-step-num=128 \ + --loss-scale=1024 > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +fps=`grep -a 'FPS' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'` +FPS=`awk 'BEGIN{printf "%.2f\n", '$fps'*16}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration 
sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Epoch: $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 05d674ae75221447b23531275fcdf3e43e79bc96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9B=BD=E5=BA=86?= <3224925783@qq.com> Date: Mon, 25 Apr 2022 10:50:23 +0000 Subject: [PATCH 3/6] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20PyTo?= =?UTF-8?q?rch/dev/cv/image=5Fclassification/ResNet152=5FID0424=5Ffor=5FPy?= =?UTF-8?q?Torch/test/train=5Fperformance=5F16p.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- .../test/train_performance_16p.sh | 213 ------------------ 1 file changed, 213 deletions(-) delete mode 100644 PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh diff --git a/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh b/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh deleted file mode 100644 index ad5b68f395..0000000000 --- a/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh +++ /dev/null @@ -1,213 +0,0 @@ -#!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` -#source env.sh -#集合通信参数,不需要修改 - -export RANK_SIZE=8 -export JOB_ID=10087 -RANK_ID_START=0 - - -# 数据集路径,保持为空,不需要修改 -data_path="" -conf_path="" -server_index="" -fix_node_ip="" -devicesnum="" - -#设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL=3 - -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="ResNet152_ID0424_for_PyTorch" -#训练epoch -train_epochs=1 -#训练batch_size -batch_size=4096 -#训练step -#train_steps=`expr 1281167 / ${batch_size}` -#学习率 -learning_rate=0.5 - -#TF2.X独有,需要模型审视修改 -#export NPU_LOOP_SIZE=${train_steps} - -#维测参数,precision_mode需要模型审视修改 -#precision_mode="allow_mix_precision" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_performance_1P.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - 
precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - elif [[ $para == --fix_node_ip* ]];then - fix_node_ip=`echo ${para#*=}` - elif [[ $para == --devicesnum* ]];then - devicesnum=`echo ${para#*=}` - elif [[ $para == --conf_path* ]];then - conf_path=`echo ${para#*=}` - elif [[ $para == --server_index* ]];then - server_index=`echo ${para#*=}` - fi -done - -one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` -linux_num=`find $conf_path -name "server_*.info" |wc -l` - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi - -export HCCL_IF_IP=$fix_node_ip -export MASTER_ADDR=$one_node_ip -export MASTER_PORT=29688 -export HCCL_WHITELIST_DISABLE=1 -device_num=${#devicesnum} -devices_num=`awk 'BEGIN{printf "%.0f\n",'${device_num}'-1}'` - -NPUS=($(seq 0 $devices_num)) -rank_server=`awk 'BEGIN{printf "%.0f\n",'${device_num}'*'${server_index}'}'` -export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",'${device_num}'*'${linux_num}'}'` - - -#训练开始时间,不需要修改 -start_time=$(date +%s) - -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/.. 
-rank=0 -for((RANK_ID=$RANK_ID_START;RANK_ID<1;RANK_ID++)); -do - #设置环境变量,不需要修改 - ASCEND_DEVICE_ID=$RANK_ID - echo "Device ID: $ASCEND_DEVICE_ID" - - - #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - fi - # 绑核,不需要的绑核的模型删除,需要的模型审视修改 - #let a=RANK_ID*12 - #let b=RANK_ID+1 - #let c=b*12-1 - - #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 - #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path - nohup python3 ${cur_path}/../main.py \ - ${data_path} \ - -a resnet152 \ - --addr=$one_node_ip \ - --seed=49 \ - --workers=$(nproc) \ - --learning-rate=${learning_rate} \ - --mom=0.9 \ - --weight-decay=1.0e-04 \ - --print-freq=1 \ - --dist-url='tcp://127.0.0.1:50000' \ - --multiprocessing-distributed \ - --world-size=2 \ - --rank=0 \ - --device='npu' \ - --dist-backend='hccl' \ - --epochs=${train_epochs} \ - --batch-size=${batch_size} \ - --amp \ - --device_list=0,1,2,3,4,5,6,7 \ - --FusedSGD \ - --stop-step-num=128 \ - --loss-scale=1024 > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & -done -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -fps=`grep -a 'FPS' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'` -FPS=`awk 'BEGIN{printf "%.2f\n", '$fps'*16}'` -#打印,不需要修改 -echo "Final Performance images/sec : $FPS" - -#输出训练精度,需要模型审视修改 -train_accuracy=`grep -a '* Acc@1' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` -#打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" -echo "E2E Training Duration 
sec : $e2e_time" - -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - -##获取性能数据,不需要修改 -#吞吐量 -ActualFPS=${FPS} -#单迭代训练时长 -TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` - -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep Epoch: $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 635ba7ffd5a2b57b78298b3af359c9d8fa1ab2e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9B=BD=E5=BA=86?= <3224925783@qq.com> Date: Mon, 25 Apr 2022 12:23:30 +0000 Subject: [PATCH 4/6] add ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh. 
---
Notes (after the three-dash line, so not part of the commit message):
- Fixed `echo"usage:..."` (no space after echo, so --help tried to run a
  non-existent command) and made the usage text name this 16p script.
- Argument loop now iterates over "$@" instead of unquoted $*.
- Script comments are in English; review notes occupy formerly blank lines so
  the file is still 213 lines long.
- New-file lines 172-179 are kept byte-for-byte (original comments included)
  because PATCH 5/6 and PATCH 6/6 of this series use them as hunk context.

 .../test/train_performance_16p.sh             | 213 ++++++++++++++++++
 1 file changed, 213 insertions(+)
 create mode 100644 PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh

diff --git a/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh b/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh
new file mode 100644
index 0000000000..6debce121d
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh
@@ -0,0 +1,213 @@
+#!/bin/bash
+# 16p performance launcher for ResNet152: parses --key=value options, starts main.py in the background, waits, then summarises FPS/accuracy/loss from its log.
+# Current path; no need to modify
+cur_path=`pwd`
+#source env.sh
+# Collective-communication parameters; no need to modify
+# NOTE(review): RANK_SIZE is 8 in this 16p script, so CaseName below ends in "_8p_perf" - confirm this is intended.
+export RANK_SIZE=8
+export JOB_ID=10087
+RANK_ID_START=0
+
+
+# Dataset path and cluster options; keep empty here, they are filled from the command line
+data_path=""
+conf_path=""
+server_index=""
+fix_node_ip=""
+devicesnum=""
+
+# Set the default log level; no need to modify
+export ASCEND_GLOBAL_LOG_LEVEL=3
+
+# Basic parameters; review and adjust per model
+# Network name, same as the directory name
+Network="ResNet152_ID0424_for_PyTorch"
+# Training epochs
+train_epochs=1
+# Training batch size
+batch_size=4096
+# Training steps
+#train_steps=`expr 1281167 / ${batch_size}`
+# Learning rate
+learning_rate=0.5
+
+# TF2.X only; review and adjust per model
+#export NPU_LOOP_SIZE=${train_steps}
+
+# Debug parameters; precision_mode needs per-model review
+#precision_mode="allow_mix_precision"
+# Maintenance parameters; nothing below needs modification
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+# Help message; no need to modify
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_performance_16p.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump if or not over detection, default is False
+    --data_dump_flag data dump flag, default is False
+    --data_dump_step data dump step, default is 10
+    --profiling if or not profiling for performance debug, default is False
+    --data_path source data of training
+    -h/--help show help message
+    "
+    exit 1
+fi
+
+# Parse --key=value arguments ("$@" keeps every argument intact); no need to modify
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --fix_node_ip* ]];then
+        fix_node_ip=`echo ${para#*=}`
+    elif [[ $para == --devicesnum* ]];then
+        devicesnum=`echo ${para#*=}`
+    elif [[ $para == --conf_path* ]];then
+        conf_path=`echo ${para#*=}`
+    elif [[ $para == --server_index* ]];then
+        server_index=`echo ${para#*=}`
+    fi
+done
+# Derive the master node address and the server count from the server_*.info file names under conf_path
+one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'`
+linux_num=`find $conf_path -name "server_*.info" |wc -l`
+
+# Check that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+
+export HCCL_IF_IP=$fix_node_ip
+export MASTER_ADDR=$one_node_ip
+export MASTER_PORT=29688
+export HCCL_WHITELIST_DISABLE=1
+device_num=${#devicesnum} # NOTE(review): this is the string length of devicesnum, not its numeric value - confirm the expected format
+devices_num=`awk 'BEGIN{printf "%.0f\n",'${device_num}'-1}'`
+
+NPUS=($(seq 0 $devices_num))
+rank_server=`awk 'BEGIN{printf "%.0f\n",'${device_num}'*'${server_index}'}'`
+export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",'${device_num}'*'${linux_num}'}'`
+
+
+# Training start time; no need to modify
+start_time=$(date +%s)
+
+# Enter the training-script directory; review and adjust per model
+cd $cur_path/..
+rank=0
+for((RANK_ID=$RANK_ID_START;RANK_ID<1;RANK_ID++));
+do
+    # Set environment variables; no need to modify
+    ASCEND_DEVICE_ID=$RANK_ID
+    echo "Device ID: $ASCEND_DEVICE_ID"
+
+
+    # Create the per-DeviceID output directory (any existing one is removed first); no need to modify
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    else
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    fi
+    # CPU core binding: delete for models that do not need it; review and adjust for models that do
+    #let a=RANK_ID*12
+    #let b=RANK_ID+1
+    #let c=b*12-1
+    # NOTE(review): --world-size=2 and --device_list=0..7 are hard-coded; linux_num and devicesnum parsed above are not passed to this command - confirm.
+    # Run the training script; the arguments listed on the next line need no modification, the others need per-model review
+    #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path
+    nohup python3 ${cur_path}/../main.py \
+        ${data_path} \
+        -a resnet152 \
+        --addr=$one_node_ip \
+        --seed=49 \
+        --workers=$(nproc) \
+        --learning-rate=${learning_rate} \
+        --mom=0.9 \
+        --weight-decay=1.0e-04 \
+        --print-freq=1 \
+        --dist-url='tcp://127.0.0.1:50000' \
+        --multiprocessing-distributed \
+        --world-size=2 \
+        --rank=${server_index} \
+        --device='npu' \
+        --dist-backend='hccl' \
+        --epochs=${train_epochs} \
+        --batch-size=${batch_size} \
+        --amp \
+        --device_list=0,1,2,3,4,5,6,7 \
+        --FusedSGD \
+        --stop-step-num=128 \
+        --loss-scale=1024 > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#结果打印,不需要修改
+echo "------------------ Final result ------------------"
+#输出性能FPS,需要模型审视修改
+fps=`grep -a 'FPS' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'`
+FPS=`awk 'BEGIN{printf "%.2f\n", '$fps'*16}'`
+#打印,不需要修改
+echo "Final Performance images/sec : $FPS"
+
+# Extract the training accuracy (last '* Acc@1' line of the log); review and adjust per model
+train_accuracy=`grep -a '* Acc@1' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance-monitoring result summary
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+## Collect performance data; no need to modify
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss values from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Epoch: $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Write the key information into ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
-- 
Gitee


From 995181a02460f002b56ab80f950e6cf5f6bcc93b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E5=9B=BD=E5=BA=86?= <3224925783@qq.com>
Date: Mon, 25 Apr 2022 12:49:04 +0000
Subject: [PATCH 5/6] update ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh.

---
Notes (after the three-dash line, so not part of the commit message):
- Drops the "*16" scaling step: FPS becomes the last whitespace-separated
  field of the last 'FPS' line in the training log, used as-is.
- Context and removed lines are reproduced verbatim (original comments
  included) because they must match the file this hunk is applied to.

 .../ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh b/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh
index 6debce121d..c02d307296 100644
--- a/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh
+++ b/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh
@@ -172,8 +172,7 @@ e2e_time=$(( $end_time - $start_time ))
 #结果打印,不需要修改
 echo "------------------ Final result ------------------"
 #输出性能FPS,需要模型审视修改
-fps=`grep -a 'FPS' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'`
-FPS=`awk 'BEGIN{printf "%.2f\n", '$fps'*16}'`
+FPS=`grep -a 'FPS' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'`
 #打印,不需要修改
 echo "Final Performance images/sec : $FPS"
 
-- 
Gitee


From c6e681ce4dc2abbf0a8bed49c62c13e15ba52ee9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E5=9B=BD=E5=BA=86?= <3224925783@qq.com>
Date: Tue, 26 Apr 2022 01:16:23 +0000
Subject: [PATCH 6/6] update ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh.

---
Notes (after the three-dash line, so not part of the commit message):
- Re-introduces the intermediate `fps` value and reports FPS = fps * 2.
- The value is handed to awk with `-v` instead of being spliced into the awk
  program text, so an empty `fps` (no 'FPS' line in the log) yields 0.00
  rather than an awk syntax error. Output for a numeric `fps` is unchanged.
- NOTE(review): the factor 2 is hard-coded; presumably it is the number of
  servers - confirm against the launch configuration.
- Context and removed lines are reproduced verbatim (original comments
  included) because they must match the file this hunk is applied to.

 .../ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh b/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh
index c02d307296..e713468687 100644
--- a/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh
+++ b/PyTorch/dev/cv/image_classification/ResNet152_ID0424_for_PyTorch/test/train_performance_16p.sh
@@ -172,7 +172,8 @@ e2e_time=$(( $end_time - $start_time ))
 #结果打印,不需要修改
 echo "------------------ Final result ------------------"
 #输出性能FPS,需要模型审视修改
-FPS=`grep -a 'FPS' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'`
+fps=`grep -a 'FPS' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|awk 'END {print}'`
+FPS=`awk -v fps="${fps:-0}" 'BEGIN{printf "%.2f\n", fps*2}'`
 #打印,不需要修改
 echo "Final Performance images/sec : $FPS"
 
-- 
Gitee