diff --git a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh index b3d8d0e87c2e8ae15c584ef62a571776a4525931..73da94968b8204ed880971525603e959f749d0aa 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_1p.sh @@ -1,22 +1,22 @@ #!/bin/bash -################基础配置参数,需要模型审视修改################## -# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE -# 网络名称,同目录名称 +################鍩虹閰嶇疆鍙傛暟锛岄渶瑕佹ā鍨嬪瑙嗕慨鏀################## +# 蹇呴夊瓧娈(蹇呴』鍦ㄦ澶勫畾涔夌殑鍙傛暟): Network batch_size RANK_SIZE +# 缃戠粶鍚嶇О锛屽悓鐩綍鍚嶇О Network="ResNet50_ID4149_for_PyTorch" -# 训练batch_size +# 璁粌batch_size batch_size=512 -# 训练使用的npu卡数 +# 璁粌浣跨敤鐨刵pu鍗℃暟 export RANK_SIZE=1 -# 数据集路径,保持为空,不需要修改 +# 鏁版嵁闆嗚矾寰,淇濇寔涓虹┖,涓嶉渶瑕佷慨鏀 data_path="" -# 训练epoch 90 +# 璁粌epoch 90 train_epochs=1 -# 加载数据进程数 -workers=64 +# 鍔犺浇鏁版嵁杩涚▼鏁 +workers=24 device_id=0 -# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +# 鍙傛暟鏍¢獙锛宒ata_path涓哄繀浼犲弬鏁帮紝鍏朵粬鍙傛暟鐨勫鍒犵敱妯″瀷鑷韩鍐冲畾锛涙澶勬柊澧炲弬鏁伴渶鍦ㄤ笂闈㈡湁瀹氫箟骞惰祴鍊 for para in $* do if [[ $para == --data_path* ]];then @@ -28,14 +28,14 @@ do fi done -# 校验是否传入data_path,不需要修改 +# 鏍¢獙鏄惁浼犲叆data_path,涓嶉渶瑕佷慨鏀 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi -###############指定训练脚本执行路径############### -# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +###############鎸囧畾璁粌鑴氭湰鎵ц璺緞############### +# cd鍒颁笌test鏂囦欢澶瑰悓灞傜骇鐩綍涓嬫墽琛岃剼鏈紝鎻愰珮鍏煎鎬э紱test_path_dir涓哄寘鍚玹est鏂囦欢澶圭殑璺緞 cur_path=`pwd` cur_path_last_dirname=${cur_path##*/} if [ x"${cur_path_last_dirname}" == x"test" ];then @@ -46,10 +46,10 @@ else test_path_dir=${cur_path}/test fi -# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +# 鏍¢獙鏄惁鎸囧畾浜哾evice_id,鍒嗗姩鎬佸垎閰峝evice_id涓庢墜鍔ㄦ寚瀹歞evice_id,姝ゅ涓嶉渶瑕佷慨鏀 if [ $ASCEND_DEVICE_ID ];then echo "device id is ${ASCEND_DEVICE_ID}" - # 平台运行软链数据集路径 + # 骞冲彴杩愯杞摼鏁版嵁闆嗚矾寰 elif [ ${device_id} ];then export ASCEND_DEVICE_ID=${device_id} echo "device id is ${ASCEND_DEVICE_ID}" @@ -58,7 +58,7 @@ else exit 1 fi -#################创建日志输出目录,不需要修改################# +#################鍒涘缓鏃ュ織杈撳嚭鐩綍锛屼笉闇瑕佷慨鏀################# if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID @@ -67,10 +67,10 @@ else fi -#################启动训练脚本################# -# 训练开始时间,不需要修改 +#################鍚姩璁粌鑴氭湰################# +# 璁粌寮濮嬫椂闂达紝涓嶉渶瑕佷慨鏀 start_time=$(date +%s) -# 非平台场景时source 环境变量 +# 闈炲钩鍙板満鏅椂source 鐜鍙橀噺 check_etp_flag=`env | grep etp_running_flag` etp_flag=`echo ${check_etp_flag#*=}` if [ x"${etp_flag}" != x"true" ];then @@ -78,56 +78,56 @@ if [ x"${etp_flag}" != x"true" ];then fi nohup python3.7 main.py \ - --data $data_path \ - --amp \ - --world-size 1 \ - --seed 60 \ - -a resnet50 \ - -j $workers \ - -b $batch_size \ - --lr 0.2 \ - --epochs $train_epochs \ - --gpu $device_id > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --data $data_path \ + --amp \ + --world-size 1 \ + --seed 60 \ + -a resnet50 \ + -j $workers \ + -b $batch_size \ + --lr 0.2 \ + --epochs $train_epochs \ + --gpu $device_id > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & wait -##################获取训练数据################ -# 训练结束时间,不需要修改 +##################鑾峰彇璁粌鏁版嵁################ +# 璁粌缁撴潫鏃堕棿锛屼笉闇瑕佷慨鏀 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) -# 结果打印,不需要修改 +# 缁撴灉鎵撳嵃锛屼笉闇瑕佷慨鏀 echo "------------------ Final result ------------------" -# 输出性能FPS,需要模型审视修改 +# 杈撳嚭鎬ц兘FPS锛岄渶瑕佹ā鍨嬪瑙嗕慨鏀 step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $1}' | tail -n 100 | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` FPS=`echo "${batch_size} / ${step_time}"|bc` -# 打印,不需要修改 +# 鎵撳嵃锛屼笉闇瑕佷慨鏀 echo "Final Performance images/sec : $FPS" CompileTime=`grep step_time ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| head -2 |awk -F "step_time = " '{print $2}' | awk '{sum+=$1} END {print"",sum}' |sed s/[[:space:]]//g` -# 输出训练精度,需要模型审视修改 +# 杈撳嚭璁粌绮惧害,闇瑕佹ā鍨嬪瑙嗕慨鏀 train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1 " '{print $NF}'|awk -F " " '{print $1}'` -# 打印,不需要修改 +# 鎵撳嵃锛屼笉闇瑕佷慨鏀 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" -# 训练用例信息,不需要修改 +# 璁粌鐢ㄤ緥淇℃伅锛屼笉闇瑕佷慨鏀 BatchSize=${batch_size} DeviceType=`uname -m` CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' -# 吞吐量 +# 鍚炲悙閲 ActualFPS=${FPS} -# 单迭代训练时长 +# 鍗曡凯浠h缁冩椂闀 TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` -# 从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +# 浠巘rain_$ASCEND_DEVICE_ID.log鎻愬彇Loss鍒皌rain_${CaseName}_loss.txt涓紝闇瑕佹牴鎹ā鍨嬪瑙 grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt -# 最后一个迭代loss值,不需要修改 +# 鏈鍚庝竴涓凯浠oss鍊硷紝涓嶉渶瑕佷慨鏀 ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` -# 关键信息打印到${CaseName}.log中,不需要修改 +# 鍏抽敭淇℃伅鎵撳嵃鍒${CaseName}.log涓紝涓嶉渶瑕佷慨鏀 echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_2p.sh b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_2p.sh new file mode 100644 index 0000000000000000000000000000000000000000..9cc73a28df082b4d024f837e8bad7c2689507fb3 --- /dev/null +++ b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_2p.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +################鍩虹閰嶇疆鍙傛暟锛岄渶瑕佹ā鍨嬪瑙嗕慨鏀################## +# 蹇呴夊瓧娈(蹇呴』鍦ㄦ澶勫畾涔夌殑鍙傛暟): Network batch_size RANK_SIZE +# 缃戠粶鍚嶇О锛屽悓鐩綍鍚嶇О +Network="ResNet50_ID4149_for_PyTorch" +# 璁粌batch_size +batch_size=1024 +# 璁粌浣跨敤鐨刵pu鍗℃暟 +export RANK_SIZE=2 +# 鏁版嵁闆嗚矾寰,淇濇寔涓虹┖,涓嶉渶瑕佷慨鏀 +data_path="" + +# 璁粌epoch 90 +train_epochs=2 +# 鍔犺浇鏁版嵁杩涚▼鏁 +workers=48 +device_id=0 +# 鍙傛暟鏍¢獙锛宒ata_path涓哄繀浼犲弬鏁帮紝鍏朵粬鍙傛暟鐨勫鍒犵敱妯″瀷鑷韩鍐冲畾锛涙澶勬柊澧炲弬鏁伴渶鍦ㄤ笂闈㈡湁瀹氫箟骞惰祴鍊 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` + elif [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + fi +done + +# 鏍¢獙鏄惁浼犲叆data_path,涓嶉渶瑕佷慨鏀 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +###############鎸囧畾璁粌鑴氭湰鎵ц璺緞############### +# cd鍒颁笌test鏂囦欢澶瑰悓灞傜骇鐩綍涓嬫墽琛岃剼鏈紝鎻愰珮鍏煎鎬э紱test_path_dir涓哄寘鍚玹est鏂囦欢澶圭殑璺緞 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +# 鏍¢獙鏄惁鎸囧畾浜哾evice_id,鍒嗗姩鎬佸垎閰峝evice_id涓庢墜鍔ㄦ寚瀹歞evice_id,姝ゅ涓嶉渶瑕佷慨鏀 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" + # 骞冲彴杩愯杞摼鏁版嵁闆嗚矾寰 +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + +#################鍒涘缓鏃ュ織杈撳嚭鐩綍锛屼笉闇瑕佷慨鏀################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################鍚姩璁粌鑴氭湰################# +# 璁粌寮濮嬫椂闂达紝涓嶉渶瑕佷慨鏀 +start_time=$(date +%s) +# 闈炲钩鍙板満鏅椂source 鐜鍙橀噺 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +KERNEL_NUM=$(($(nproc)/2)) +for((RANK_ID=0;RANK_ID<2;RANK_ID++)); +do + export RANK_SIZE=2 + export RANK_ID=$RANK_ID + if [ $(uname -m) = 'aarch64' ] + then + PID_START=$((KERNEL_NUM * RANK_ID)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + nohup taskset -c $PID_START-$PID_END python3.7 main.py \ + --data $data_path \ + --amp \ + --world-size 1 \ + --seed 60 \ + -a resnet50 \ + -j $workers \ + -b $batch_size \ + --lr 1.6 \ + --epochs $train_epochs \ + --gpu ${RANK_ID} \ + --rank 0 \ + --multiprocessing-distributed > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + else + nohup python3.7 main.py \ + --data $data_path \ + --amp \ + --world-size 1 \ + --seed 60 \ + -a resnet50 \ + -j $workers \ + -b $batch_size \ + --lr 1.6 \ + --epochs $train_epochs \ + --gpu ${RANK_ID} \ + --rank 0 \ + --multiprocessing-distributed > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + fi +done +wait + + +##################鑾峰彇璁粌鏁版嵁################ +# 璁粌缁撴潫鏃堕棿锛屼笉闇瑕佷慨鏀 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +# 缁撴灉鎵撳嵃锛屼笉闇瑕佷慨鏀 +echo "------------------ Final result ------------------" +# 杈撳嚭鎬ц兘FPS锛岄渶瑕佹ā鍨嬪瑙嗕慨鏀 +step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $1}' | tail -n 100 | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` +FPS=`echo "${batch_size} / ${step_time}"|bc` +# 鎵撳嵃锛屼笉闇瑕佷慨鏀 +echo "Final Performance images/sec : $FPS" + +CompileTime=`grep step_time ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| head -2 |awk -F "step_time = " '{print $2}' | awk '{sum+=$1} END {print"",sum}' |sed s/[[:space:]]//g` + +# 杈撳嚭璁粌绮惧害,闇瑕佹ā鍨嬪瑙嗕慨鏀 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1 " '{print $NF}'|awk -F " " '{print $1}'` +# 鎵撳嵃锛屼笉闇瑕佷慨鏀 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +# 璁粌鐢ㄤ緥淇℃伅锛屼笉闇瑕佷慨鏀 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +# 鍚炲悙閲 +ActualFPS=${FPS} +# 鍗曡凯浠h缁冩椂闀 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +# 浠巘rain_$ASCEND_DEVICE_ID.log鎻愬彇Loss鍒皌rain_${CaseName}_loss.txt涓紝闇瑕佹牴鎹ā鍨嬪瑙 +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +# 鏈鍚庝竴涓凯浠oss鍊硷紝涓嶉渶瑕佷慨鏀 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +# 鍏抽敭淇℃伅鎵撳嵃鍒${CaseName}.log涓紝涓嶉渶瑕佷慨鏀 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CompileTime = ${CompileTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_4p.sh b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_4p.sh new file mode 100644 index 0000000000000000000000000000000000000000..8893c4be3a710b7b73fbf017f9ec12d10db485ed --- /dev/null +++ b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_4p.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +################鍩虹閰嶇疆鍙傛暟锛岄渶瑕佹ā鍨嬪瑙嗕慨鏀################## +# 蹇呴夊瓧娈(蹇呴』鍦ㄦ澶勫畾涔夌殑鍙傛暟): Network batch_size RANK_SIZE +# 缃戠粶鍚嶇О锛屽悓鐩綍鍚嶇О +Network="ResNet50_ID4149_for_PyTorch" +# 璁粌batch_size +batch_size=2048 +# 璁粌浣跨敤鐨刵pu鍗℃暟 +export RANK_SIZE=4 +# 鏁版嵁闆嗚矾寰,淇濇寔涓虹┖,涓嶉渶瑕佷慨鏀 +data_path="" + +# 璁粌epoch 90 +train_epochs=2 +# 鍔犺浇鏁版嵁杩涚▼鏁 +workers=96 +device_id=0 +# 鍙傛暟鏍¢獙锛宒ata_path涓哄繀浼犲弬鏁帮紝鍏朵粬鍙傛暟鐨勫鍒犵敱妯″瀷鑷韩鍐冲畾锛涙澶勬柊澧炲弬鏁伴渶鍦ㄤ笂闈㈡湁瀹氫箟骞惰祴鍊 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` + elif [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + fi +done + +# 鏍¢獙鏄惁浼犲叆data_path,涓嶉渶瑕佷慨鏀 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +###############鎸囧畾璁粌鑴氭湰鎵ц璺緞############### +# cd鍒颁笌test鏂囦欢澶瑰悓灞傜骇鐩綍涓嬫墽琛岃剼鏈紝鎻愰珮鍏煎鎬э紱test_path_dir涓哄寘鍚玹est鏂囦欢澶圭殑璺緞 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +# 鏍¢獙鏄惁鎸囧畾浜哾evice_id,鍒嗗姩鎬佸垎閰峝evice_id涓庢墜鍔ㄦ寚瀹歞evice_id,姝ゅ涓嶉渶瑕佷慨鏀 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" + # 骞冲彴杩愯杞摼鏁版嵁闆嗚矾寰 +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + +#################鍒涘缓鏃ュ織杈撳嚭鐩綍锛屼笉闇瑕佷慨鏀################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################鍚姩璁粌鑴氭湰################# +# 璁粌寮濮嬫椂闂达紝涓嶉渶瑕佷慨鏀 +start_time=$(date +%s) +# 闈炲钩鍙板満鏅椂source 鐜鍙橀噺 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +KERNEL_NUM=$(($(nproc)/4)) +for((RANK_ID=0;RANK_ID<4;RANK_ID++)); +do + export RANK_SIZE=4 + export RANK_ID=$RANK_ID + if [ $(uname -m) = 'aarch64' ] + then + PID_START=$((KERNEL_NUM * RANK_ID)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + nohup taskset -c $PID_START-$PID_END python3.7 main.py \ + --data $data_path \ + --amp \ + --world-size 1 \ + --seed 60 \ + -a resnet50 \ + -j $workers \ + -b $batch_size \ + --lr 1.6 \ + --epochs $train_epochs \ + --gpu ${RANK_ID} \ + --rank 0 \ + --multiprocessing-distributed > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + else + nohup python3.7 main.py \ + --data $data_path \ + --amp \ + --world-size 1 \ + --seed 60 \ + -a resnet50 \ + -j $workers \ + -b $batch_size \ + --lr 1.6 \ + --epochs $train_epochs \ + --gpu ${RANK_ID} \ + --rank 0 \ + --multiprocessing-distributed > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + fi +done +wait + + +##################鑾峰彇璁粌鏁版嵁################ +# 璁粌缁撴潫鏃堕棿锛屼笉闇瑕佷慨鏀 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +# 缁撴灉鎵撳嵃锛屼笉闇瑕佷慨鏀 +echo "------------------ Final result ------------------" +# 杈撳嚭鎬ц兘FPS锛岄渶瑕佹ā鍨嬪瑙嗕慨鏀 +step_time=`grep "Epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F "Time " '{print $2}' | awk -F " " '{print $1}' | tail -n 100 | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` +FPS=`echo "${batch_size} / ${step_time}"|bc` +# 鎵撳嵃锛屼笉闇瑕佷慨鏀 +echo "Final Performance images/sec : $FPS" + +CompileTime=`grep step_time ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| head -2 |awk -F "step_time = " '{print $2}' | awk '{sum+=$1} END {print"",sum}' |sed s/[[:space:]]//g` + +# 杈撳嚭璁粌绮惧害,闇瑕佹ā鍨嬪瑙嗕慨鏀 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1 " '{print $NF}'|awk -F " " '{print $1}'` +# 鎵撳嵃锛屼笉闇瑕佷慨鏀 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +# 璁粌鐢ㄤ緥淇℃伅锛屼笉闇瑕佷慨鏀 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +# 鍚炲悙閲 +ActualFPS=${FPS} +# 鍗曡凯浠h缁冩椂闀 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +# 浠巘rain_$ASCEND_DEVICE_ID.log鎻愬彇Loss鍒皌rain_${CaseName}_loss.txt涓紝闇瑕佹牴鎹ā鍨嬪瑙 +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +# 鏈鍚庝竴涓凯浠oss鍊硷紝涓嶉渶瑕佷慨鏀 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +# 鍏抽敭淇℃伅鎵撳嵃鍒${CaseName}.log涓紝涓嶉渶瑕佷慨鏀 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CompileTime = ${CompileTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_8p.sh index d21159916e2bb9cfa7c7ad36a5234f1a48ac621c..0f259328c8890b63bac05668ee50b5d5de78a1c7 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_ID4149_for_PyTorch/test/train_performance_8p.sh @@ -14,7 +14,7 @@ data_path="" # 璁粌epoch 90 train_epochs=2 # 鍔犺浇鏁版嵁杩涚▼鏁 -workers=256 +workers=192 device_id=0 # 鍙傛暟鏍¢獙锛宒ata_path涓哄繀浼犲弬鏁帮紝鍏朵粬鍙傛暟鐨勫鍒犵敱妯″瀷鑷韩鍐冲畾锛涙澶勬柊澧炲弬鏁伴渶鍦ㄤ笂闈㈡湁瀹氫箟骞惰祴鍊 for para in $*