From 6a5786c699bd3bc13a8c1eeb6580be1e61d72a52 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Thu, 14 Nov 2024 07:18:38 +0000 Subject: [PATCH] 11/14/2024 07:18:38 sync TensorFlow file --- .../inceptionv3/tensorflow/get_num_devices.sh | 6 +- .../run_train_distributed_imagenette.sh | 67 +++++++++++-------- .../run_train_inception3_imagenette.sh | 22 +++--- ...un_train_inception3_multigpu_imagenette.sh | 23 ++++--- .../resnet50/tensorflow/benchmark_cnn.py | 5 ++ .../run_train_distributed_imagenette.sh | 6 +- .../vgg/tensorflow/get_num_devices.sh | 9 +-- .../run_train_distributed_imagenette.sh | 65 ++++++++++-------- .../tensorflow/run_train_vgg16_imagenette.sh | 26 ++++--- .../run_train_vgg16_multigpu_imagenette.sh | 26 ++++--- .../bert/tensorflow/base/README.md | 24 +++++-- .../bert/tensorflow/base/init_tf.sh | 5 +- .../bert/tensorflow/base/optimization.py | 2 +- .../bert/tensorflow/base/run_1card_FPS.sh | 25 +++---- .../tensorflow/base/run_multi_card_FPS.sh | 16 ++--- .../bert/tensorflow/base/run_pretraining.py | 2 +- 16 files changed, 187 insertions(+), 142 deletions(-) diff --git a/cv/classification/inceptionv3/tensorflow/get_num_devices.sh b/cv/classification/inceptionv3/tensorflow/get_num_devices.sh index 7c6036a71..a9c370895 100644 --- a/cv/classification/inceptionv3/tensorflow/get_num_devices.sh +++ b/cv/classification/inceptionv3/tensorflow/get_num_devices.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -19,8 +19,8 @@ if [ -n "$devices" ]; then _devices=(${devices//,/ }) num_devices=${#_devices[@]} else - num_devices=8 - export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + num_devices=2 + export CUDA_VISIBLE_DEVICES=0,1 echo "Not found CUDA_VISIBLE_DEVICES, set nproc_per_node = ${num_devices}" fi export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices} diff --git a/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh index 1787fb081..1abf5d2c0 100644 --- a/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh +++ b/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh @@ -25,7 +25,8 @@ export TF_CPP_MIN_LOG_LEVEL=1 i=0 model="alexnet" -for arg in "$@"; do +for arg in "$@" +do if [ $i -eq 0 ]; then model=$arg let i++ @@ -40,11 +41,12 @@ for arg in "$@"; do done echo "## Training model: ${model}" + : ${BATCH_SIZE:=16} # TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/${model}_distributed" DATA_DIR=./imagenette @@ -56,7 +58,8 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi @@ -66,7 +69,7 @@ check_status() { # Prepare devices ################################################# devices=$CUDA_VISIBLE_DEVICES -if [ -n "$devices" ]; then +if [ -n "$devices" ]; then devices=(${devices//,/ }) num_devices=${#devices[@]} else @@ -86,7 +89,8 @@ fi ################################################# worker_hosts="" i=0 -for device in "${devices[@]}"; do +for device in "${devices[@]}"; +do if [ "$i" == "0" ]; then let i++ continue @@ -102,13 +106,13 @@ echo "worker_hosts: ${worker_hosts}" ################################################# trap ctrl_c INT function ctrl_c() { - echo "*** Trapped CTRL-C, killing process running background" - for pid in "${pid_list[@]}"; do - echo "Killing pid ${pid}" - kill ${pid} - wait ${pid} - done - exit 0 + echo "*** Trapped CTRL-C, killing process running background" + for pid in "${pid_list[@]}"; do + echo "Killing pid ${pid}" + kill ${pid} + wait ${pid} + done + exit 0 } ################################################# @@ -116,9 +120,10 @@ function ctrl_c() { ################################################# pid_list=() -last_device=$(expr ${num_devices} - 1) +last_device=`expr ${num_devices} - 1` i=0 -for device in "${devices[@]}"; do +for device in "${devices[@]}"; +do job_name="worker" if [ "${i}" == "0" ]; then job_name="ps" @@ -127,26 +132,30 @@ for device in "${devices[@]}"; do if [ ${i} -le 1 ]; then task_index=0 else - task_index=$(expr ${i} - 1) + task_index=`expr ${i} - 1` fi if [ "${i}" == "${last_device}" ]; then - CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW \ - --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \ - --batch_size=${BATCH_SIZE} --model=${model} \ - --variable_update=distributed_replicated \ - --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \ - --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log - [[ ${PIPESTATUS[0]} == 0 ]] || exit + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model} \ + --variable_update=distributed_replicated \ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit echo "Distributed training PID ($!) on device ${device} where job name = ${job_name}" else - CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW \ - --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \ - --batch_size=${BATCH_SIZE} --model=${model} \ - --variable_update=distributed_replicated --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \ - --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model}\ + --variable_update=distributed_replicated\ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & echo "Distributed training PID ($!) on device ${device} where job name = ${job_name} and task_index = ${task_index}" fi let i++ diff --git a/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh b/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh index 12337021f..9c44c920f 100644 --- a/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh +++ b/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=rmsprop -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/inception3" DATA_DIR=./imagenette @@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@"; do +for arg in "$@" +do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -51,10 +53,14 @@ for arg in "$@"; do let i++ done -python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW --batch_size=${BATCH_SIZE} \ - --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=1 --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ - --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching --stop_at_top_1_accuracy=0.9 --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log -[[ ${PIPESTATUS[0]} == 0 ]] || exit +python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=1\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit exit ${EXIT_STATUS} diff --git a/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh b/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh index c48d0cbfd..6a8938ec5 100644 --- a/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh +++ b/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=rmsprop -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/inception3_multigpu" DATA_DIR=./imagenette @@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@"; do +for arg in "$@" +do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -53,11 +55,14 @@ done source ./get_num_devices.sh -UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW --batch_size=${BATCH_SIZE} \ - --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES} \ - --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ - --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log -[[ ${PIPESTATUS[0]} == 0 ]] || exit +UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit exit ${EXIT_STATUS} diff --git a/cv/classification/resnet50/tensorflow/benchmark_cnn.py b/cv/classification/resnet50/tensorflow/benchmark_cnn.py index 6f65ea69b..7f6c1db4c 100644 --- a/cv/classification/resnet50/tensorflow/benchmark_cnn.py +++ b/cv/classification/resnet50/tensorflow/benchmark_cnn.py @@ -31,9 +31,11 @@ import re import threading import time import traceback +import sys from absl import flags as absl_flags import numpy as np +import math import six from six.moves import xrange # pylint: disable=redefined-builtin @@ -881,6 +883,9 @@ def benchmark_one_step(sess, lossval = results['average_loss'] else: lossval = 0. + if not math.isfinite(lossval): + print("Loss is {}, stopping training".format(lossval)) + sys.exit(1) if image_producer is not None: image_producer.notify_image_consumption() train_time = time.time() - start_time diff --git a/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh index 98639e5c9..f4f48223c 100644 --- a/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh +++ b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -14,7 +14,6 @@ # License for the specific language governing permissions and limitations # under the License. - bash ./get_imagenette.sh export TF_CUDNN_USE_AUTOTUNE=1 @@ -43,7 +42,7 @@ done echo "## Training model: ${model}" -: ${BATCH_SIZE:=32} +: ${BATCH_SIZE:=16} # TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=momentum @@ -111,6 +110,7 @@ function ctrl_c() { for pid in "${pid_list[@]}"; do echo "Killing pid ${pid}" kill ${pid} + wait ${pid} done exit 0 } diff --git a/cv/classification/vgg/tensorflow/get_num_devices.sh b/cv/classification/vgg/tensorflow/get_num_devices.sh index 14d6c0a5a..a9c370895 100644 --- a/cv/classification/vgg/tensorflow/get_num_devices.sh +++ b/cv/classification/vgg/tensorflow/get_num_devices.sh @@ -1,4 +1,5 @@ -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +#!/bin/bash +# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -18,8 +19,8 @@ if [ -n "$devices" ]; then _devices=(${devices//,/ }) num_devices=${#_devices[@]} else - num_devices=8 - export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + num_devices=2 + export CUDA_VISIBLE_DEVICES=0,1 echo "Not found CUDA_VISIBLE_DEVICES, set nproc_per_node = ${num_devices}" fi -export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices} \ No newline at end of file +export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices} diff --git a/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh index 36b41ee85..c7642b986 100644 --- a/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh +++ b/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh @@ -25,7 +25,8 @@ export TF_CPP_MIN_LOG_LEVEL=1 i=0 model="alexnet" -for arg in "$@"; do +for arg in "$@" +do if [ $i -eq 0 ]; then model=$arg let i++ @@ -40,11 +41,12 @@ for arg in "$@"; do done echo "## Training model: ${model}" + : ${BATCH_SIZE:=16} # TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/${model}_distributed" DATA_DIR=./imagenette @@ -56,7 +58,8 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi @@ -66,7 +69,7 @@ check_status() { # Prepare devices ################################################# devices=$CUDA_VISIBLE_DEVICES -if [ -n "$devices" ]; then +if [ -n "$devices" ]; then devices=(${devices//,/ }) num_devices=${#devices[@]} else @@ -86,7 +89,8 @@ fi ################################################# worker_hosts="" i=0 -for device in "${devices[@]}"; do +for device in "${devices[@]}"; +do if [ "$i" == "0" ]; then let i++ continue @@ -102,12 +106,12 @@ echo "worker_hosts: ${worker_hosts}" ################################################# trap ctrl_c INT function ctrl_c() { - echo "*** Trapped CTRL-C, killing process running background" - for pid in "${pid_list[@]}"; do - echo "Killing pid ${pid}" - kill ${pid} - done - exit 0 + echo "*** Trapped CTRL-C, killing process running background" + for pid in "${pid_list[@]}"; do + echo "Killing pid ${pid}" + kill ${pid} + done + exit 0 } ################################################# @@ -115,9 +119,10 @@ function ctrl_c() { ################################################# pid_list=() -last_device=$(expr ${num_devices} - 1) +last_device=`expr ${num_devices} - 1` i=0 -for device in "${devices[@]}"; do +for device in "${devices[@]}"; +do job_name="worker" if [ "${i}" == "0" ]; then job_name="ps" @@ -126,26 +131,30 @@ for device in "${devices[@]}"; do if [ ${i} -le 1 ]; then task_index=0 else - task_index=$(expr ${i} - 1) + task_index=`expr ${i} - 1` fi if [ "${i}" == "${last_device}" ]; then - CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW \ - --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \ - --batch_size=${BATCH_SIZE} --model=${model} \ - --variable_update=distributed_replicated \ - --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \ - --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log - [[ ${PIPESTATUS[0]} == 0 ]] || exit + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model} \ + --variable_update=distributed_replicated \ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit echo "Distributed training PID ($!) on device ${device} where job name = ${job_name}" else - CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW \ - --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \ - --batch_size=${BATCH_SIZE} --model=${model} \ - --variable_update=distributed_replicated --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \ - --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model}\ + --variable_update=distributed_replicated\ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & echo "Distributed training PID ($!) on device ${device} where job name = ${job_name} and task_index = ${task_index}" fi let i++ diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh index aeea001bc..343dbe372 100644 --- a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh +++ b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: adam, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/vgg16" DATA_DIR=./imagenette @@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@"; do +for arg in "$@" +do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -51,13 +53,15 @@ for arg in "$@"; do let i++ done -python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW --batch_size=${BATCH_SIZE} \ - --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1 \ - --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ - --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 \ - --datasets_use_caching --stop_at_top_1_accuracy=0.9 --num_intra_threads=1 \ - --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log -[[ ${PIPESTATUS[0]} == 0 ]] || exit +python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + exit ${EXIT_STATUS} diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh index 4de70ff72..d4c2bcd2a 100644 --- a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh +++ b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: adam, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/vgg16_multigpu" DATA_DIR=./imagenette @@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@"; do +for arg in "$@" +do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -53,13 +55,15 @@ done source ./get_num_devices.sh -UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW --batch_size=${BATCH_SIZE} \ - --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES} \ - --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ - --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching \ - --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu --num_intra_threads=1 \ - --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log -[[ ${PIPESTATUS[0]} == 0 ]] || exit +UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + exit ${EXIT_STATUS} diff --git a/nlp/language_model/bert/tensorflow/base/README.md b/nlp/language_model/bert/tensorflow/base/README.md index a77ae4cec..42ae860eb 100644 --- a/nlp/language_model/bert/tensorflow/base/README.md +++ b/nlp/language_model/bert/tensorflow/base/README.md @@ -1,4 +1,4 @@ -# BERT Pretraining + # BERT Pretraining ## Model description BERT, or Bidirectional Encoder Representations from Transformers, improves upon standard Transformers by removing the unidirectionality constraint by using a masked language model (MLM) pre-training objective. The masked language model randomly masks some of the tokens from the input, and the objective is to predict the original vocabulary id of the masked word based only on its context. Unlike left-to-right language model pre-training, the MLM objective enables the representation to fuse the left and the right context, which allows us to pre-train a deep bidirectional Transformer. In addition to the masked language model, BERT uses a next sentence prediction task that jointly pre-trains text-pair representations. @@ -9,6 +9,12 @@ BERT, or Bidirectional Encoder Representations from Transformers, improves upon ```shell bash init_tf.sh +wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.7.tar.gz +tar xf openmpi-4.0.7.tar.gz +cd openmpi-4.0.7/ +./configure --prefix=/usr/local/bin --with-orte +make -j4 && make install +export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH ``` ### Download datasets @@ -34,16 +40,26 @@ tips: you can git clone this repo in other place ,we need the bert_pretrain_tf_r ### Training on single card ```shell -bash run_1card_FPS.sh +bash run_1card_FPS.sh --input_files_dir=/path/to/bert_pretrain_tf_records/train_data \ + --init_checkpoint=/path/to/bert_pretrain_ckpt_tf/model.ckpt-28252 \ + --eval_files_dir=/path/to/bert_pretrain_tf_records/eval_data \ + --train_batch_size=6 \ + --bert_config_file=/path/to/bert_pretrain_ckpt_tf/bert_config.json ``` ### Training on mutil-cards ```shell -bash run_multi_card_FPS.sh +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export IX_NUM_CUDA_VISIBLE_DEVICES=8 +bash run_multi_card_FPS.sh --input_files_dir=/path/to/bert_pretrain_tf_records/train_data \ + --init_checkpoint=/path/to/bert_pretrain_ckpt_tf/model.ckpt-28252 \ + --eval_files_dir=/path/to/bert_pretrain_tf_records/eval_data \ + --train_batch_size=6 \ + --bert_config_file=/path/to/bert_pretrain_ckpt_tf/bert_config.json ``` ## Result | | acc | fps | | --- | --- | --- | -| multi_card | 0.424126 | 0.267241| +| multi_card | 0.424126 | 0.267241| \ No newline at end of file diff --git a/nlp/language_model/bert/tensorflow/base/init_tf.sh b/nlp/language_model/bert/tensorflow/base/init_tf.sh index 08f27c35b..79e2ae63b 100644 --- a/nlp/language_model/bert/tensorflow/base/init_tf.sh +++ b/nlp/language_model/bert/tensorflow/base/init_tf.sh @@ -13,7 +13,8 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. - +pip3 uninstall -y protobuf +pip3 install "protobuf<4.0.0" pip3 install git+https://github.com/mlperf/logging.git pip3 install git+https://github.com/NVIDIA/dllogger.git -pip3 install pandas==0.24 \ No newline at end of file +pip3 install pandas==1.3.5 \ No newline at end of file diff --git a/nlp/language_model/bert/tensorflow/base/optimization.py b/nlp/language_model/bert/tensorflow/base/optimization.py index f2e747c9c..f7aa9f491 100644 --- a/nlp/language_model/bert/tensorflow/base/optimization.py +++ b/nlp/language_model/bert/tensorflow/base/optimization.py @@ -84,7 +84,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, manual_fp if hvd and (num_accumulation_steps == 1 or (not allreduce_post_accumulation)): optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True) if use_fp16: - loss_scaler = tf.train.experimental.DynamicLossScale( + loss_scaler = tf.compat.v1.mixed_precision.DynamicLossScale( initial_loss_scale=init_loss_scale, increment_period=1000, multiplier=2.0) optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scaler) loss_scale_value = tf.identity(loss_scaler(), name="loss_scale") diff --git a/nlp/language_model/bert/tensorflow/base/run_1card_FPS.sh b/nlp/language_model/bert/tensorflow/base/run_1card_FPS.sh index 64b9094fb..683dc6696 100644 --- a/nlp/language_model/bert/tensorflow/base/run_1card_FPS.sh +++ b/nlp/language_model/bert/tensorflow/base/run_1card_FPS.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -14,7 +14,6 @@ # License for the specific language governing permissions and limitations # under the License. - set -x bash ./reset.sh @@ -34,33 +33,27 @@ fi date +%m%d%H%M%S >> ${LOG_DIR}/time.log CUDA_VISIBLE_DEVICES=0 python3 ./run_pretraining.py \ - --eval_files_dir=./bert_pretrain_tf_records/eval_data \ - --bert_config_file=./bert_pretrain_tf_ckpt/bert_config.json \ - --input_files_dir=./bert_pretrain_tf_records/train_data \ - --train_batch_size=6 \ - --init_checkpoint=./bert_pretrain_tf_ckpt/model.ckpt-28252 \ --output_dir=${OUTPUT_DIR} \ --do_train=True \ - --do_train=True \ - --do_eval=True \ + --do_eval=False \ --is_dist_eval_enabled=False \ --eval_batch_size=24 \ --max_eval_steps=100 \ --max_predictions_per_seq=76 \ --max_seq_length=512 \ - --num_train_steps=2000 \ - --num_accumulation_steps=4 \ + --num_train_steps=13206 \ + --num_accumulation_steps=1 \ --num_warmup_steps=0 \ - --save_checkpoints_steps=20000 \ + --save_checkpoints_steps=1000 \ --learning_rate=5e-5 \ - --horovod --amp --nouse_xla \ + --amp --nouse_xla \ --allreduce_post_accumulation=True \ - --enable_device_warmup=True \ + --enable_device_warmup=False \ --samples_between_eval=150000 \ --stop_threshold=0.72 \ - --samples_start_eval=100 \ + --samples_start_eval=3000000 \ --dllog_path=${OUTPUT_DIR}/bert_dllog.json "$@" rm -rf ${OUTPUT_DIR}/* -date +%m%d%H%M%S >> ${LOG_DIR}/time.log \ No newline at end of file +date +%m%d%H%M%S >> ${LOG_DIR}/time.log diff --git a/nlp/language_model/bert/tensorflow/base/run_multi_card_FPS.sh b/nlp/language_model/bert/tensorflow/base/run_multi_card_FPS.sh index 7ce2b86c6..16087a784 100644 --- a/nlp/language_model/bert/tensorflow/base/run_multi_card_FPS.sh +++ b/nlp/language_model/bert/tensorflow/base/run_multi_card_FPS.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -14,12 +14,11 @@ # License for the specific language governing permissions and limitations # under the License. - set -x : ${HOROVOD_RUN_ARGS:="--gloo"} -# bash ./reset.sh +bash ./reset.sh DATE=`date +%m%d%H%M%S` @@ -35,15 +34,8 @@ fi date +%m%d%H%M%S >> ${LOG_DIR}/time.log -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 - # Training phase -horovodrun -np 8 ${HOROVOD_RUN_ARGS} python3 ./run_pretraining.py \ - --eval_files_dir=./bert_pretrain_tf_records/eval_data \ - --bert_config_file=./bert_pretrain_tf_ckpt/bert_config.json \ - --input_files_dir=./bert_pretrain_tf_records/train_data \ - --train_batch_size=6 \ - --init_checkpoint=./bert_pretrain_tf_ckpt/model.ckpt-28252 \ +horovodrun -np ${IX_NUM_CUDA_VISIBLE_DEVICES} ${HOROVOD_RUN_ARGS} python3 ./run_pretraining.py \ --output_dir=${OUTPUT_DIR} \ --do_train=True \ --do_eval=True \ @@ -70,4 +62,4 @@ exit_code=$? rm -rf ${OUTPUT_DIR}/* date +%m%d%H%M%S >> ${LOG_DIR}/time.log -exit ${exit_code} \ No newline at end of file +exit ${exit_code} diff --git a/nlp/language_model/bert/tensorflow/base/run_pretraining.py b/nlp/language_model/bert/tensorflow/base/run_pretraining.py index e2078b962..8efd7b550 100644 --- a/nlp/language_model/bert/tensorflow/base/run_pretraining.py +++ b/nlp/language_model/bert/tensorflow/base/run_pretraining.py @@ -995,7 +995,7 @@ def main(_): ''' if FLAGS.do_eval: if FLAGS.horovod: - if hvd.rank() is not 0: + if hvd.rank() != 0: return converged = False num_steps_between_eval = math.ceil(FLAGS.samples_between_eval / global_batch_size) -- Gitee