From 6a5786c699bd3bc13a8c1eeb6580be1e61d72a52 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Thu, 14 Nov 2024 07:18:38 +0000
Subject: [PATCH] 11/14/2024 07:18:38 sync TensorFlow file

---
 .../inceptionv3/tensorflow/get_num_devices.sh |  6 +-
 .../run_train_distributed_imagenette.sh       | 67 +++++++++++--------
 .../run_train_inception3_imagenette.sh        | 22 +++---
 ...un_train_inception3_multigpu_imagenette.sh | 23 ++++---
 .../resnet50/tensorflow/benchmark_cnn.py      |  5 ++
 .../run_train_distributed_imagenette.sh       |  6 +-
 .../vgg/tensorflow/get_num_devices.sh         |  9 +--
 .../run_train_distributed_imagenette.sh       | 65 ++++++++++--------
 .../tensorflow/run_train_vgg16_imagenette.sh  | 26 ++++---
 .../run_train_vgg16_multigpu_imagenette.sh    | 26 ++++---
 .../bert/tensorflow/base/README.md            | 24 +++++--
 .../bert/tensorflow/base/init_tf.sh           |  5 +-
 .../bert/tensorflow/base/optimization.py      |  2 +-
 .../bert/tensorflow/base/run_1card_FPS.sh     | 25 +++----
 .../tensorflow/base/run_multi_card_FPS.sh     | 16 ++---
 .../bert/tensorflow/base/run_pretraining.py   |  2 +-
 16 files changed, 187 insertions(+), 142 deletions(-)

diff --git a/cv/classification/inceptionv3/tensorflow/get_num_devices.sh b/cv/classification/inceptionv3/tensorflow/get_num_devices.sh
index 7c6036a71..a9c370895 100644
--- a/cv/classification/inceptionv3/tensorflow/get_num_devices.sh
+++ b/cv/classification/inceptionv3/tensorflow/get_num_devices.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
 # All Rights Reserved.
 #
 #    Licensed under the Apache License, Version 2.0 (the "License"); you may
@@ -19,8 +19,8 @@ if [ -n "$devices"  ]; then
     _devices=(${devices//,/ })
     num_devices=${#_devices[@]}
 else
-    num_devices=8
-    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    num_devices=2
+    export CUDA_VISIBLE_DEVICES=0,1
     echo "Not found CUDA_VISIBLE_DEVICES, set nproc_per_node = ${num_devices}"
 fi
 export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices}
diff --git a/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh
index 1787fb081..1abf5d2c0 100644
--- a/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh
+++ b/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh
@@ -25,7 +25,8 @@ export TF_CPP_MIN_LOG_LEVEL=1
 
 i=0
 model="alexnet"
-for arg in "$@"; do
+for arg in "$@"
+do
     if [ $i -eq 0 ]; then
         model=$arg
         let i++
@@ -40,11 +41,12 @@ for arg in "$@"; do
 done
 echo "## Training model: ${model}"
 
+
 : ${BATCH_SIZE:=16}
 # TRAIN_EPOCHS=10
 # optional optimizer: momentum, rmsprop, momentum, sgd
 OPTIMIZER=momentum
-DATE=$(date +%Y%m%d%H%M%S)
+DATE=`date +%Y%m%d%H%M%S`
 
 LOG_DIR="logs/${model}_distributed"
 DATA_DIR=./imagenette
@@ -56,7 +58,8 @@ mkdir -p ${BASE_DIR}
 rm -rf ${TRAIN_DIR}
 
 EXIT_STATUS=0
-check_status() {
+check_status()
+{
     if ((${PIPESTATUS[0]} != 0)); then
         EXIT_STATUS=1
     fi
@@ -66,7 +69,7 @@ check_status() {
 # Prepare devices
 #################################################
 devices=$CUDA_VISIBLE_DEVICES
-if [ -n "$devices" ]; then
+if [ -n "$devices"  ]; then
     devices=(${devices//,/ })
     num_devices=${#devices[@]}
 else
@@ -86,7 +89,8 @@ fi
 #################################################
 worker_hosts=""
 i=0
-for device in "${devices[@]}"; do
+for device in "${devices[@]}";
+do
     if [ "$i" == "0" ]; then
         let i++
         continue
@@ -102,13 +106,13 @@ echo "worker_hosts: ${worker_hosts}"
 #################################################
 trap ctrl_c INT
 function ctrl_c() {
-    echo "*** Trapped CTRL-C, killing process running background"
-    for pid in "${pid_list[@]}"; do
-        echo "Killing pid ${pid}"
-        kill ${pid}
-        wait ${pid}
-    done
-    exit 0
+  echo "*** Trapped CTRL-C, killing process running background"
+  for pid in "${pid_list[@]}"; do
+    echo "Killing pid ${pid}"
+    kill ${pid}
+    wait ${pid}
+  done
+  exit 0
 }
 
 #################################################
@@ -116,9 +120,10 @@ function ctrl_c() {
 #################################################
 
 pid_list=()
-last_device=$(expr ${num_devices} - 1)
+last_device=`expr ${num_devices} - 1`
 i=0
-for device in "${devices[@]}"; do
+for device in "${devices[@]}";
+do
     job_name="worker"
     if [ "${i}" == "0" ]; then
         job_name="ps"
@@ -127,26 +132,30 @@ for device in "${devices[@]}"; do
     if [ ${i} -le 1 ]; then
         task_index=0
     else
-        task_index=$(expr ${i} - 1)
+        task_index=`expr ${i} - 1`
     fi
 
     if [ "${i}" == "${last_device}" ]; then
-        CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \
-            --data_format=NCHW \
-            --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \
-            --batch_size=${BATCH_SIZE} --model=${model} \
-            --variable_update=distributed_replicated \
-            --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \
-            --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log
-        [[ ${PIPESTATUS[0]} == 0 ]] || exit
+        CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\
+         --data_name=imagenette --data_dir=${DATA_DIR}\
+         --data_format=NCHW \
+         --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\
+         --local_parameter_device=gpu --num_gpus=${num_devices}\
+         --batch_size=${BATCH_SIZE} --model=${model} \
+         --variable_update=distributed_replicated \
+         --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\
+         --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit
         echo "Distributed training PID ($!) on device ${device} where job name = ${job_name}"
     else
-        CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \
-            --data_format=NCHW \
-            --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \
-            --batch_size=${BATCH_SIZE} --model=${model} \
-            --variable_update=distributed_replicated --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \
-            --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" &
+        CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\
+         --data_name=imagenette --data_dir=${DATA_DIR}\
+         --data_format=NCHW \
+         --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\
+         --local_parameter_device=gpu --num_gpus=${num_devices}\
+         --batch_size=${BATCH_SIZE} --model=${model}\
+         --variable_update=distributed_replicated\
+         --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\
+         --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" &
         echo "Distributed training PID ($!) on device ${device} where job name = ${job_name} and task_index = ${task_index}"
     fi
     let i++
diff --git a/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh b/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh
index 12337021f..9c44c920f 100644
--- a/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh
+++ b/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh
@@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1
 #TRAIN_EPOCHS=10
 # optional optimizer: momentum, rmsprop, momentum, sgd
 OPTIMIZER=rmsprop
-DATE=$(date +%Y%m%d%H%M%S)
+DATE=`date +%Y%m%d%H%M%S`
 
 LOG_DIR="logs/inception3"
 DATA_DIR=./imagenette
@@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR}
 rm -rf ${TRAIN_DIR}
 
 EXIT_STATUS=0
-check_status() {
+check_status()
+{
     if ((${PIPESTATUS[0]} != 0)); then
         EXIT_STATUS=1
     fi
 }
 
 i=0
-for arg in "$@"; do
+for arg in "$@"
+do
     if [[ $arg =~ "--epoch" ]]; then
         new_args[$i]="--num_epochs"
     else
@@ -51,10 +53,14 @@ for arg in "$@"; do
     let i++
 done
 
-python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \
-    --data_format=NCHW --batch_size=${BATCH_SIZE} \
-    --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=1 --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \
-    --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching --stop_at_top_1_accuracy=0.9 --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log
-[[ ${PIPESTATUS[0]} == 0 ]] || exit
+python3 -u tf_cnn_benchmarks.py\
+ --data_name=imagenette --data_dir=${DATA_DIR}\
+ --data_format=NCHW --batch_size=${BATCH_SIZE}\
+ --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=1\
+ --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\
+ --eval_during_training_every_n_epochs=2\
+ --num_eval_epochs=1 --datasets_use_caching\
+ --stop_at_top_1_accuracy=0.9\
+ --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit
 
 exit ${EXIT_STATUS}
diff --git a/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh b/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh
index c48d0cbfd..6a8938ec5 100644
--- a/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh
+++ b/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh
@@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1
 #TRAIN_EPOCHS=10
 # optional optimizer: momentum, rmsprop, momentum, sgd
 OPTIMIZER=rmsprop
-DATE=$(date +%Y%m%d%H%M%S)
+DATE=`date +%Y%m%d%H%M%S`
 
 LOG_DIR="logs/inception3_multigpu"
 DATA_DIR=./imagenette
@@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR}
 rm -rf ${TRAIN_DIR}
 
 EXIT_STATUS=0
-check_status() {
+check_status()
+{
     if ((${PIPESTATUS[0]} != 0)); then
         EXIT_STATUS=1
     fi
 }
 
 i=0
-for arg in "$@"; do
+for arg in "$@"
+do
     if [[ $arg =~ "--epoch" ]]; then
         new_args[$i]="--num_epochs"
     else
@@ -53,11 +55,14 @@ done
 
 source ./get_num_devices.sh
 
-UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \
-    --data_format=NCHW --batch_size=${BATCH_SIZE} \
-    --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES} \
-    --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \
-    --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log
-[[ ${PIPESTATUS[0]} == 0 ]] || exit
+UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\
+ --data_name=imagenette --data_dir=${DATA_DIR}\
+ --data_format=NCHW --batch_size=${BATCH_SIZE}\
+ --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\
+ --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\
+ --eval_during_training_every_n_epochs=2\
+ --num_eval_epochs=1 --datasets_use_caching\
+ --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\
+ --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit
 
 exit ${EXIT_STATUS}
diff --git a/cv/classification/resnet50/tensorflow/benchmark_cnn.py b/cv/classification/resnet50/tensorflow/benchmark_cnn.py
index 6f65ea69b..7f6c1db4c 100644
--- a/cv/classification/resnet50/tensorflow/benchmark_cnn.py
+++ b/cv/classification/resnet50/tensorflow/benchmark_cnn.py
@@ -31,9 +31,11 @@ import re
 import threading
 import time
 import traceback
+import sys
 
 from absl import flags as absl_flags
 import numpy as np
+import math
 
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -881,6 +883,9 @@ def benchmark_one_step(sess,
     lossval = results['average_loss']
   else:
     lossval = 0.
+  if not math.isfinite(lossval):
+    print("Loss is {}, stopping training".format(lossval))
+    sys.exit(1)
   if image_producer is not None:
     image_producer.notify_image_consumption()
   train_time = time.time() - start_time
diff --git a/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh
index 98639e5c9..f4f48223c 100644
--- a/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh
+++ b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
 # All Rights Reserved.
 #
 #    Licensed under the Apache License, Version 2.0 (the "License"); you may
@@ -14,7 +14,6 @@
 #    License for the specific language governing permissions and limitations
 #    under the License.
 
-
 bash ./get_imagenette.sh
 
 export TF_CUDNN_USE_AUTOTUNE=1
@@ -43,7 +42,7 @@ done
 echo "## Training model: ${model}"
 
 
-: ${BATCH_SIZE:=32}
+: ${BATCH_SIZE:=16}
 # TRAIN_EPOCHS=10
 # optional optimizer: momentum, rmsprop, momentum, sgd
 OPTIMIZER=momentum
@@ -111,6 +110,7 @@ function ctrl_c() {
   for pid in "${pid_list[@]}"; do
     echo "Killing pid ${pid}"
     kill ${pid}
+    wait ${pid}
   done
   exit 0
 }
diff --git a/cv/classification/vgg/tensorflow/get_num_devices.sh b/cv/classification/vgg/tensorflow/get_num_devices.sh
index 14d6c0a5a..a9c370895 100644
--- a/cv/classification/vgg/tensorflow/get_num_devices.sh
+++ b/cv/classification/vgg/tensorflow/get_num_devices.sh
@@ -1,4 +1,5 @@
-# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+#!/bin/bash
+# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
 # All Rights Reserved.
 #
 #    Licensed under the Apache License, Version 2.0 (the "License"); you may
@@ -18,8 +19,8 @@ if [ -n "$devices"  ]; then
     _devices=(${devices//,/ })
     num_devices=${#_devices[@]}
 else
-    num_devices=8
-    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    num_devices=2
+    export CUDA_VISIBLE_DEVICES=0,1
     echo "Not found CUDA_VISIBLE_DEVICES, set nproc_per_node = ${num_devices}"
 fi
-export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices}
\ No newline at end of file
+export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices}
diff --git a/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh
index 36b41ee85..c7642b986 100644
--- a/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh
+++ b/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh
@@ -25,7 +25,8 @@ export TF_CPP_MIN_LOG_LEVEL=1
 
 i=0
 model="alexnet"
-for arg in "$@"; do
+for arg in "$@"
+do
     if [ $i -eq 0 ]; then
         model=$arg
         let i++
@@ -40,11 +41,12 @@ for arg in "$@"; do
 done
 echo "## Training model: ${model}"
 
+
 : ${BATCH_SIZE:=16}
 # TRAIN_EPOCHS=10
 # optional optimizer: momentum, rmsprop, momentum, sgd
 OPTIMIZER=momentum
-DATE=$(date +%Y%m%d%H%M%S)
+DATE=`date +%Y%m%d%H%M%S`
 
 LOG_DIR="logs/${model}_distributed"
 DATA_DIR=./imagenette
@@ -56,7 +58,8 @@ mkdir -p ${BASE_DIR}
 rm -rf ${TRAIN_DIR}
 
 EXIT_STATUS=0
-check_status() {
+check_status()
+{
     if ((${PIPESTATUS[0]} != 0)); then
         EXIT_STATUS=1
     fi
@@ -66,7 +69,7 @@ check_status() {
 # Prepare devices
 #################################################
 devices=$CUDA_VISIBLE_DEVICES
-if [ -n "$devices" ]; then
+if [ -n "$devices"  ]; then
     devices=(${devices//,/ })
     num_devices=${#devices[@]}
 else
@@ -86,7 +89,8 @@ fi
 #################################################
 worker_hosts=""
 i=0
-for device in "${devices[@]}"; do
+for device in "${devices[@]}";
+do
     if [ "$i" == "0" ]; then
         let i++
         continue
@@ -102,12 +106,12 @@ echo "worker_hosts: ${worker_hosts}"
 #################################################
 trap ctrl_c INT
 function ctrl_c() {
-    echo "*** Trapped CTRL-C, killing process running background"
-    for pid in "${pid_list[@]}"; do
-        echo "Killing pid ${pid}"
-        kill ${pid}
-    done
-    exit 0
+  echo "*** Trapped CTRL-C, killing process running background"
+  for pid in "${pid_list[@]}"; do
+    echo "Killing pid ${pid}"
+    kill ${pid}
+  done
+  exit 0
 }
 
 #################################################
@@ -115,9 +119,10 @@ function ctrl_c() {
 #################################################
 
 pid_list=()
-last_device=$(expr ${num_devices} - 1)
+last_device=`expr ${num_devices} - 1`
 i=0
-for device in "${devices[@]}"; do
+for device in "${devices[@]}";
+do
     job_name="worker"
     if [ "${i}" == "0" ]; then
         job_name="ps"
@@ -126,26 +131,30 @@ for device in "${devices[@]}"; do
     if [ ${i} -le 1 ]; then
         task_index=0
     else
-        task_index=$(expr ${i} - 1)
+        task_index=`expr ${i} - 1`
     fi
 
     if [ "${i}" == "${last_device}" ]; then
-        CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \
-            --data_format=NCHW \
-            --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \
-            --batch_size=${BATCH_SIZE} --model=${model} \
-            --variable_update=distributed_replicated \
-            --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \
-            --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log
-        [[ ${PIPESTATUS[0]} == 0 ]] || exit
+        CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\
+         --data_name=imagenette --data_dir=${DATA_DIR}\
+         --data_format=NCHW \
+         --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\
+         --local_parameter_device=gpu --num_gpus=${num_devices}\
+         --batch_size=${BATCH_SIZE} --model=${model} \
+         --variable_update=distributed_replicated \
+         --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\
+         --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit
         echo "Distributed training PID ($!) on device ${device} where job name = ${job_name}"
     else
-        CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \
-            --data_format=NCHW \
-            --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \
-            --batch_size=${BATCH_SIZE} --model=${model} \
-            --variable_update=distributed_replicated --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \
-            --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" &
+        CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\
+         --data_name=imagenette --data_dir=${DATA_DIR}\
+         --data_format=NCHW \
+         --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\
+         --local_parameter_device=gpu --num_gpus=${num_devices}\
+         --batch_size=${BATCH_SIZE} --model=${model}\
+         --variable_update=distributed_replicated\
+         --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\
+         --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" &
         echo "Distributed training PID ($!) on device ${device} where job name = ${job_name} and task_index = ${task_index}"
     fi
     let i++
diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh
index aeea001bc..343dbe372 100644
--- a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh
+++ b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh
@@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1
 #TRAIN_EPOCHS=10
 # optional optimizer: adam, rmsprop, momentum, sgd
 OPTIMIZER=momentum
-DATE=$(date +%Y%m%d%H%M%S)
+DATE=`date +%Y%m%d%H%M%S`
 
 LOG_DIR="logs/vgg16"
 DATA_DIR=./imagenette
@@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR}
 rm -rf ${TRAIN_DIR}
 
 EXIT_STATUS=0
-check_status() {
+check_status()
+{
     if ((${PIPESTATUS[0]} != 0)); then
         EXIT_STATUS=1
     fi
 }
 
 i=0
-for arg in "$@"; do
+for arg in "$@"
+do
     if [[ $arg =~ "--epoch" ]]; then
         new_args[$i]="--num_epochs"
     else
@@ -51,13 +53,15 @@ for arg in "$@"; do
     let i++
 done
 
-python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \
-    --data_format=NCHW --batch_size=${BATCH_SIZE} \
-    --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1 \
-    --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \
-    --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 \
-    --datasets_use_caching --stop_at_top_1_accuracy=0.9 --num_intra_threads=1 \
-    --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log
-[[ ${PIPESTATUS[0]} == 0 ]] || exit
+python3 -u tf_cnn_benchmarks.py\
+ --data_name=imagenette --data_dir=${DATA_DIR}\
+ --data_format=NCHW --batch_size=${BATCH_SIZE}\
+ --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1\
+ --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\
+ --eval_during_training_every_n_epochs=2\
+ --num_eval_epochs=1 --datasets_use_caching\
+ --stop_at_top_1_accuracy=0.9\
+ --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit
+
 
 exit ${EXIT_STATUS}
diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh
index 4de70ff72..d4c2bcd2a 100644
--- a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh
+++ b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh
@@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1
 #TRAIN_EPOCHS=10
 # optional optimizer: adam, rmsprop, momentum, sgd
 OPTIMIZER=momentum
-DATE=$(date +%Y%m%d%H%M%S)
+DATE=`date +%Y%m%d%H%M%S`
 
 LOG_DIR="logs/vgg16_multigpu"
 DATA_DIR=./imagenette
@@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR}
 rm -rf ${TRAIN_DIR}
 
 EXIT_STATUS=0
-check_status() {
+check_status()
+{
     if ((${PIPESTATUS[0]} != 0)); then
         EXIT_STATUS=1
     fi
 }
 
 i=0
-for arg in "$@"; do
+for arg in "$@"
+do
     if [[ $arg =~ "--epoch" ]]; then
         new_args[$i]="--num_epochs"
     else
@@ -53,13 +55,15 @@ done
 
 source ./get_num_devices.sh
 
-UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \
-    --data_format=NCHW --batch_size=${BATCH_SIZE} \
-    --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES} \
-    --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \
-    --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching \
-    --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu --num_intra_threads=1 \
-    --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log
-[[ ${PIPESTATUS[0]} == 0 ]] || exit
+UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\
+ --data_name=imagenette --data_dir=${DATA_DIR}\
+ --data_format=NCHW --batch_size=${BATCH_SIZE}\
+ --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\
+ --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\
+ --eval_during_training_every_n_epochs=2\
+ --num_eval_epochs=1 --datasets_use_caching\
+ --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\
+ --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit
+
 
 exit ${EXIT_STATUS}
diff --git a/nlp/language_model/bert/tensorflow/base/README.md b/nlp/language_model/bert/tensorflow/base/README.md
index a77ae4cec..42ae860eb 100644
--- a/nlp/language_model/bert/tensorflow/base/README.md
+++ b/nlp/language_model/bert/tensorflow/base/README.md
@@ -1,4 +1,4 @@
-# BERT Pretraining
+# BERT Pretraining
 
 ## Model description
 BERT, or Bidirectional Encoder Representations from Transformers, improves upon standard Transformers by removing the unidirectionality constraint by using a masked language model (MLM) pre-training objective. The masked language model randomly masks some of the tokens from the input, and the objective is to predict the original vocabulary id of the masked word based only on its context. Unlike left-to-right language model pre-training, the MLM objective enables the representation to fuse the left and the right context, which allows us to pre-train a deep bidirectional Transformer. In addition to the masked language model, BERT uses a next sentence prediction task that jointly pre-trains text-pair representations.
@@ -9,6 +9,12 @@ BERT, or Bidirectional Encoder Representations from Transformers, improves upon
 
 ```shell
 bash init_tf.sh
+wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.7.tar.gz
+tar xf openmpi-4.0.7.tar.gz
+cd openmpi-4.0.7/
+./configure --prefix=/usr/local --with-orte
+make -j4 && make install
+export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
 ```
 
 ### Download datasets
@@ -34,16 +40,26 @@ tips: you can git clone this repo in other place ,we need the bert_pretrain_tf_r
 ### Training on single card
 
 ```shell
-bash run_1card_FPS.sh
+bash run_1card_FPS.sh --input_files_dir=/path/to/bert_pretrain_tf_records/train_data \
+        --init_checkpoint=/path/to/bert_pretrain_ckpt_tf/model.ckpt-28252 \
+        --eval_files_dir=/path/to/bert_pretrain_tf_records/eval_data \
+        --train_batch_size=6 \
+        --bert_config_file=/path/to/bert_pretrain_ckpt_tf/bert_config.json
 ```
 
 ### Training on mutil-cards
 ```shell
-bash run_multi_card_FPS.sh 
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export IX_NUM_CUDA_VISIBLE_DEVICES=8
+bash run_multi_card_FPS.sh --input_files_dir=/path/to/bert_pretrain_tf_records/train_data \
+        --init_checkpoint=/path/to/bert_pretrain_ckpt_tf/model.ckpt-28252 \
+        --eval_files_dir=/path/to/bert_pretrain_tf_records/eval_data \
+        --train_batch_size=6 \
+        --bert_config_file=/path/to/bert_pretrain_ckpt_tf/bert_config.json
 ```
  
 ## Result
 
 |               | acc       |       fps |
 | ---           | ---       | ---       |
-|    multi_card |  0.424126  | 0.267241|
+|    multi_card |  0.424126  | 0.267241|
\ No newline at end of file
diff --git a/nlp/language_model/bert/tensorflow/base/init_tf.sh b/nlp/language_model/bert/tensorflow/base/init_tf.sh
index 08f27c35b..79e2ae63b 100644
--- a/nlp/language_model/bert/tensorflow/base/init_tf.sh
+++ b/nlp/language_model/bert/tensorflow/base/init_tf.sh
@@ -13,7 +13,8 @@
 #    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 #    License for the specific language governing permissions and limitations
 #    under the License.
-
+pip3 uninstall -y protobuf
+pip3 install "protobuf<4.0.0"
 pip3 install git+https://github.com/mlperf/logging.git
 pip3 install git+https://github.com/NVIDIA/dllogger.git
-pip3 install pandas==0.24
\ No newline at end of file
+pip3 install pandas==1.3.5
\ No newline at end of file
diff --git a/nlp/language_model/bert/tensorflow/base/optimization.py b/nlp/language_model/bert/tensorflow/base/optimization.py
index f2e747c9c..f7aa9f491 100644
--- a/nlp/language_model/bert/tensorflow/base/optimization.py
+++ b/nlp/language_model/bert/tensorflow/base/optimization.py
@@ -84,7 +84,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, manual_fp
   if hvd and (num_accumulation_steps == 1 or (not allreduce_post_accumulation)):
     optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True)
   if use_fp16:
-    loss_scaler = tf.train.experimental.DynamicLossScale(
+    loss_scaler = tf.compat.v1.mixed_precision.DynamicLossScale(
         initial_loss_scale=init_loss_scale, increment_period=1000, multiplier=2.0)
     optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scaler)
     loss_scale_value = tf.identity(loss_scaler(), name="loss_scale")
diff --git a/nlp/language_model/bert/tensorflow/base/run_1card_FPS.sh b/nlp/language_model/bert/tensorflow/base/run_1card_FPS.sh
index 64b9094fb..683dc6696 100644
--- a/nlp/language_model/bert/tensorflow/base/run_1card_FPS.sh
+++ b/nlp/language_model/bert/tensorflow/base/run_1card_FPS.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
 # All Rights Reserved.
 #
 #    Licensed under the Apache License, Version 2.0 (the "License"); you may
@@ -14,7 +14,6 @@
 #    License for the specific language governing permissions and limitations
 #    under the License.
 
-
 set -x
 
 bash ./reset.sh
@@ -34,33 +33,27 @@ fi
 date +%m%d%H%M%S >> ${LOG_DIR}/time.log
 
 CUDA_VISIBLE_DEVICES=0 python3 ./run_pretraining.py \
-	--eval_files_dir=./bert_pretrain_tf_records/eval_data \
-	--bert_config_file=./bert_pretrain_tf_ckpt/bert_config.json \
-	--input_files_dir=./bert_pretrain_tf_records/train_data \
-	--train_batch_size=6  \
-    --init_checkpoint=./bert_pretrain_tf_ckpt/model.ckpt-28252 \
 	--output_dir=${OUTPUT_DIR} \
 	--do_train=True \
-	--do_train=True \
-	--do_eval=True \
+	--do_eval=False \
 	--is_dist_eval_enabled=False \
 	--eval_batch_size=24 \
 	--max_eval_steps=100 \
 	--max_predictions_per_seq=76 \
 	--max_seq_length=512 \
-	--num_train_steps=2000 \
-	--num_accumulation_steps=4 \
+	--num_train_steps=13206 \
+	--num_accumulation_steps=1 \
 	--num_warmup_steps=0 \
-	--save_checkpoints_steps=20000 \
+	--save_checkpoints_steps=1000 \
 	--learning_rate=5e-5 \
-	--horovod --amp --nouse_xla \
+	--amp --nouse_xla \
 	--allreduce_post_accumulation=True \
-	--enable_device_warmup=True \
+	--enable_device_warmup=False \
 	--samples_between_eval=150000 \
 	--stop_threshold=0.72 \
-	--samples_start_eval=100 \
+	--samples_start_eval=3000000 \
 	--dllog_path=${OUTPUT_DIR}/bert_dllog.json "$@"
 
 rm -rf ${OUTPUT_DIR}/*
 
-date +%m%d%H%M%S >> ${LOG_DIR}/time.log
\ No newline at end of file
+date +%m%d%H%M%S >> ${LOG_DIR}/time.log
diff --git a/nlp/language_model/bert/tensorflow/base/run_multi_card_FPS.sh b/nlp/language_model/bert/tensorflow/base/run_multi_card_FPS.sh
index 7ce2b86c6..16087a784 100644
--- a/nlp/language_model/bert/tensorflow/base/run_multi_card_FPS.sh
+++ b/nlp/language_model/bert/tensorflow/base/run_multi_card_FPS.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
 # All Rights Reserved.
 #
 #    Licensed under the Apache License, Version 2.0 (the "License"); you may
@@ -14,12 +14,11 @@
 #    License for the specific language governing permissions and limitations
 #    under the License.
 
-
 set -x
 
 : ${HOROVOD_RUN_ARGS:="--gloo"}
 
-# bash ./reset.sh
+bash ./reset.sh
 
 DATE=`date +%m%d%H%M%S`
 
@@ -35,15 +34,8 @@ fi
 
 date +%m%d%H%M%S >> ${LOG_DIR}/time.log
 
-export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-
 # Training phase
-horovodrun -np 8 ${HOROVOD_RUN_ARGS}  python3 ./run_pretraining.py \
-	--eval_files_dir=./bert_pretrain_tf_records/eval_data \
-	--bert_config_file=./bert_pretrain_tf_ckpt/bert_config.json \
-	--input_files_dir=./bert_pretrain_tf_records/train_data \
-	--train_batch_size=6  \
-    --init_checkpoint=./bert_pretrain_tf_ckpt/model.ckpt-28252 \
+horovodrun -np ${IX_NUM_CUDA_VISIBLE_DEVICES} ${HOROVOD_RUN_ARGS} python3 ./run_pretraining.py \
 	--output_dir=${OUTPUT_DIR} \
 	--do_train=True \
 	--do_eval=True \
@@ -70,4 +62,4 @@ exit_code=$?
 rm -rf ${OUTPUT_DIR}/*
 
 date +%m%d%H%M%S >> ${LOG_DIR}/time.log
-exit ${exit_code}
\ No newline at end of file
+exit ${exit_code}
diff --git a/nlp/language_model/bert/tensorflow/base/run_pretraining.py b/nlp/language_model/bert/tensorflow/base/run_pretraining.py
index e2078b962..8efd7b550 100644
--- a/nlp/language_model/bert/tensorflow/base/run_pretraining.py
+++ b/nlp/language_model/bert/tensorflow/base/run_pretraining.py
@@ -995,7 +995,7 @@ def main(_):
   '''
   if FLAGS.do_eval:
     if FLAGS.horovod:
-      if hvd.rank() is not 0:
+      if hvd.rank() != 0:
         return
     converged = False
     num_steps_between_eval = math.ceil(FLAGS.samples_between_eval / global_batch_size)
-- 
Gitee