From b772bd1349f94eae83637ce68141b8ebbd63ef0b Mon Sep 17 00:00:00 2001 From: zhangyinxia Date: Wed, 17 Dec 2025 17:22:21 +0800 Subject: [PATCH] update docs for msrun start --- official/cv/FasterRCNN/README.md | 6 ++-- official/cv/FasterRCNN/README_CN.md | 11 +++--- official/cv/ResNet/README.md | 22 +++++------- official/cv/ResNet/README_CN.md | 22 +++++------- .../scripts/run_parameter_server_train.sh | 34 +++++++------------ official/nlp/Bert/README.md | 10 +++--- official/nlp/Bert/README_CN.md | 10 +++--- 7 files changed, 49 insertions(+), 66 deletions(-) diff --git a/official/cv/FasterRCNN/README.md b/official/cv/FasterRCNN/README.md index 130130c80..431edc285 100644 --- a/official/cv/FasterRCNN/README.md +++ b/official/cv/FasterRCNN/README.md @@ -144,7 +144,7 @@ python -m src.convert_checkpoint --ckpt_file=[BACKBONE_MODEL] bash run_standalone_train_ascend.sh [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [DEVICE_ID] [MINDRECORD_DIR](optional) # distributed training -bash run_distribute_train_ascend.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](optional) +bash run_distribute_train_ascend_msrun.sh [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](optional) # eval bash run_eval_ascend.sh [VALIDATION_JSON_FILE] [CHECKPOINT_PATH] [BACKBONE] [COCO_ROOT] [DEVICE_ID] [MINDRECORD_DIR](optional) @@ -204,7 +204,7 @@ bash scripts/docker_start.sh fasterrcnn:20.1.0 [DATA_DIR] [MODEL_DIR] bash run_standalone_train_ascend.sh [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [DEVICE_ID] [MINDRECORD_DIR](optional) # distributed training -bash run_distribute_train_ascend.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](optional) +bash run_distribute_train_ascend_msrun.sh [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](optional) ``` 4. 
Eval @@ -404,7 +404,7 @@ elif backbone == "resnet_v1_50": bash run_standalone_train_ascend.sh [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [DEVICE_ID] [MINDRECORD_DIR](optional) # distributed training on ascend -bash run_distribute_train_ascend.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](optional) +bash run_distribute_train_ascend_msrun.sh [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](optional) ``` #### on GPU diff --git a/official/cv/FasterRCNN/README_CN.md b/official/cv/FasterRCNN/README_CN.md index ed214c0af..86d44108a 100644 --- a/official/cv/FasterRCNN/README_CN.md +++ b/official/cv/FasterRCNN/README_CN.md @@ -178,7 +178,7 @@ python -m src.convert_checkpoint --ckpt_file=[BACKBONE_MODEL] bash run_standalone_train_ascend.sh [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [DEVICE_ID] [MINDRECORD_DIR](optional) # 分布式训练 -bash run_distribute_train_ascend.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](optional) +bash run_distribute_train_ascend_msrun.sh [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](optional) # 评估 bash run_eval_ascend.sh [VALIDATION_JSON_FILE] [CHECKPOINT_PATH] [BACKBONE] [COCO_ROOT] [DEVICE_ID] [MINDRECORD_DIR](optional) @@ -238,7 +238,7 @@ bash scripts/docker_start.sh fasterrcnn:20.1.0 [DATA_DIR] [MODEL_DIR] bash run_standalone_train_ascend.sh [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [DEVICE_ID] [MINDRECORD_DIR](optional) # 分布式训练 -bash run_distribute_train_ascend.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](optional) +bash run_distribute_train_ascend_msrun.sh [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](optional) ``` 4. 
评估 @@ -436,7 +436,7 @@ elif backbone == "resnet_v1_50": bash run_standalone_train_ascend.sh [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [DEVICE_ID] [MINDRECORD_DIR](optional) # Ascend分布式训练 -bash run_distribute_train_ascend.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](optional) +bash run_distribute_train_ascend_msrun.sh [PRETRAINED_MODEL] [BACKBONE] [COCO_ROOT] [MINDRECORD_DIR](optional) ``` #### 在GPU上运行 @@ -458,8 +458,7 @@ python train.py --config_path=[CONFIG_PATH] --pre_trained=[PRE_TRAINED] --coco_r Notes: -1. 运行分布式任务时需要用到RANK_TABLE_FILE指定的rank_table.json。您可以使用[hccl_tools](https://gitee.com/mindspore/models/tree/master/utils/hccl_tools)生成该文件。 -2. PRETRAINED_MODEL应该是训练好的ResNet-50检查点。如果需要加载训练好的FasterRcnn的检查点,需要对train.py作如下修改: +1. PRETRAINED_MODEL应该是训练好的ResNet-50检查点。如果需要加载训练好的FasterRcnn的检查点,需要对train.py作如下修改: ```python # 注释掉如下代码 @@ -485,7 +484,7 @@ Notes: load_param_into_net(net, param_dict) ``` -3. default_config.yaml、default_config_101.yaml、default_config_152.yaml或default_config_InceptionResnetV2.yaml中包含原数据集路径,可以选择“coco_root”或“image_dir”。 +2. 
default_config.yaml、default_config_101.yaml、default_config_152.yaml或default_config_InceptionResnetV2.yaml中包含原数据集路径,可以选择“coco_root”或“image_dir”。 ### 结果 diff --git a/official/cv/ResNet/README.md b/official/cv/ResNet/README.md index 0a08a869f..13ed2aec7 100644 --- a/official/cv/ResNet/README.md +++ b/official/cv/ResNet/README.md @@ -149,7 +149,7 @@ After installing MindSpore via the official website, you can start training and ```bash # distributed training for 8p -Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional) +Usage: bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional) # standalone training Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional) @@ -437,7 +437,7 @@ Note: In order to obtain better performance, the batch_size is recommended to be ```bash # distributed training for 8p -Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional) +Usage: bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional) # standalone training Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional) @@ -450,11 +450,7 @@ Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] [DEVICE_I the `LOCAL_DEVICE_NUM` and `RANK_SIZE` in `run_distribute_train.sh`, `LOCAL_DEVICE_NUM` means the number of single machine cards, `RANK_SIZE` means total number of cards used in training. -For distributed training, a hccl configuration file with JSON format needs to be created in advance. - -Please follow the instructions in the link [hccn_tools](https://gitee.com/mindspore/models/tree/master/utils/hccl_tools). - -Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the following in log. 
+Training result will be stored in the example path, whose folder name begins with "rank", "train" or "train_parallel". Under this, you can find checkpoint file together with result like the following in log. If you want to change device_id for standalone training, you can set environment variable `export DEVICE_ID=x` or set `device_id=x` in context. @@ -487,7 +483,7 @@ Please follow the instructions in the link [GPU-Multi-Host](https://www.mindspor - Parameter server training Ascend example ```bash -bash run_parameter_server_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional) +bash run_parameter_server_train.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional) ``` - Parameter server training GPU example @@ -501,11 +497,11 @@ bash run_parameter_server_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT ```bash # evaluation with distributed training Ascend example: cd scripts/ -bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL] [EVAL_DATASET_PATH] +bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL] [EVAL_DATASET_PATH] # example of reasoning during distributed breakpoint training: cd scripts/ -bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL] [EVAL_DATASET_PATH] [RESUME_CKPT] +bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL] [EVAL_DATASET_PATH] [RESUME_CKPT] # evaluation with standalone training Ascend example: cd scripts/ @@ -539,7 +535,7 @@ Users can choose to shutdown the cache server after training or leave it alone f ```text # distributed training for 8p -用法:bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT] +用法:bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT] # standalone training 用法:bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT] @@ -956,10 +952,10 @@ bash run_distribute_train_gpu.sh ../pruner/uni_pruning/ 
../pruner/uni_pruning/re # For UniPruning on Ascend config.device_target = 'Ascend' # distributed training example, apply UniPruning and train from pretrained checkpoint -bash scripts/run_distribute_train.sh /path/to/rank_table_file pruner/uni_pruning/ pruner/uni_pruning/resnet50_config.yaml /path/to/dataset FP32 ./checkpoint/resnet-90.ckpt +bash scripts/run_distribute_train_msrun.sh pruner/uni_pruning/ pruner/uni_pruning/resnet50_config.yaml /path/to/dataset FP32 ./checkpoint/resnet-90.ckpt # standalone training example, apply UniPruning and train from pretrained checkpoint -bash scripts/run_standalone_train.sh pruner/uni_pruning/ /path/to/rank_table_file pruner/uni_pruning/resnet50_config.yaml /path/to/dataset FP32 ./checkpoint/resnet-90.ckpt +bash scripts/run_standalone_train.sh pruner/uni_pruning/ pruner/uni_pruning/resnet50_config.yaml /path/to/dataset FP32 ./checkpoint/resnet-90.ckpt ``` ## Evaluation Process diff --git a/official/cv/ResNet/README_CN.md b/official/cv/ResNet/README_CN.md index b030f3c67..fedf23b01 100644 --- a/official/cv/ResNet/README_CN.md +++ b/official/cv/ResNet/README_CN.md @@ -156,7 +156,7 @@ ResNet的总体网络架构如下: ```text # 8卡分布式训练 -用法:bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](可选) +用法:bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](可选) # 单机训练 用法:bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](可选) @@ -433,7 +433,7 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] ```text # 8卡分布式训练 -用法:bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](可选) +用法:bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](可选) # 单机训练 用法:bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](可选) @@ -446,11 +446,7 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] `run_distribute_train.sh` 是8卡训练脚本示例,如果想使用其他卡数训练,需要修改`run_distribute_train.sh`脚本中的 
`LOCAL_DEVICE_NUM` 和 `RANK_SIZE` ,`LOCAL_DEVICE_NUM` 是单机使用的卡数,`RANK_SIZE` 是训练使用的总卡数。 -分布式训练需要提前创建JSON格式的HCCL配置文件。 - -具体操作,参见[hccn_tools](https://gitee.com/mindspore/models/tree/master/utils/hccl_tools)中的说明。 - -训练结果保存在示例路径中,文件夹名称以“train”或“train_parallel”开头。您可在此路径下的日志中找到检查点文件以及结果,如下所示。 +训练结果保存在示例路径中,文件夹名称以“rank”, “train”或“train_parallel”开头。您可在此路径下的日志中找到检查点文件以及结果,如下所示。 运行单卡用例时如果想更换运行卡号,可以通过设置环境变量 `export DEVICE_ID=x` 或者在context中设置 `device_id=x`指定相应的卡号。 @@ -472,7 +468,7 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] - Ascend参数服务器训练示例 ```text -bash run_parameter_server_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](可选) +bash run_parameter_server_train.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](可选) ``` - GPU参数服务器训练示例 @@ -486,11 +482,11 @@ bash run_parameter_server_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT ```bash # Ascend 8卡分布式训练时推理示例: cd scripts/ -bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL] [EVAL_DATASET_PATH] +bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL] [EVAL_DATASET_PATH] # Ascend 分布式断点训练时推理示例: cd scripts/ -bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL] [EVAL_DATASET_PATH] [RESUME_CKPT] +bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL] [EVAL_DATASET_PATH] [RESUME_CKPT] # Ascend 单机训练时推理示例: cd scripts/ @@ -566,7 +562,7 @@ python eval.py --config_path ./cpu_default_config.yaml --data_path ./dataset/flo ```text # 8卡分布式训练 -用法:bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH] +用法:bash run_distribute_train_msrun.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH] # 单机训练 用法:bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH] @@ -994,10 +990,10 @@ bash run_distribute_train_gpu.sh ../pruner/scop/ ../pruner/scop/resnet50_cifar10 # 分布式训练 cd ./golden_stick/scripts/ # PYTHON_PATH 
表示需要应用的算法的'train.py'脚本所在的目录。 -bash run_distribute_train.sh [RANK_TABLE_FILE] [PYTHON_PATH] [CONFIG_PATH] [DATASET_PATH] [CKPT_TYPE](optional) [CKPT_PATH](optional) +bash run_distribute_train_msrun.sh [PYTHON_PATH] [CONFIG_PATH] [DATASET_PATH] [CKPT_TYPE](optional) [CKPT_PATH](optional) # 分布式训练示例(SCOP算法使用多卡训练) -bash run_distribute_train.sh /path/to/rank_table_file ../pruner/scop/ ../pruner/scop/resnet50_cifar10_config.yaml /path/to/dataset +bash run_distribute_train_msrun.sh ../pruner/scop/ ../pruner/scop/resnet50_cifar10_config.yaml /path/to/dataset # 单机训练 cd ./golden_stick/scripts/ diff --git a/official/cv/ResNet/scripts/run_parameter_server_train.sh b/official/cv/ResNet/scripts/run_parameter_server_train.sh index df1e1a529..28d50beae 100644 --- a/official/cv/ResNet/scripts/run_parameter_server_train.sh +++ b/official/cv/ResNet/scripts/run_parameter_server_train.sh @@ -14,9 +14,9 @@ # limitations under the License. # ============================================================================ -if [ $# != 3 ] && [ $# != 4 ] +if [ $# != 2 ] && [ $# != 3 ] then - echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional)" + echo "Usage: bash run_distribute_train.sh [DATASET_PATH] [CONFIG_PATH] [RESUME_CKPT](optional)" exit 1 fi @@ -28,28 +28,21 @@ get_real_path(){ fi } -PATH1=$(get_real_path $1) -PATH2=$(get_real_path $2) -CONFIG_FILE=$(get_real_path $3) +PATH2=$(get_real_path $1) +CONFIG_FILE=$(get_real_path $2) -if [ $# == 4 ] +if [ $# == 3 ] then - RESUME_CKPT=$(get_real_path $4) + RESUME_CKPT=$(get_real_path $3) fi -if [ ! -f $PATH1 ] -then - echo "error: RANK_TABLE_FILE=$PATH1 is not a file" - exit 1 -fi - if [ ! -d $PATH2 ] then echo "error: DATASET_PATH=$PATH2 is not a directory" exit 1 fi -if [ $# == 4 ] && [ ! -f $RESUME_CKPT ] +if [ $# == 3 ] && [ ! 
-f $RESUME_CKPT ] then echo "error: RESUME_CKPT=$RESUME_CKPT is not a file" exit 1 @@ -60,7 +53,6 @@ ulimit -u unlimited export LOCAL_DEVICE_NUM=8 # Total number of cards used in training export RANK_SIZE=8 -export RANK_TABLE_FILE=$PATH1 export MS_SCHED_NUM=1 export MS_WORKER_NUM=$RANK_SIZE @@ -82,13 +74,13 @@ cp *.sh ./sched cp -r ../src ./sched cd ./sched || exit echo "start scheduler" -if [ $# == 3 ] +if [ $# == 2 ] then python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True \ --config_path=$CONFIG_FILE --output_dir '../outputs' &> sched.log & fi -if [ $# == 4 ] +if [ $# == 3 ] then python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True --RESUME_CKPT=$RESUME_CKPT \ --config_path=$CONFIG_FILE --output_dir '../outputs' &> sched.log & @@ -108,13 +100,13 @@ do cp -r ../src ./server_$i cd ./server_$i || exit echo "start server" - if [ $# == 3 ] + if [ $# == 2 ] then python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True \ --config_path=$CONFIG_FILE --output_dir '../outputs' &> server_$i.log & fi - if [ $# == 4 ] + if [ $# == 3 ] then python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True --resume_ckpt=$RESUME_CKPT \ --config_path=$CONFIG_FILE --output_dir '../outputs' &> server_$i.log & @@ -137,13 +129,13 @@ do cd ./worker_$i || exit echo "start training for worker rank $RANK_ID, device $DEVICE_ID" env > env.log - if [ $# == 3 ] + if [ $# == 2 ] then python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 --parameter_server=True \ --config_path=$CONFIG_FILE --output_dir '../outputs' &> worker_$i.log & fi - if [ $# == 4 ] + if [ $# == 3 ] then python train.py --run_distribute=True --device_num=$RANK_SIZE --data_path=$PATH2 \ --parameter_server=True --resume_ckpt=$RESUME_CKPT \ diff --git a/official/nlp/Bert/README.md b/official/nlp/Bert/README.md index d63c72c34..30f1a07b9 100644 
--- a/official/nlp/Bert/README.md +++ b/official/nlp/Bert/README.md @@ -161,7 +161,7 @@ After installing MindSpore via the official website, you can start pre-training, bash scripts/run_standalone_pretrain_ascend.sh 0 1 /path/cn-wiki-128 # run distributed pre-training example -bash scripts/run_distributed_pretrain_ascend.sh /path/cn-wiki-128 /path/hccl.json +bash scripts/run_distributed_pretrain_ascend_msrun.sh /path/cn-wiki-128 /path/hccl.json # run the evaluation for pre-training example # Modify the `eval_ckpt` and `eval_data_dir` in pretrain_config.yaml @@ -230,10 +230,10 @@ If you want to run in modelarts, please check the official documentation of [mod # (5) Perform a or b. # a. setting parameters in /{path}/bert/pretrain_config.yaml. # 1. Set ”enable_modelarts=True“ - # 2. Set other parameters, other parameter configuration can refer to `./scripts/run_distributed_pretrain_ascend.sh` + # 2. Set other parameters, other parameter configuration can refer to `./scripts/run_distributed_pretrain_ascend_msrun.sh` # b. adding on the website UI interface. # 1. Add ”enable_modelarts=True“ - # 3. Add other parameters, other parameter configuration can refer to `./scripts/run_distributed_pretrain_ascend.sh` + # 3. Add other parameters, other parameter configuration can refer to `./scripts/run_distributed_pretrain_ascend_msrun.sh` # (6) Upload the dataset to S3 bucket. # (7) Check the "data storage location" on the website UI interface and set the "Dataset path" path (there is only data or zip package under this path). # (8) Set the "Output file path" and "Job log path" to your path on the website UI interface. 
@@ -696,11 +696,11 @@ epoch: 0.0, current epoch percent: 0.000, step: 2, outputs are (Tensor(shape=[1] Before distribute pretrain on ascend, you need to generate distributed_cmd.sh as follows: ```python -python scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py --run_script_dir ./scripts/run_distributed_pretrain_ascend.sh --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/data_dir/ --hccl_config /path/hccl.json --cmd_file ./distributed_cmd.sh +python scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py --run_script_dir ./scripts/run_distributed_pretrain_ascend_msrun.sh --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/data_dir/ --hccl_config /path/hccl.json --cmd_file ./distributed_cmd.sh ``` ```bash -bash scripts/run_distributed_pretrain_ascend.sh /path/cn-wiki-128 /path/hccl.json +bash scripts/run_distributed_pretrain_ascend_msrun.sh /path/cn-wiki-128 /path/hccl.json ``` The command above will run in the background, you can view training logs in pretraining_log.txt. After training finished, you will get some checkpoint files under the LOG* folder by default. The loss value will be displayed as follows: diff --git a/official/nlp/Bert/README_CN.md b/official/nlp/Bert/README_CN.md index 9e2705c2d..ccafe08f4 100644 --- a/official/nlp/Bert/README_CN.md +++ b/official/nlp/Bert/README_CN.md @@ -166,7 +166,7 @@ bash scripts/run_standalone_pretrain_ascend.sh 0 1 /path/cn-wiki-128 # 分布式运行预训练示例 -bash scripts/run_distributed_pretrain_ascend.sh /path/cn-wiki-128 /path/hccl.json +bash scripts/run_distributed_pretrain_ascend_msrun.sh /path/cn-wiki-128 /path/hccl.json # 单独运行训练评估脚本示例 @@ -240,10 +240,10 @@ bash scripts/run_distributed_pretrain_for_gpu.sh 8 40 /path/cn-wiki-128 # (5) 执行a或b # a. 在 /{path}/bert/default_config.yaml 文件中设置参数 # 1. 设置 ”enable_modelarts=True“ - # 2. 
设置其它参数,其它参数配置可以参考 `./scripts/run_distributed_pretrain_ascend.sh` + # 2. 设置其它参数,其它参数配置可以参考 `./scripts/run_distributed_pretrain_ascend_msrun.sh` # b. 在 网页上设置 # 1. 添加 ”run_distributed=True“ - # 2. 添加其它参数,其它参数配置可以参考 `./scripts/run_distributed_pretrain_ascend.sh` + # 2. 添加其它参数,其它参数配置可以参考 `./scripts/run_distributed_pretrain_ascend_msrun.sh` # (6) 上传你的 数据 到 s3 桶上 # (7) 在网页上勾选数据存储位置,设置“训练数据集”路径 # (8) 在网页上设置“训练输出文件路径”、“作业日志路径” @@ -692,11 +692,11 @@ epoch: 0.0, current epoch percent: 0.000, step: 2, outputs are (Tensor(shape=[1] 在多卡运行之前,您可以按以下操作生成distributed_cmd.sh: ```python -python scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py --run_script_dir ./scripts/run_distributed_pretrain_ascend.sh --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/data_dir/ --hccl_config /path/hccl.json --cmd_file ./distributed_cmd.sh +python scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py --run_script_dir ./scripts/run_distributed_pretrain_ascend_msrun.sh --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/data_dir/ --hccl_config /path/hccl.json --cmd_file ./distributed_cmd.sh ``` ```bash -bash scripts/run_distributed_pretrain_ascend.sh /path/cn-wiki-128 /path/hccl.json +bash scripts/run_distributed_pretrain_ascend_msrun.sh /path/cn-wiki-128 /path/hccl.json ``` 以上命令后台运行,您可以在pretraining_log.txt中查看训练日志。训练结束后,您可以在默认LOG*文件夹下找到检查点文件,得到如下损失值: -- Gitee