diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
index 722b82f64979b4f8c6eafc1322bbf3e0fd0ad43c..3aeaaf928a18e13657685e043a3cbc110d64b7f1 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
@@ -108,12 +108,20 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
 
 1. Q: What should I do if the first run fails with an error like "xxx **socket timeout** xxx"?
 
-   A: On the first run the tokenizer preprocesses the corpus, which can take a long time depending on your dataset size; if it takes too long, it may trigger an HCCL communication timeout. In that case, set the following environment variable to a larger timeout threshold (in seconds, default 600):
+   A: On the first run the tokenizer preprocesses the corpus, which can take a long time depending on your dataset size; if it takes too long, it may trigger a wait timeout. You can try to resolve this by raising the timeout threshold:
+
+   (1) Raise the built-in PyTorch process-group timeout: set distributed_process_group_timeout (in seconds) in the launch script to a larger value, e.g. 7200:
 
    ```
-   export HCCL_CONNECT_TIMEOUT=3600
+--distributed_process_group_timeout 7200
    ```
-
+
+   (2) Raise the HCCL link-setup timeout: increase the value of the HCCL_CONNECT_TIMEOUT environment variable (in seconds) in env.sh:
+
+   ```
+   export HCCL_CONNECT_TIMEOUT=7200
+   ```
+
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh
index 8ba618f127884eecf35325df27baff70e4d29bfc..d72034c89be7c6f845761c844078c7388b30060a 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh
@@ -44,6 +44,7 @@ export DYNAMIC_OP="ADD#MUL"
 #HCCL whitelist switch, 1-disable/0-enable
 export HCCL_WHITELIST_DISABLE=1
 export HCCL_IF_IP=$(hostname -I |awk '{print $1}')
+export HCCL_CONNECT_TIMEOUT=5400
 
 #Set the device-side log level to error
 ${install_path}/driver/tools/msnpureport -g error -d 0
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh
index 99d52812eee9a3581f32fa7529e0ad5518c0ffde..66737acdf3ec99fd80bffe4c20416c10c33ede29 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh
@@ -22,4 +22,5 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
     --loss_scale 8192 \
     --use_combine_grad \
     --optim adamw_apex_fused_npu \
+    --distributed_process_group_timeout 5400 \
     --output_dir ./output
\ No newline at end of file
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh
index afa52ae7deeb94599b4e6db9a942ecee4f2f737f..6d81435b1e326360ee9bcfd35c26d5d081e05767 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh
@@ -22,4 +22,5 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
     --loss_scale 8192 \
     --use_combine_grad \
     --optim adamw_apex_fused_npu \
+    --distributed_process_group_timeout 5400 \
     --output_dir ./output
\ No newline at end of file
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py
index dbad5e03e0f567ecf19a20dd251c5ddc9911661d..09dee52967b4ff73053626691813908ae5a4ab62 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py
@@ -16,6 +16,7 @@ import contextlib
 import json
 import math
 import os
+import datetime
 import warnings
 from dataclasses import asdict, dataclass, field
 from enum import Enum
@@ -710,6 +711,13 @@ class TrainingArguments:
             "`DistributedDataParallel`."
         },
     )
+    distributed_process_group_timeout: Optional[int] = field(
+        default=1800,
+        metadata={
+            "help": "Timeout (in seconds) for operations executed against the process group; the value is passed as the "
+            "`timeout` argument to `init_process_group`."
+        },
+    )
     dataloader_pin_memory: bool = field(
         default=True, metadata={"help": "Whether or not to pin memory for DataLoader."}
     )
@@ -1076,7 +1084,7 @@ class TrainingArguments:
         else:
             # Here, we'll use torch.distributed.
             # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
-            torch.distributed.init_process_group(backend="hccl")
+            torch.distributed.init_process_group(backend="hccl", timeout=datetime.timedelta(seconds=self.distributed_process_group_timeout))
             device = torch.device("npu", self.local_rank)
             self._n_gpu = 1
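
For reference, a minimal sketch of how the new `--distributed_process_group_timeout` value flows into process-group initialization. The standalone helper below is illustrative only (it is not part of the patch) and assumes an Ascend environment where torch_npu provides the `hccl` backend:

```python
import datetime

import torch.distributed as dist


def init_process_group_with_timeout(timeout_seconds: int = 5400, backend: str = "hccl") -> None:
    """Illustrative helper mirroring the patched TrainingArguments behaviour.

    The timeout bounds how long the initial rendezvous and collective setup may
    block, e.g. while rank 0 tokenizes a large dataset before the other ranks
    join, instead of hitting the field's default 1800-second limit.
    """
    # Assumes the usual torch.distributed.launch environment variables
    # (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT) are already set, and that
    # the "hccl" backend is available (Ascend NPU with torch_npu installed).
    dist.init_process_group(
        backend=backend,
        timeout=datetime.timedelta(seconds=timeout_seconds),
    )


if __name__ == "__main__":
    # 5400 matches the value wired into run_mlm_bertbase_8p.sh / run_mlm_bertlarge_8p.sh.
    init_process_group_with_timeout(5400)
```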