From 16c41861033b524b994d8e0aaccb98b497f17f89 Mon Sep 17 00:00:00 2001
From: lijing
Date: Sun, 24 Apr 2022 18:48:03 +0800
Subject: [PATCH] Add a configurable timeout parameter for distributed
 computation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../nlp/Bert_Chinese_for_PyTorch/README.cn.md       | 14 +++++++++++---
 .../built-in/nlp/Bert_Chinese_for_PyTorch/env.sh    |  1 +
 .../run_mlm_bertbase_8p.sh                          |  1 +
 .../run_mlm_bertlarge_8p.sh                         |  1 +
 .../transformers/src/transformers/training_args.py  | 10 +++++++++-
 5 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
index 722b82f649..3aeaaf928a 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
@@ -108,12 +108,20 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
 
 1. Q: What should I do if the first run fails with an error like "xxx **socket timeout** xxx"?
 
-   A: On the first run, the tokenizer preprocesses the words. How long this takes depends on the size of your dataset; if it takes too long, it may cause an HCCL communication timeout. In that case, you can set the following environment variable to raise the timeout threshold (in seconds, default 600):
+   A: On the first run, the tokenizer preprocesses the words. How long this takes depends on the size of your dataset; if it takes too long, the wait may time out. You can try to resolve this by raising the timeout thresholds:
+
+   (1) Raise the timeout built into the PyTorch framework: set distributed_process_group_timeout (in seconds) in the launch script to a larger value, e.g. 7200:
+
    ```
-   export HCCL_CONNECT_TIMEOUT=3600
+   --distributed_process_group_timeout 7200
    ```
-
+
+   (2) Raise the HCCL connection-setup timeout: change the value of the HCCL_CONNECT_TIMEOUT environment variable (in seconds) in env.sh:
+
+   ```
+   export HCCL_CONNECT_TIMEOUT=7200
+   ```
+
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh
index 8ba618f127..d72034c89b 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh
@@ -44,6 +44,7 @@ export DYNAMIC_OP="ADD#MUL"
 # HCCL whitelist switch: 1 = disabled / 0 = enabled
 export HCCL_WHITELIST_DISABLE=1
 export HCCL_IF_IP=$(hostname -I |awk '{print $1}')
+export HCCL_CONNECT_TIMEOUT=5400
 
 # Set the device-side log level to error
 ${install_path}/driver/tools/msnpureport -g error -d 0
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh
index 99d52812ee..66737acdf3 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh
@@ -22,4 +22,5 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
     --loss_scale 8192 \
     --use_combine_grad \
     --optim adamw_apex_fused_npu \
+    --distributed_process_group_timeout 5400 \
     --output_dir ./output
\ No newline at end of file
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh
index afa52ae7de..6d81435b1e 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh
@@ -22,4 +22,5 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
     --loss_scale 8192 \
     --use_combine_grad \
     --optim adamw_apex_fused_npu \
+    --distributed_process_group_timeout 5400 \
     --output_dir ./output
\ No newline at end of file
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py
index dbad5e03e0..09dee52967 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py
@@ -16,6 +16,7 @@ import contextlib
 import json
 import math
 import os
+import datetime
 import warnings
 from dataclasses import asdict, dataclass, field
 from enum import Enum
@@ -710,6 +711,13 @@ class TrainingArguments:
             "`DistributedDataParallel`."
         },
     )
+    distributed_process_group_timeout: Optional[int] = field(
+        default=1800,
+        metadata={
+            "help": "Timeout (in seconds) for operations executed against the process group; passed as the `timeout` "
+            "argument to `init_process_group`."
+        },
+    )
     dataloader_pin_memory: bool = field(
         default=True, metadata={"help": "Whether or not to pin memory for DataLoader."}
     )
@@ -1076,7 +1084,7 @@ class TrainingArguments:
         else:
             # Here, we'll use torch.distributed.
             # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
-            torch.distributed.init_process_group(backend="hccl")
+            torch.distributed.init_process_group(backend="hccl", timeout=datetime.timedelta(seconds=self.distributed_process_group_timeout))
             device = torch.device("npu", self.local_rank)
             self._n_gpu = 1
-- 
Gitee
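
For reference, a minimal standalone sketch of the mechanism this patch wires up, assuming an Ascend build of PyTorch where the `hccl` backend and `npu` device type are registered (via the torch_npu plugin). `torch.distributed.init_process_group` takes its timeout as a `datetime.timedelta`, so the integer flag only needs converting at the call site; the argparse plumbing below is illustrative, not the Trainer's actual `HfArgumentParser` path:

```
import argparse
import datetime

import torch


def init_distributed(timeout_seconds: int, local_rank: int) -> torch.device:
    # init_process_group accepts the timeout as a datetime.timedelta;
    # collectives that exceed it fail instead of hanging indefinitely.
    torch.distributed.init_process_group(
        backend="hccl",  # assumes the Ascend torch_npu plugin is installed
        timeout=datetime.timedelta(seconds=timeout_seconds),
    )
    # Mirrors the patched TrainingArguments code path: one NPU per process.
    return torch.device("npu", local_rank)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Same flag name and 1800 s default that the patch adds to TrainingArguments.
    parser.add_argument("--distributed_process_group_timeout", type=int, default=1800)
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()
    device = init_distributed(args.distributed_process_group_timeout, args.local_rank)
```

Keeping the flag a plain integer and converting to `timedelta` only at the call site leaves the CLI surface simple, and the 1800-second default matches `torch.distributed`'s own 30-minute default, so behavior is unchanged unless the flag is set explicitly.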