From 16c41861033b524b994d8e0aaccb98b497f17f89 Mon Sep 17 00:00:00 2001
From: lijing
Date: Sun, 24 Apr 2022 18:48:03 +0800
Subject: [PATCH] Add a configurable timeout parameter for distributed
 computation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../nlp/Bert_Chinese_for_PyTorch/README.cn.md       | 14 +++++++++++---
 .../built-in/nlp/Bert_Chinese_for_PyTorch/env.sh    |  1 +
 .../run_mlm_bertbase_8p.sh                          |  1 +
 .../run_mlm_bertlarge_8p.sh                         |  1 +
 .../transformers/src/transformers/training_args.py  | 10 +++++++++-
 5 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
index 722b82f649..3aeaaf928a 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
@@ -108,12 +108,20 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
 
 1. Q: What should I do if the first run fails with an error like "xxx **socket timeout** xxx"?
 
-   A: On the first run, the tokenizer preprocesses the words. How long this takes depends on the size of your dataset; if it takes too long, it may cause an HCCL communication timeout. In that case, you can set the following environment variable to raise the timeout threshold (in seconds, default 600):
+   A: On the first run, the tokenizer preprocesses the words. How long this takes depends on the size of your dataset; if it takes too long, the wait may time out. You can try to resolve this by raising the timeout thresholds:
+
+   (1) Raise the timeout built into the PyTorch framework: set distributed_process_group_timeout (in seconds) in the launch script to a larger value, e.g. 7200:
+
    ```
-   export HCCL_CONNECT_TIMEOUT=3600
+   --distributed_process_group_timeout 7200
    ```
-
+
+   (2) Raise the HCCL connection-setup timeout: change the value of the HCCL_CONNECT_TIMEOUT environment variable (in seconds) in env.sh:
+
+   ```
+   export HCCL_CONNECT_TIMEOUT=7200
+   ```
+
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh
index 8ba618f127..d72034c89b 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh
@@ -44,6 +44,7 @@ export DYNAMIC_OP="ADD#MUL"
 # HCCL whitelist switch: 1 = disabled / 0 = enabled
 export HCCL_WHITELIST_DISABLE=1
 export HCCL_IF_IP=$(hostname -I |awk '{print $1}')
+export HCCL_CONNECT_TIMEOUT=5400
 
 # Set the device-side log level to error
 ${install_path}/driver/tools/msnpureport -g error -d 0
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh
index 99d52812ee..66737acdf3 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh
@@ -22,4 +22,5 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
     --loss_scale 8192 \
     --use_combine_grad \
     --optim adamw_apex_fused_npu \
+    --distributed_process_group_timeout 5400 \
     --output_dir ./output
\ No newline at end of file
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh
index afa52ae7de..6d81435b1e 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh
@@ -22,4 +22,5 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
     --loss_scale 8192 \
     --use_combine_grad \
     --optim adamw_apex_fused_npu \
+    --distributed_process_group_timeout 5400 \
     --output_dir ./output
\ No newline at end of file
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py
index dbad5e03e0..09dee52967 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py
@@ -16,6 +16,7 @@ import contextlib
 import json
 import math
 import os
+import datetime
 import warnings
 from dataclasses import asdict, dataclass, field
 from enum import Enum
@@ -710,6 +711,13 @@ class TrainingArguments:
             "`DistributedDataParallel`."
         },
     )
+    distributed_process_group_timeout: Optional[int] = field(
+        default=1800,
+        metadata={
+            "help": "Timeout (in seconds) for operations executed against the process group; passed as the `timeout` "
+            "argument to `init_process_group`."
+        },
+    )
     dataloader_pin_memory: bool = field(
         default=True, metadata={"help": "Whether or not to pin memory for DataLoader."}
     )
@@ -1076,7 +1084,7 @@ class TrainingArguments:
         else:
             # Here, we'll use torch.distributed.
             # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
-            torch.distributed.init_process_group(backend="hccl")
+            torch.distributed.init_process_group(backend="hccl", timeout=datetime.timedelta(seconds=self.distributed_process_group_timeout))
             device = torch.device("npu", self.local_rank)
             self._n_gpu = 1
-- 
Gitee
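
For reference, a minimal standalone sketch of the mechanism this patch wires up, assuming an Ascend build of PyTorch where the `hccl` backend and `npu` device type are registered (via the torch_npu plugin). `torch.distributed.init_process_group` takes its timeout as a `datetime.timedelta`, so the integer flag only needs converting at the call site; the argparse plumbing below is illustrative, not the Trainer's actual `HfArgumentParser` path:

```
import argparse
import datetime

import torch


def init_distributed(timeout_seconds: int, local_rank: int) -> torch.device:
    # init_process_group accepts the timeout as a datetime.timedelta;
    # collectives that exceed it fail instead of hanging indefinitely.
    torch.distributed.init_process_group(
        backend="hccl",  # assumes the Ascend torch_npu plugin is installed
        timeout=datetime.timedelta(seconds=timeout_seconds),
    )
    # Mirrors the patched TrainingArguments code path: one NPU per process.
    return torch.device("npu", local_rank)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Same flag name and 1800 s default that the patch adds to TrainingArguments.
    parser.add_argument("--distributed_process_group_timeout", type=int, default=1800)
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()
    device = init_distributed(args.distributed_process_group_timeout, args.local_rank)
```

Keeping the flag a plain integer and converting to `timedelta` only at the call site leaves the CLI surface simple, and the 1800-second default matches `torch.distributed`'s own 30-minute default, so behavior is unchanged unless the flag is set explicitly.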