diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
index 722b82f64979b4f8c6eafc1322bbf3e0fd0ad43c..3aeaaf928a18e13657685e043a3cbc110d64b7f1 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
@@ -108,12 +108,20 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
 
 1. Q: What should I do if the first run fails with an error like "xxx **socket timeout** xxx"?
 
-   A: On the first run the tokenizer preprocesses the corpus, which can take a long time depending on your dataset size; if it takes too long, it may trigger an HCCL communication timeout. In that case, set the following environment variable to a larger timeout threshold (in seconds, default 600):
+   A: On the first run the tokenizer preprocesses the corpus, which can take a long time depending on your dataset size; if it takes too long, it may trigger a wait timeout. You can try to resolve this by raising the timeout threshold:
+
+   (1) Raise the built-in PyTorch process-group timeout: set distributed_process_group_timeout (in seconds) in the launch script to a larger value, e.g. 7200:
 
    ```
-   export HCCL_CONNECT_TIMEOUT=3600
+--distributed_process_group_timeout 7200
    ```
-
+
+   (2) Raise the HCCL link-setup timeout: increase the value of the HCCL_CONNECT_TIMEOUT environment variable (in seconds) in env.sh:
+
+   ```
+   export HCCL_CONNECT_TIMEOUT=7200
+   ```
+
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh
index 8ba618f127884eecf35325df27baff70e4d29bfc..d72034c89be7c6f845761c844078c7388b30060a 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/env.sh
@@ -44,6 +44,7 @@ export DYNAMIC_OP="ADD#MUL"
 #HCCL whitelist switch, 1-disable/0-enable
 export HCCL_WHITELIST_DISABLE=1
 export HCCL_IF_IP=$(hostname -I |awk '{print $1}')
+export HCCL_CONNECT_TIMEOUT=5400
 
 #Set the device-side log level to error
 ${install_path}/driver/tools/msnpureport -g error -d 0
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh
index 99d52812eee9a3581f32fa7529e0ad5518c0ffde..66737acdf3ec99fd80bffe4c20416c10c33ede29 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertbase_8p.sh
@@ -22,4 +22,5 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
     --loss_scale 8192 \
     --use_combine_grad \
     --optim adamw_apex_fused_npu \
+    --distributed_process_group_timeout 5400 \
     --output_dir ./output
\ No newline at end of file
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh
index afa52ae7deeb94599b4e6db9a942ecee4f2f737f..6d81435b1e326360ee9bcfd35c26d5d081e05767 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/run_mlm_bertlarge_8p.sh
@@ -22,4 +22,5 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
     --loss_scale 8192 \
     --use_combine_grad \
     --optim adamw_apex_fused_npu \
+    --distributed_process_group_timeout 5400 \
     --output_dir ./output
\ No newline at end of file
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py
index dbad5e03e0f567ecf19a20dd251c5ddc9911661d..09dee52967b4ff73053626691813908ae5a4ab62 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/training_args.py
@@ -16,6 +16,7 @@ import contextlib
 import json
 import math
 import os
+import datetime
 import warnings
 from dataclasses import asdict, dataclass, field
 from enum import Enum
@@ -710,6 +711,13 @@ class TrainingArguments:
             "`DistributedDataParallel`."
         },
     )
+    distributed_process_group_timeout: Optional[int] = field(
+        default=1800,
+        metadata={
+            "help": "Timeout (in seconds) for operations executed against the process group; the value is passed as the "
+            "`timeout` argument to `init_process_group`."
+        },
+    )
     dataloader_pin_memory: bool = field(
         default=True, metadata={"help": "Whether or not to pin memory for DataLoader."}
     )
@@ -1076,7 +1084,7 @@ class TrainingArguments:
         else:
             # Here, we'll use torch.distributed.
             # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
-            torch.distributed.init_process_group(backend="hccl")
+            torch.distributed.init_process_group(backend="hccl", timeout=datetime.timedelta(seconds=self.distributed_process_group_timeout))
             device = torch.device("npu", self.local_rank)
             self._n_gpu = 1
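
For reference, a minimal sketch of how the new `--distributed_process_group_timeout` value flows into process-group initialization. The standalone helper below is illustrative only (it is not part of the patch) and assumes an Ascend environment where torch_npu provides the `hccl` backend:

```python
import datetime

import torch.distributed as dist


def init_process_group_with_timeout(timeout_seconds: int = 5400, backend: str = "hccl") -> None:
    """Illustrative helper mirroring the patched TrainingArguments behaviour.

    The timeout bounds how long the initial rendezvous and collective setup may
    block, e.g. while rank 0 tokenizes a large dataset before the other ranks
    join, instead of hitting the field's default 1800-second limit.
    """
    # Assumes the usual torch.distributed.launch environment variables
    # (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT) are already set, and that
    # the "hccl" backend is available (Ascend NPU with torch_npu installed).
    dist.init_process_group(
        backend=backend,
        timeout=datetime.timedelta(seconds=timeout_seconds),
    )


if __name__ == "__main__":
    # 5400 matches the value wired into run_mlm_bertbase_8p.sh / run_mlm_bertlarge_8p.sh.
    init_process_group_with_timeout(5400)
```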