diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
index b39fda0a8fed7460cfaaf7183111895166894a43..722b82f64979b4f8c6eafc1322bbf3e0fd0ad43c 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
@@ -104,3 +104,17 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
   --output_dir ./output                        # output path
 ```
 
+### Q&A
+
+1. Q: What should I do if the first run fails with an error like "xxx **socket timeout** xxx"?
+
+   A: On the first run the tokenizer preprocesses the corpus. Depending on the size of your dataset this can take a while, and if it takes too long the HCCL communication may time out. In that case, raise the timeout threshold (in seconds; the default is 600) with the following environment variable:
+
+   ```
+   export HCCL_CONNECT_TIMEOUT=3600
+   ```
+
+
+
+
+
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/requirements.txt b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/requirements.txt
index bc4f8bc394880bc697253f715dc3d57812b58503..b264df8cf0b24f0f569c84e284573e9d5995e392 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/requirements.txt
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/requirements.txt
@@ -4,3 +4,4 @@ tokenizers
 sentencepiece != 0.1.92
 protobuf
 wikiextractor
+scikit-learn
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
index b341e36238d53ccaacda2399a4ba667a56d7fb88..39bb0946eabdbee1cf73926bc8e98e242bba599f 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
@@ -1075,7 +1075,7 @@ class BertModel(BertPreTrainedModel):
             return_dict=return_dict,
         )
         sequence_output = encoder_outputs[0]
-        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+        pooled_output = self.pooler(sequence_output.view(bs, from_seq_len, -1)) if self.pooler is not None else None
 
         if not return_dict:
             return (sequence_output, pooled_output) + encoder_outputs[1:]
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/trainer.py b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/trainer.py
index 640bb2dbcf7cf2f93b32bbac2dfc20a3f780266c..2ade6810fc9602e71c3c9f7324a7449ce2bb8680 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/trainer.py
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/trainer.py
@@ -1051,8 +1051,11 @@ class Trainer:
             return model
 
         # Mixed precision training with apex (torch < 1.6)
-        if self.use_apex and training:
-            model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level, loss_scale=self.args.loss_scale, combine_grad=self.args.use_combine_grad)
+        if self.use_apex:
+            if training:
+                model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level, loss_scale=self.args.loss_scale, combine_grad=self.args.use_combine_grad)
+            elif self.optimizer is None:
+                model = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level, loss_scale=self.args.loss_scale, combine_grad=self.args.use_combine_grad)
 
         # Multi-gpu training (should be after apex fp16 initialization)
         if self.args.n_gpu > 1:
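For reference, the HCCL timeout documented in the README hunk can also be raised from inside the launch script instead of the shell. A minimal sketch, assuming the variable is set before `torch.distributed` (and therefore HCCL) is initialized:

```python
import os

# Equivalent to `export HCCL_CONNECT_TIMEOUT=3600`, assuming this runs before
# process-group / HCCL initialization. 3600 is only an example value; pick one
# large enough to cover the one-off tokenizer preprocessing on your dataset.
os.environ.setdefault("HCCL_CONNECT_TIMEOUT", "3600")
```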
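A note on the `modeling_bert.py` hunk: the NPU-adapted encoder is assumed to hand back `sequence_output` with the batch and sequence dimensions flattened, so the patch views it back to `(batch, seq_len, hidden)` before the pooler reads the first (`[CLS]`) position. `bs` and `from_seq_len` are assumed to be computed earlier in the adapted `forward()`. A shape-only sketch with hypothetical sizes:

```python
import torch

# Hypothetical sizes; bs and from_seq_len stand in for the values the adapted
# BertModel.forward is assumed to derive from input_shape.
bs, from_seq_len, hidden = 2, 128, 768

# Encoder output assumed to arrive flattened as (bs * seq_len, hidden).
sequence_output = torch.randn(bs * from_seq_len, hidden)

# Restore (batch, seq_len, hidden) so the pooler can take position 0 per sample.
pooler_input = sequence_output.view(bs, from_seq_len, -1)
first_token = pooler_input[:, 0]  # what BertPooler feeds its dense layer
print(pooler_input.shape, first_token.shape)  # (2, 128, 768) and (2, 768)
```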
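On the `trainer.py` hunk: splitting the condition lets an evaluation-only run, where no optimizer has been built yet, still be wrapped by apex; when `amp.initialize` is given no optimizer it returns only the model, which is why that branch assigns a single value. A sketch of the control flow, assuming Ascend's apex fork (stock apex has no `combine_grad` argument) and a hypothetical standalone helper rather than the actual `Trainer` method:

```python
from apex import amp  # Ascend apex fork assumed; combine_grad is not in stock apex


def wrap_with_amp(model, optimizer, training, fp16_opt_level="O2",
                  loss_scale=None, use_combine_grad=False):
    """Mirrors the patched branch (helper name is illustrative)."""
    if training:
        # Training path: apex patches both objects and returns (model, optimizer).
        model, optimizer = amp.initialize(
            model, optimizer, opt_level=fp16_opt_level,
            loss_scale=loss_scale, combine_grad=use_combine_grad)
    elif optimizer is None:
        # Evaluation-only path: no optimizer exists, so only the model comes back.
        model = amp.initialize(
            model, optimizer, opt_level=fp16_opt_level,
            loss_scale=loss_scale, combine_grad=use_combine_grad)
    return model, optimizer
```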