From 1f0266430e8f90596ae1ae2bd63365e88d33a57c Mon Sep 17 00:00:00 2001
From: lijing
Date: Tue, 19 Apr 2022 10:49:44 +0800
Subject: [PATCH 1/2] Update README and fix the BertPooler dimension mismatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../nlp/Bert_Chinese_for_PyTorch/README.cn.md      | 14 ++++++++++++++
 .../nlp/Bert_Chinese_for_PyTorch/requirements.txt  |  1 +
 .../src/transformers/models/bert/modeling_bert.py  |  2 +-
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
index b39fda0a8f..722b82f649 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
@@ -104,3 +104,17 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
     --output_dir ./output  # output save path
 ```
 
+### Q&A
+
+1. Q: What should I do if the first run fails with an error like "xxx **socket timeout** xxx"?
+
+   A: On the first run the tokenizer preprocesses the corpus, and how long this takes depends on the size of your dataset. If it takes too long, the HCCL communication may time out. In that case, set the following environment variable to a larger timeout threshold (in seconds; the default is 600):
+
+   ```
+   export HCCL_CONNECT_TIMEOUT=3600
+   ```
+
+
+
+
+
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/requirements.txt b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/requirements.txt
index bc4f8bc394..b264df8cf0 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/requirements.txt
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/requirements.txt
@@ -4,3 +4,4 @@ tokenizers
 sentencepiece != 0.1.92
 protobuf
 wikiextractor
+sklearn
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
index b341e36238..39bb0946ea 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
@@ -1075,7 +1075,7 @@ class BertModel(BertPreTrainedModel):
             return_dict=return_dict,
         )
         sequence_output = encoder_outputs[0]
-        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+        pooled_output = self.pooler(sequence_output.view(bs, from_seq_len, -1)) if self.pooler is not None else None
 
         if not return_dict:
             return (sequence_output, pooled_output) + encoder_outputs[1:]
--
Gitee
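The one-line change to `modeling_bert.py` above only matters if the NPU-adapted encoder hands back hidden states flattened to `[batch*seq_len, hidden]`, while `BertPooler` expects `[batch, seq_len, hidden]` and pools the first ([CLS]) token of each sequence. Below is a minimal sketch of that expectation; the `Pooler` class is a simplified stand-in for `BertPooler`, and `bs`/`from_seq_len` are assumed to be the batch-size and sequence-length variables of the same names used earlier in the patched `forward()`:

```python
import torch
import torch.nn as nn

class Pooler(nn.Module):
    """Simplified stand-in for transformers' BertPooler: dense + tanh over the first token."""
    def __init__(self, hidden_size: int):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Expects [batch, seq_len, hidden]; takes the [CLS] position of every sequence.
        first_token = hidden_states[:, 0]
        return self.activation(self.dense(first_token))

bs, from_seq_len, hidden = 4, 128, 768   # assumed shapes; names mirror the patched line
pooler = Pooler(hidden)

# If the encoder output comes back flattened to [batch*seq_len, hidden],
# indexing [:, 0] would no longer select one [CLS] vector per sample,
# so the patch restores the 3-D layout before pooling:
flat_sequence_output = torch.randn(bs * from_seq_len, hidden)
pooled_output = pooler(flat_sequence_output.view(bs, from_seq_len, -1))
print(pooled_output.shape)  # torch.Size([4, 768])
```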
From 99f45fff0e9123a7ae4c9c4fe0740d875fd57d6a Mon Sep 17 00:00:00 2001
From: lijing
Date: Tue, 19 Apr 2022 13:12:09 +0800
Subject: [PATCH 2/2] Fix the mixed-precision (apex) initialization logic for eval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../transformers/src/transformers/trainer.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/trainer.py b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/trainer.py
index 640bb2dbcf..2ade6810fc 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/trainer.py
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/trainer.py
@@ -1051,8 +1051,11 @@ class Trainer:
             return model
 
         # Mixed precision training with apex (torch < 1.6)
-        if self.use_apex and training:
-            model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level, loss_scale=self.args.loss_scale, combine_grad=self.args.use_combine_grad)
+        if self.use_apex:
+            if training:
+                model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level, loss_scale=self.args.loss_scale, combine_grad=self.args.use_combine_grad)
+            elif self.optimizer is None:
+                model = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level, loss_scale=self.args.loss_scale, combine_grad=self.args.use_combine_grad)
 
         # Multi-gpu training (should be after apex fp16 initialization)
         if self.args.n_gpu > 1:
--
Gitee
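For reference, the split introduced above mirrors how apex `amp.initialize` is normally used: during training the model and optimizer are wrapped together, while an eval-only run has no optimizer and only the model is returned. The sketch below assumes upstream NVIDIA apex on a CUDA device; the `combine_grad` argument in the diff comes from the apex build shipped with this repository and is omitted here.

```python
import torch
from apex import amp  # assumes NVIDIA apex is installed

model = torch.nn.Linear(768, 2).cuda()
training = True  # toggled the same way Trainer passes `training` into its wrap step

if training:
    # Training: wrap model and optimizer together so the scaled loss can drive backward().
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=128.0)
else:
    # Eval only: no optimizer to wrap; amp.initialize returns just the casted model.
    model = amp.initialize(model, opt_level="O2", loss_scale=128.0)
```

In the patched `trainer.py`, the eval branch passes `self.optimizer` explicitly (which is `None` at that point), which apex treats the same as omitting the optimizer, so only the model is returned.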