diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
index b39fda0a8fed7460cfaaf7183111895166894a43..722b82f64979b4f8c6eafc1322bbf3e0fd0ad43c 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/README.cn.md
@@ -104,3 +104,17 @@ python3 -m torch.distributed.launch --nproc_per_node 8 run_mlm.py \
   --output_dir ./output                        # output path
 ```
 
+### Q&A
+
+1. Q: What should I do if the first run fails with an error like "xxx **socket timeout** xxx"?
+
+   A: On the first run the tokenizer preprocesses the corpus. Depending on the size of your dataset this can take a while, and if it takes too long the HCCL communication may time out. In that case, raise the timeout threshold (in seconds; the default is 600) with the following environment variable:
+
+   ```
+   export HCCL_CONNECT_TIMEOUT=3600
+   ```
+
+
+
+
+
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/requirements.txt b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/requirements.txt
index bc4f8bc394880bc697253f715dc3d57812b58503..b264df8cf0b24f0f569c84e284573e9d5995e392 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/requirements.txt
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/requirements.txt
@@ -4,3 +4,4 @@ tokenizers
 sentencepiece != 0.1.92
 protobuf
 wikiextractor
+scikit-learn
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
index b341e36238d53ccaacda2399a4ba667a56d7fb88..39bb0946eabdbee1cf73926bc8e98e242bba599f 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
@@ -1075,7 +1075,7 @@ class BertModel(BertPreTrainedModel):
             return_dict=return_dict,
         )
         sequence_output = encoder_outputs[0]
-        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+        pooled_output = self.pooler(sequence_output.view(bs, from_seq_len, -1)) if self.pooler is not None else None
 
         if not return_dict:
             return (sequence_output, pooled_output) + encoder_outputs[1:]
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/trainer.py b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/trainer.py
index 640bb2dbcf7cf2f93b32bbac2dfc20a3f780266c..2ade6810fc9602e71c3c9f7324a7449ce2bb8680 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/trainer.py
+++ b/PyTorch/built-in/nlp/Bert_Chinese_for_PyTorch/transformers/src/transformers/trainer.py
@@ -1051,8 +1051,11 @@ class Trainer:
             return model
 
         # Mixed precision training with apex (torch < 1.6)
-        if self.use_apex and training:
-            model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level, loss_scale=self.args.loss_scale, combine_grad=self.args.use_combine_grad)
+        if self.use_apex:
+            if training:
+                model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level, loss_scale=self.args.loss_scale, combine_grad=self.args.use_combine_grad)
+            elif self.optimizer is None:
+                model = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level, loss_scale=self.args.loss_scale, combine_grad=self.args.use_combine_grad)
 
         # Multi-gpu training (should be after apex fp16 initialization)
         if self.args.n_gpu > 1:
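For reference, the HCCL timeout documented in the README hunk can also be raised from inside the launch script instead of the shell. A minimal sketch, assuming the variable is set before `torch.distributed` (and therefore HCCL) is initialized:

```python
import os

# Equivalent to `export HCCL_CONNECT_TIMEOUT=3600`, assuming this runs before
# process-group / HCCL initialization. 3600 is only an example value; pick one
# large enough to cover the one-off tokenizer preprocessing on your dataset.
os.environ.setdefault("HCCL_CONNECT_TIMEOUT", "3600")
```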
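A note on the `modeling_bert.py` hunk: the NPU-adapted encoder is assumed to hand back `sequence_output` with the batch and sequence dimensions flattened, so the patch views it back to `(batch, seq_len, hidden)` before the pooler reads the first (`[CLS]`) position. `bs` and `from_seq_len` are assumed to be computed earlier in the adapted `forward()`. A shape-only sketch with hypothetical sizes:

```python
import torch

# Hypothetical sizes; bs and from_seq_len stand in for the values the adapted
# BertModel.forward is assumed to derive from input_shape.
bs, from_seq_len, hidden = 2, 128, 768

# Encoder output assumed to arrive flattened as (bs * seq_len, hidden).
sequence_output = torch.randn(bs * from_seq_len, hidden)

# Restore (batch, seq_len, hidden) so the pooler can take position 0 per sample.
pooler_input = sequence_output.view(bs, from_seq_len, -1)
first_token = pooler_input[:, 0]  # what BertPooler feeds its dense layer
print(pooler_input.shape, first_token.shape)  # (2, 128, 768) and (2, 768)
```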
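On the `trainer.py` hunk: splitting the condition lets an evaluation-only run, where no optimizer has been built yet, still be wrapped by apex; when `amp.initialize` is given no optimizer it returns only the model, which is why that branch assigns a single value. A sketch of the control flow, assuming Ascend's apex fork (stock apex has no `combine_grad` argument) and a hypothetical standalone helper rather than the actual `Trainer` method:

```python
from apex import amp  # Ascend apex fork assumed; combine_grad is not in stock apex


def wrap_with_amp(model, optimizer, training, fp16_opt_level="O2",
                  loss_scale=None, use_combine_grad=False):
    """Mirrors the patched branch (helper name is illustrative)."""
    if training:
        # Training path: apex patches both objects and returns (model, optimizer).
        model, optimizer = amp.initialize(
            model, optimizer, opt_level=fp16_opt_level,
            loss_scale=loss_scale, combine_grad=use_combine_grad)
    elif optimizer is None:
        # Evaluation-only path: no optimizer exists, so only the model comes back.
        model = amp.initialize(
            model, optimizer, opt_level=fp16_opt_level,
            loss_scale=loss_scale, combine_grad=use_combine_grad)
    return model, optimizer
```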