diff --git a/nlp/language_model/bert/pytorch/README.md b/nlp/language_model/bert/pytorch/README.md
index 050689b3fb7aef8dda3ef65c6cfc1bd7c04efa0c..bec27fdfdf000bd62b4d2428ef7b63426e0c6419 100644
--- a/nlp/language_model/bert/pytorch/README.md
+++ b/nlp/language_model/bert/pytorch/README.md
@@ -25,7 +25,11 @@ Reference: [training_results_v1.0](https://github.com/mlcommons/training_results
 ### Install Dependencies
 
 ```shell
-bash init.sh
+apt install -y git numactl
+pip install h5py
+pip install psutil
+pip install mlperf-logging
+pip install boto3
 ```
 
 ## Model Training
diff --git a/nlp/language_model/bert/pytorch/optim/distributed_fused_lamb.py b/nlp/language_model/bert/pytorch/optim/distributed_fused_lamb.py
index 150d9e3d0e7035b70dd284953ac239bbb7696b67..0b72aeb76a139da15e8627ab3f6a77ec30093b30 100644
--- a/nlp/language_model/bert/pytorch/optim/distributed_fused_lamb.py
+++ b/nlp/language_model/bert/pytorch/optim/distributed_fused_lamb.py
@@ -43,7 +43,7 @@ def _pipeline_block_reductions_patched(self, block_id):
         rs_stream.wait_stream(torch.cuda.current_stream())
         rs_stream.wait_stream(self._l2_grad_norm_st)
         with torch.cuda.stream(rs_stream):
-            works[chunk_id] = torch.distributed.reduce_scatter(self._fp16_g_chunks[block_id][chunk_id],self._flat_grads_shards[block_id][chunk_id],group=self._rs_pg[glob_chunk_id%self._num_rs_pg],async_op=True,no_copy=True)
+            works[chunk_id] = torch.distributed.reduce_scatter(self._fp16_g_chunks[block_id][chunk_id],self._flat_grads_shards[block_id][chunk_id],group=self._rs_pg[glob_chunk_id%self._num_rs_pg],async_op=True)
 
     # Reduction across nodes for each rank
     if self._num_groups > 1:
@@ -118,6 +118,6 @@ def _pipeline_step_patched(self):
                 self._contrib_weight_decay,
                 global_grad_norm,
                 self._use_nvlamb)
 
-        torch.distributed.all_gather(self._new_params_mega_shards, self._fp16_p, group=self._ag_pg[0], no_copy=True)
+        torch.distributed.all_gather(self._new_params_mega_shards, self._fp16_p, group=self._ag_pg[0])
 
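
The optimizer change drops the `no_copy=True` keyword, which stock `torch.distributed.reduce_scatter` and `torch.distributed.all_gather` do not accept (it is an extension found in patched/apex-style builds), so passing it on a standard PyTorch install raises a `TypeError`. Below is a minimal sketch, not the repo's code, of the standard call signatures the patch falls back to; it assumes an NCCL process group with one GPU per rank (e.g. launched via `torchrun --nproc_per_node=<N>`), and the tensor names are made up for illustration.

```python
# Sketch of stock torch.distributed reduce_scatter / all_gather usage.
# Assumes NCCL backend and one CUDA device per rank.
import torch
import torch.distributed as dist

def main():
    dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    torch.cuda.set_device(rank)

    # reduce_scatter(output, input_list, ...): each rank contributes a list of
    # world_size shards and receives the element-wise sum of the shards at its
    # own index. Stock PyTorch has no `no_copy` keyword here.
    shards = [torch.full((4,), float(rank), device="cuda") for _ in range(world_size)]
    out = torch.empty(4, device="cuda")
    work = dist.reduce_scatter(out, shards, async_op=True)  # returns a Work handle
    work.wait()

    # all_gather(tensor_list, tensor, ...): every rank receives every rank's tensor.
    gathered = [torch.empty(4, device="cuda") for _ in range(world_size)]
    dist.all_gather(gathered, out)

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
```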