diff --git a/configs/general/llm_finetune_dense_template.yaml b/configs/general/llm_finetune_dense_template.yaml index 2daf15914795da206dcc76abeceb02234347bca9..7fb37b72ed2fb47dc79e2ad93e6edb186dd26af0 100644 --- a/configs/general/llm_finetune_dense_template.yaml +++ b/configs/general/llm_finetune_dense_template.yaml @@ -87,8 +87,6 @@ parallel_config: context_parallel: 1 # Set the number of sequence parallel use_seq_parallel: True # Corresponding to Megatron Short Sequence Parallelism micro_batch_num: 2 # Set the pipeline parallel microbatch size, which should satisfy `parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage` when `parallel_config.pipeline_stage` is greater than 1 - vocab_emb_dp: False # Shard embedding in model parallel or data parallel. If True, the embedding lookup -# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. micro_batch_interleave_num: 1 # Set the size of the interleave micro batch data in each step of the training. This parameter is used to calculate the actual loss value # callbacks diff --git a/configs/general/llm_finetune_moe_template.yaml b/configs/general/llm_finetune_moe_template.yaml index 8379a68cd31ce2a37bd95f01e6ed0b145ca42dd3..bcab079acb6a52d5d0ca67e7401fe2fda5da27d9 100644 --- a/configs/general/llm_finetune_moe_template.yaml +++ b/configs/general/llm_finetune_moe_template.yaml @@ -101,8 +101,6 @@ parallel_config: context_parallel: 1 # Set the number of sequence parallel use_seq_parallel: True # Corresponding to Megatron Short Sequence Parallelism micro_batch_num: 2 # Set the pipeline parallel microbatch size, which should satisfy `parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage` when `parallel_config.pipeline_stage` is greater than 1 - vocab_emb_dp: False # Shard embedding in model parallel or data parallel. 
If True, the embedding lookup -# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. micro_batch_interleave_num: 1 # Set the size of the interleave micro batch data in each step of the training. This parameter is used to calculate the actual loss value # callbacks diff --git a/configs/general/llm_pretrain_dense_template.yaml b/configs/general/llm_pretrain_dense_template.yaml index 6eafb3dab0e97b52779165c4e735d6f554673f98..77bc4b30c29492f64a8da5d42e2132dbebfe860e 100644 --- a/configs/general/llm_pretrain_dense_template.yaml +++ b/configs/general/llm_pretrain_dense_template.yaml @@ -91,8 +91,6 @@ parallel_config: context_parallel: 1 # Set the number of sequence parallel use_seq_parallel: True # Corresponding to Megatron Short Sequence Parallelism micro_batch_num: 2 # Set the pipeline parallel microbatch size, which should satisfy `parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage` when `parallel_config.pipeline_stage` is greater than 1 - vocab_emb_dp: False # Shard embedding in model parallel or data parallel. If True, the embedding lookup -# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. micro_batch_interleave_num: 1 # Set the size of the interleave micro batch data in each step of the training. 
This parameter is used to calculate the actual loss value # callbacks diff --git a/configs/general/llm_pretrain_moe_template.yaml b/configs/general/llm_pretrain_moe_template.yaml index 4d03a999cb2318ede14b27b9799cfd0ececd6083..05866361108d7bc7886e1ea8546c26174873d24c 100644 --- a/configs/general/llm_pretrain_moe_template.yaml +++ b/configs/general/llm_pretrain_moe_template.yaml @@ -105,8 +105,6 @@ parallel_config: context_parallel: 1 # Set the number of sequence parallel use_seq_parallel: True # Corresponding to Megatron Short Sequence Parallelism micro_batch_num: 2 # Set the pipeline parallel microbatch size, which should satisfy `parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage` when `parallel_config.pipeline_stage` is greater than 1 - vocab_emb_dp: False # Shard embedding in model parallel or data parallel. If True, the embedding lookup -# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. micro_batch_interleave_num: 1 # Set the size of the interleave micro batch data in each step of the training. 
This parameter is used to calculate the actual loss value # callbacks diff --git a/configs/general/run_general_task.yaml b/configs/general/run_general_task.yaml index 0086c02da29770d0438b5f0352a075b0d98e59c7..bf4c79887e7e784755eeb361e761c1f99809961b 100644 --- a/configs/general/run_general_task.yaml +++ b/configs/general/run_general_task.yaml @@ -46,7 +46,6 @@ parallel_config: pipeline_stage: 1 use_seq_parallel: False micro_batch_num: 1 - vocab_emb_dp: True gradient_aggregation_group: 4 micro_batch_interleave_num: 1 diff --git a/configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml b/configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml index fcf03a6dfb4ae7e03d260205b870415fe587c571..bc398dea778984f5bafb9064d02031efe4ad727c 100644 --- a/configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml +++ b/configs/qwen3_moe/pretrain_qwen3_30b_a3b_4k.yaml @@ -101,7 +101,6 @@ parallel_config: pipeline_stage: 4 micro_batch_num: &micro_batch_num 16 expert_parallel: 4 - vocab_emb_dp: True use_seq_parallel: True gradient_aggregation_group: 1 # when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
diff --git a/mindformers/core/parallel_config.py b/mindformers/core/parallel_config.py index bf67e7812fee88f0d36d11d1d8b2adc666c36702..726ef5b89caee899c8c2b96aaef51210ecdc217d 100644 --- a/mindformers/core/parallel_config.py +++ b/mindformers/core/parallel_config.py @@ -61,14 +61,14 @@ def build_parallel_config(config): if not isinstance(config.parallel_config, TransformerOpParallelConfig): logger.info("initial parallel_config from dict: %s", config.parallel_config) if config.parallel_config.auto_parallel or config.parallel_config.pipeline_stage > 1: - logger.info("pipeline_stage = %s > 1, vocab_emd_dp will be reset to False.", + logger.warning("pipeline_stage = %s > 1, vocab_emb_dp will be reset to False.", config.parallel_config.pipeline_stage) config.parallel_config.vocab_emb_dp = False _set_rp_matmul_mem_coef(config.parallel_config.get('mem_coeff', 0.1)) if config.parallel_config.context_parallel_algo and \ config.parallel_config.context_parallel_algo == "hybird_cp": - logger.warning(f"context_parallel_algo `hybird_cp` will not take effect in later versions, " - f"and will be replaced by `hybrid_cp` in the new version.") + logger.warning("context_parallel_algo `hybird_cp` will not take effect in later versions, " + "and will be replaced by `hybrid_cp` in the new version.") config.parallel_config = TransformerOpParallelConfig(recompute=config.recompute_config, swap=config.swap_config, **config.parallel_config) diff --git a/mindformers/parallel_core/model_parallel_config.py b/mindformers/parallel_core/model_parallel_config.py index 54ce51219a4deb1183d343cb9c46acb466480762..2b31f3090b2a946b0d6ac7b83cec905239a5ed95 100644 --- a/mindformers/parallel_core/model_parallel_config.py +++ b/mindformers/parallel_core/model_parallel_config.py @@ -93,7 +93,11 @@ class ModelParallelConfig: """ vocab_emb_dp: Optional[bool] = False - """Whether to split the vocabulary only along the dp dimension. 
Default: True.""" + """ + Whether to split the vocabulary only along the dp dimension. + This setting is not supported to be configured as True at present; + otherwise, it will be converted to False automatically. + """ ################### # Training diff --git a/mindformers/parallel_core/training_graph/tensor_parallel/layers.py b/mindformers/parallel_core/training_graph/tensor_parallel/layers.py index 76c3e45ef6d16f648d31be8c3ddd7d76c60547a8..9979dabc3d40b36546225f6204766d4df2ee8029 100644 --- a/mindformers/parallel_core/training_graph/tensor_parallel/layers.py +++ b/mindformers/parallel_core/training_graph/tensor_parallel/layers.py @@ -106,6 +106,8 @@ class VocabParallelEmbedding(nn.Cell): self.weight = Parameter(init_method([self.num_embeddings, self.embedding_dim]), name="weight") self.embedding_morph = P.Morph( self.embedding_func, embedding_infer_shape, embedding_infer_dtype).add_prim_attr("self_define_shard", True) + self.enable_embedding_tp = self.tp > 1 and self.num_embeddings % self.tp == 0 + self.reshape = Reshape() self.config = config if _get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,) and _is_sharding_propagation(): @@ -144,7 +146,7 @@ class VocabParallelEmbedding(nn.Cell): bs, seq_len = input_.shape _, hidden = weight.shape input_ = self.reshape(input_, (bs * seq_len,)) - if self.tp > 1: + if self.enable_embedding_tp: # Build the mask. # Mask the input. input_ = input_ - self.vocab_start_index masked_input = self.relu(input_) @@ -157,7 +159,7 @@ class VocabParallelEmbedding(nn.Cell): output_parallel = mint.nn.functional.embedding(masked_input, weight) # Mask the output embedding. 
- if self.tp > 1: + if self.enable_embedding_tp: input_mask = input_mask.expand_dims(-1) output_parallel = ops.mul(output_parallel, input_mask) @@ -175,10 +177,10 @@ class VocabParallelEmbedding(nn.Cell): output = output.reshape(bs, -1, hidden) return output - if self.tp == 1: - output = output_parallel - else: + if self.enable_embedding_tp: output = ops.AllReduce(group=self.group)(output_parallel) + else: + output = output_parallel return output @@ -189,24 +191,18 @@ class VocabParallelEmbedding(nn.Cell): else: out_strategy = layout("dp", "cp", "None") - if config.vocab_emb_dp or (self.num_embeddings % self.tp != 0): - self.embedding_morph.shard( - in_strategy=( - layout("dp", "cp",), - layout("None", "None"), - ), - out_strategy=( - out_strategy, - ) - ) + if self.enable_embedding_tp: + embedding_strategy = layout("tp", "None") else: - self.embedding_morph.shard( - in_strategy=( - layout("dp", "cp"), - layout("tp", "None"), - ), - out_strategy=(out_strategy,), - ) + embedding_strategy = layout("None", "None") + + self.embedding_morph.shard( + in_strategy=( + layout("dp", "cp"), + embedding_strategy, + ), + out_strategy=(out_strategy,), + ) def sharding_propagation(self, config: TransformerConfig): pass @@ -258,7 +254,7 @@ class ColumnParallelLinear(nn.Cell): transpose_b: bool = True, bias_init: Callable = None ): - super(ColumnParallelLinear, self).__init__() + super().__init__() if gather_output: raise NotImplementedError("For ColumnParallelLinear, `gather_output` is not supported for now") if stride > 1: @@ -470,7 +466,7 @@ class RowParallelLinear(nn.Cell): transpose_b: bool = True, bias_init: Callable = None ): - super(RowParallelLinear, self).__init__() + super().__init__() if input_is_parallel: raise NotImplementedError("For RowParallelLinear, `input_is_parallel` is not supported for now") if stride > 1: diff --git a/mindformers/parallel_core/transformer_config.py b/mindformers/parallel_core/transformer_config.py index 
fa8238769caa54643a25c32f95f0ceec52da8e02..c0e0ddd14cf2e46b8aaf4e81f28111073cf8775d 100644 --- a/mindformers/parallel_core/transformer_config.py +++ b/mindformers/parallel_core/transformer_config.py @@ -435,6 +435,10 @@ class TransformerConfig(ModelParallelConfig, MFModelConfig): if not isinstance(self.attention_dropout, float) or not 0 <= self.attention_dropout < 1: raise ValueError(f"attention_dropout should be a float within [0, 1), but get {self.attention_dropout}.") + if self.vocab_emb_dp: + logger.warning("vocab_emb_dp is not supported in MCore, it will be converted to False automatically.") + self.vocab_emb_dp = False + if self.pad_token_id is None: self.pad_token_id = 0