From 20574adb3f27d4c583387a4491aefa906b969735 Mon Sep 17 00:00:00 2001
From: lzy0920232
Date: Tue, 14 Oct 2025 14:52:24 +0800
Subject: [PATCH] code_bugfix_initialize

---
 mindformers/parallel_core/inference/parallel_state.py | 11 ++++++++---
 .../training_graph/base_models/gpt/gpt_model.py       |  3 ++-
 .../test_transformer/test_mlp/run_mlp.py              |  2 +-
 .../run_shared_expert_mlp.py                          |  2 +-
 .../run_row_parallel_linear.py                        |  2 +-
 .../run_row_parallel_linear_with_lora.py              |  2 +-
 .../test_self_attention/run_self_attention.py         |  2 +-
 .../run_self_attention_megatron.py                    |  2 +-
 .../test_shared_experts/run_shared_expert_mlp.py      |  2 +-
 .../run_multi_latent_attention.py                     |  2 +-
 .../run_multi_token_prediction.py                     |  2 +-
 .../test_transformer_block/run_transformer_block.py   |  2 +-
 .../test_transformer_layer/run_transformer_layer.py   |  2 +-
 13 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/mindformers/parallel_core/inference/parallel_state.py b/mindformers/parallel_core/inference/parallel_state.py
index d779013ce..d7c7def20 100644
--- a/mindformers/parallel_core/inference/parallel_state.py
+++ b/mindformers/parallel_core/inference/parallel_state.py
@@ -243,7 +243,8 @@ def _valid_parallel_config(tensor_model_parallel_size: int = 1,
                            expert_model_parallel_size: int = 1,
                            context_parallel_size: Optional[int] = 1,
                            num_virtual_instance: int = 1,
-                           order: str = "tp-ep-pp-dp",) -> None:
+                           order: str = "tp-ep-pp-dp",
+                           is_train: bool = False) -> None:
     """ Validate the model parallel configuration. """
     world_size = get_group_size()
     total_model_size = data_parallel_size * tensor_model_parallel_size \
@@ -269,6 +270,8 @@ def _valid_parallel_config(tensor_model_parallel_size: int = 1,
         raise RuntimeError("order can not be empty.")
     if len(set(order_list)) != len(order_list):
         raise RuntimeError(f"Duplicate elements in order ({order}).")
+    if not is_train and context_parallel_size != 1:
+        raise ValueError(f"Context parallel is not supported for inference yet, but got cp={context_parallel_size}.")


 def initialize_moe_model_parallel(expert_model_parallel_size: int,
@@ -352,7 +355,8 @@ def initialize_model_parallel(tensor_model_parallel_size: int = 1,
                               expert_model_parallel_size: int = 1,
                               context_parallel_size: Optional[int] = 1,
                               num_virtual_instance: int = 1,
-                              order: str = "tp-ep-pp-dp",) -> None:
+                              order: str = "tp-ep-pp-dp",
+                              is_train: bool = False) -> None:
     """Initialize model data parallel groups.

     Args:
@@ -400,7 +404,8 @@ def initialize_model_parallel(tensor_model_parallel_size: int = 1,
                               expert_model_parallel_size=expert_model_parallel_size,
                               context_parallel_size=context_parallel_size,
                               num_virtual_instance=num_virtual_instance,
-                              order=order)
+                              order=order,
+                              is_train=is_train)

     rank_id = get_rank()
     world_size = get_group_size()
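To make the new behavior concrete, here is a standalone sketch that mirrors the added guard. check_context_parallel is a hypothetical helper for illustration only; in the patch the logic lives inside _valid_parallel_config and is reached through initialize_model_parallel:

    # Hypothetical standalone reproduction of the guard added above.
    def check_context_parallel(context_parallel_size: int, is_train: bool = False) -> None:
        # The inference path (is_train=False) rejects any cp other than 1.
        if not is_train and context_parallel_size != 1:
            raise ValueError(f"Context parallel is not supported for inference yet, "
                             f"but got cp={context_parallel_size}.")

    try:
        check_context_parallel(2)  # default is_train=False: raises ValueError
    except ValueError as err:
        print(err)

    check_context_parallel(2, is_train=True)  # training path: no error

With the default is_train=False, inference callers keep the old signature and now fail fast on an unsupported cp, while training callers (below) opt out explicitly.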
diff --git a/mindformers/parallel_core/training_graph/base_models/gpt/gpt_model.py b/mindformers/parallel_core/training_graph/base_models/gpt/gpt_model.py
index 3dd3357ed..faf83dad8 100644
--- a/mindformers/parallel_core/training_graph/base_models/gpt/gpt_model.py
+++ b/mindformers/parallel_core/training_graph/base_models/gpt/gpt_model.py
@@ -267,7 +267,8 @@ class GPTModel(nn.Cell):
         self.cp = config.context_parallel_size if config.context_parallel_size is not None else 1
         initialize_model_parallel(tensor_model_parallel_size=self.tp, data_parallel_size=self.dp,
-                                  pipeline_model_parallel_size=self.pp, context_parallel_size=self.cp)
+                                  pipeline_model_parallel_size=self.pp, context_parallel_size=self.cp,
+                                  is_train=True)

         self.preprocess_labels_and_masks = PreprocessLabelsAndMasks(config)

diff --git a/tests/st/test_multi_cards_cases/test_parallel_core/test_train_graph/test_transformer/test_mlp/run_mlp.py b/tests/st/test_multi_cards_cases/test_parallel_core/test_train_graph/test_transformer/test_mlp/run_mlp.py
index 2dd5ca1ca..0da4a0ae1 100644
--- a/tests/st/test_multi_cards_cases/test_parallel_core/test_train_graph/test_transformer/test_mlp/run_mlp.py
+++ b/tests/st/test_multi_cards_cases/test_parallel_core/test_train_graph/test_transformer/test_mlp/run_mlp.py
@@ -67,7 +67,7 @@ class TestModel(nn.Cell):
         self.pp = self.config.pipeline_model_parallel_size \
             if self.config.pipeline_model_parallel_size is not None else 1
         initialize_model_parallel(tensor_model_parallel_size=self.tp, data_parallel_size=self.dp,
-                                  pipeline_model_parallel_size=self.pp)
+                                  pipeline_model_parallel_size=self.pp, is_train=True)

         layout.init_layout(self.config)

diff --git a/tests/st/test_multi_cards_cases/test_parallel_core/test_train_graph/test_transformer/test_moe/test_shared_experts_parallel/run_shared_expert_mlp.py b/tests/st/test_multi_cards_cases/test_parallel_core/test_train_graph/test_transformer/test_moe/test_shared_experts_parallel/run_shared_expert_mlp.py
index 638ec8d52..783881f76 100644
--- a/tests/st/test_multi_cards_cases/test_parallel_core/test_train_graph/test_transformer/test_moe/test_shared_experts_parallel/run_shared_expert_mlp.py
+++ b/tests/st/test_multi_cards_cases/test_parallel_core/test_train_graph/test_transformer/test_moe/test_shared_experts_parallel/run_shared_expert_mlp.py
@@ -68,7 +68,7 @@ class TestModel(nn.Cell):
         self.pp = self.config.pipeline_model_parallel_size \
             if self.config.pipeline_model_parallel_size is not None else 1
         initialize_model_parallel(tensor_model_parallel_size=self.tp, data_parallel_size=self.dp,
-                                  pipeline_model_parallel_size=self.pp)
+                                  pipeline_model_parallel_size=self.pp, is_train=True)

         layout.init_layout(self.config)
         self.mlp = SharedExpertMLP(config=config, submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear,
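Every remaining hunk applies the same one-line change to a training-graph test runner: each runner resolves tp/dp/pp from its config, then initializes parallel state, and must now declare that it is on the training path. The shared pattern, excerpted from the diffs below (sketch only; the self.* names belong to the runner classes):

    # Pattern installed in each training test runner below: pass
    # is_train=True so the inference-only context-parallel guard in
    # _valid_parallel_config does not fire.
    initialize_model_parallel(tensor_model_parallel_size=self.tp,
                              data_parallel_size=self.dp,
                              pipeline_model_parallel_size=self.pp,
                              is_train=True)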
diff --git a/tests/st/test_ut/test_parallel_core/test_training_graph/test_tensor_parallel/test_row_parallel_linear_train/run_row_parallel_linear.py b/tests/st/test_ut/test_parallel_core/test_training_graph/test_tensor_parallel/test_row_parallel_linear_train/run_row_parallel_linear.py
index f199358ed..00b7b4524 100644
--- a/tests/st/test_ut/test_parallel_core/test_training_graph/test_tensor_parallel/test_row_parallel_linear_train/run_row_parallel_linear.py
+++ b/tests/st/test_ut/test_parallel_core/test_training_graph/test_tensor_parallel/test_row_parallel_linear_train/run_row_parallel_linear.py
@@ -74,7 +74,7 @@ class RowParallelLinearRunner:
         self.pp = self.config.pipeline_model_parallel_size \
             if self.config.pipeline_model_parallel_size is not None else 1
         initialize_model_parallel(tensor_model_parallel_size=self.tp, data_parallel_size=self.dp,
-                                  pipeline_model_parallel_size=self.pp)
+                                  pipeline_model_parallel_size=self.pp, is_train=True)

         layout.init_layout(self.config)

diff --git a/tests/st/test_ut/test_parallel_core/test_training_graph/test_tensor_parallel/test_row_parallel_linear_with_lora/run_row_parallel_linear_with_lora.py b/tests/st/test_ut/test_parallel_core/test_training_graph/test_tensor_parallel/test_row_parallel_linear_with_lora/run_row_parallel_linear_with_lora.py
index d6c91c5c4..bbf9c8ce2 100644
--- a/tests/st/test_ut/test_parallel_core/test_training_graph/test_tensor_parallel/test_row_parallel_linear_with_lora/run_row_parallel_linear_with_lora.py
+++ b/tests/st/test_ut/test_parallel_core/test_training_graph/test_tensor_parallel/test_row_parallel_linear_with_lora/run_row_parallel_linear_with_lora.py
@@ -75,7 +75,7 @@ class RowParallelLinearWithLoRARunner:
         self.pp = self.config.pipeline_model_parallel_size \
             if self.config.pipeline_model_parallel_size is not None else 1
         initialize_model_parallel(tensor_model_parallel_size=self.tp, data_parallel_size=self.dp,
-                                  pipeline_model_parallel_size=self.pp)
+                                  pipeline_model_parallel_size=self.pp, is_train=True)

         layout.init_layout(self.config)

diff --git a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_self_attention/run_self_attention.py b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_self_attention/run_self_attention.py
index 59d904086..3b7b0ce10 100644
--- a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_self_attention/run_self_attention.py
+++ b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_self_attention/run_self_attention.py
@@ -169,7 +169,7 @@ class SelfAttentionRunner:
         self.pp = self.config.pipeline_model_parallel_size \
             if self.config.pipeline_model_parallel_size is not None else 1
         initialize_model_parallel(tensor_model_parallel_size=self.tp, data_parallel_size=self.dp,
-                                  pipeline_model_parallel_size=self.pp)
+                                  pipeline_model_parallel_size=self.pp, is_train=True)

         layout.init_layout(self.config)

diff --git a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_self_attention/run_self_attention_megatron.py b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_self_attention/run_self_attention_megatron.py
index 1ba00acef..27c06bc35 100644
--- a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_self_attention/run_self_attention_megatron.py
+++ b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_self_attention/run_self_attention_megatron.py
@@ -97,7 +97,7 @@ class SelfAttentionMegatronRunner:
         self.pp = self.config.pipeline_model_parallel_size \
             if self.config.pipeline_model_parallel_size is not None else 1
         initialize_model_parallel(tensor_model_parallel_size=self.tp, data_parallel_size=self.dp,
-                                  pipeline_model_parallel_size=self.pp)
+                                  pipeline_model_parallel_size=self.pp, is_train=True)

         layout.init_layout(self.config)
diff --git a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_moe/test_shared_experts/run_shared_expert_mlp.py b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_moe/test_shared_experts/run_shared_expert_mlp.py
index ed5191040..4fffeeb75 100644
--- a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_moe/test_shared_experts/run_shared_expert_mlp.py
+++ b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_moe/test_shared_experts/run_shared_expert_mlp.py
@@ -69,7 +69,7 @@ class TestModel(nn.Cell):
         self.pp = self.config.pipeline_model_parallel_size \
             if self.config.pipeline_model_parallel_size is not None else 1
         initialize_model_parallel(tensor_model_parallel_size=self.tp, data_parallel_size=self.dp,
-                                  pipeline_model_parallel_size=self.pp)
+                                  pipeline_model_parallel_size=self.pp, is_train=True)

         layout.init_layout(self.config)

diff --git a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_multi_latent_attention/run_multi_latent_attention.py b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_multi_latent_attention/run_multi_latent_attention.py
index 47fc3d2b6..4420ec4dc 100644
--- a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_multi_latent_attention/run_multi_latent_attention.py
+++ b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_multi_latent_attention/run_multi_latent_attention.py
@@ -80,7 +80,7 @@ class MLARunner:
         self.pp = self.config.pipeline_model_parallel_size \
             if self.config.pipeline_model_parallel_size is not None else 1
         initialize_model_parallel(tensor_model_parallel_size=self.tp, data_parallel_size=self.dp,
-                                  pipeline_model_parallel_size=self.pp)
+                                  pipeline_model_parallel_size=self.pp, is_train=True)

         layout.init_layout(self.config)

diff --git a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_multi_token_prediction/run_multi_token_prediction.py b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_multi_token_prediction/run_multi_token_prediction.py
index ab10ec0ea..eebc8aa9d 100644
--- a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_multi_token_prediction/run_multi_token_prediction.py
+++ b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_multi_token_prediction/run_multi_token_prediction.py
@@ -85,7 +85,7 @@ class MTPRunner:
         self.pp = self.config.pipeline_model_parallel_size \
             if self.config.pipeline_model_parallel_size is not None else 1
         initialize_model_parallel(tensor_model_parallel_size=self.tp, data_parallel_size=self.dp,
-                                  pipeline_model_parallel_size=self.pp)
+                                  pipeline_model_parallel_size=self.pp, is_train=True)

         layout.init_layout(self.config)

diff --git a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_block/run_transformer_block.py b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_block/run_transformer_block.py
index 1dba5fb28..9d39a40c5 100644
--- a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_block/run_transformer_block.py
+++ b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_block/run_transformer_block.py
@@ -111,7 +111,7 @@ class TransformerLayerRunner:
         self.pp = self.config.pipeline_model_parallel_size \
             if self.config.pipeline_model_parallel_size is not None else 1
         initialize_model_parallel(tensor_model_parallel_size=self.tp, data_parallel_size=self.dp,
-                                  pipeline_model_parallel_size=self.pp)
+                                  pipeline_model_parallel_size=self.pp, is_train=True)

         layout.init_layout(self.config)
diff --git a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_layer/run_transformer_layer.py b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_layer/run_transformer_layer.py
index 9041cfd02..a615b77b1 100644
--- a/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_layer/run_transformer_layer.py
+++ b/tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_layer/run_transformer_layer.py
@@ -105,7 +105,7 @@ class TransformerLayerRunner:
         self.pp = self.config.pipeline_model_parallel_size \
             if self.config.pipeline_model_parallel_size is not None else 1
         initialize_model_parallel(tensor_model_parallel_size=self.tp, data_parallel_size=self.dp,
-                                  pipeline_model_parallel_size=self.pp)
+                                  pipeline_model_parallel_size=self.pp, is_train=True)

         layout.init_layout(self.config)
--
Gitee