diff --git a/mindformers/parallel_core/training_graph/device_matrix.py b/mindformers/parallel_core/training_graph/device_matrix.py index d1cb1bc0a87f2af02057d0eec8204c1b0bafae15..bbb1b3f710d40d90abc0e8827a70a3878dcfea37 100644 --- a/mindformers/parallel_core/training_graph/device_matrix.py +++ b/mindformers/parallel_core/training_graph/device_matrix.py @@ -170,7 +170,7 @@ class MoeLayoutManager(LayoutManager): moe_comm_group = { "cp_dp": ("cp", "dp_ex_ep", "ep"), "dp": ("dp_ex_ep", "ep"), - "dp_cp": ("dp_ex_ep", "ep", "cp"), + "dp_cp": ("dp_ex_ep", "ep"), } if self._layout is None: @@ -223,10 +223,11 @@ class MoeLayoutManager(LayoutManager): parallel_config = self.get_parallel_config(config) dp = parallel_config['dp'] * parallel_config['tp'] ep = parallel_config['ep'] - dp_ex_ep = dp // ep + cp = parallel_config['cp'] + dp_ex_ep = dp // ep * cp - dev_mat = (dp_ex_ep, ep, parallel_config['cp']) - self._layout = Layout(dev_mat, ("dp_ex_ep", "ep", "cp")) + dev_mat = (dp_ex_ep, ep) + self._layout = Layout(dev_mat, ("dp_ex_ep", "ep")) self._layout_type = "dp_cp_tp_ep" return self._layout diff --git a/mindformers/parallel_core/training_graph/tensor_parallel/layers.py b/mindformers/parallel_core/training_graph/tensor_parallel/layers.py index 5b1c4d91387673cefe2ab654c74e64ea82a529f7..76c3e45ef6d16f648d31be8c3ddd7d76c60547a8 100644 --- a/mindformers/parallel_core/training_graph/tensor_parallel/layers.py +++ b/mindformers/parallel_core/training_graph/tensor_parallel/layers.py @@ -117,7 +117,7 @@ class VocabParallelEmbedding(nn.Cell): def _init_embedding_rearrangement(self): """embedding rearrangement""" self.rank_id = get_rank() - self.tensor_model_parallel_rank = self.rank_id // self.cp % self.tp + self.tensor_model_parallel_rank = self.rank_id % self.tp ( self.vocab_start_index, self.vocab_end_index, diff --git a/tests/st/test_multi_cards_cases/test_model/test_deepseek3/run_deepseek3.py b/tests/st/test_multi_cards_cases/test_model/test_deepseek3/run_deepseek3.py index 254e783a33ef55dabd4e8976ff1bf90cd6fb3bf7..8ed5574511121ab9cd98dc0cdad7135be3ae0562 100644 --- a/tests/st/test_multi_cards_cases/test_model/test_deepseek3/run_deepseek3.py +++ b/tests/st/test_multi_cards_cases/test_model/test_deepseek3/run_deepseek3.py @@ -219,7 +219,6 @@ def parallel_train_pp2_mp2_ep2_zbv(): TEST_MAP = { - 'parallel_train_dp2_mp2_cp2_ep2': parallel_train_dp2_mp2_cp2_ep2, 'parallel_train_dp2_pp2_ep2_tnd': parallel_train_dp2_pp2_ep2_tnd, "parallel_train_dp2_mp2_ep2_calculate_per_token_loss_and_print_seperate_loss": parallel_train_dp2_mp2_ep2_calculate_per_token_loss_and_print_seperate_loss, diff --git a/tests/st/test_multi_cards_cases/test_model/test_deepseek3/test_deepseek3_train.py b/tests/st/test_multi_cards_cases/test_model/test_deepseek3/test_deepseek3_train.py index b8c533b9de30c0b2d1f13f395e83f592a57124a3..7f015166e6930372b900ca0a0af981fd5d57d345 100644 --- a/tests/st/test_multi_cards_cases/test_model/test_deepseek3/test_deepseek3_train.py +++ b/tests/st/test_multi_cards_cases/test_model/test_deepseek3/test_deepseek3_train.py @@ -17,7 +17,6 @@ import os from multiprocessing.pool import Pool from pathlib import Path import random -import pytest from mindformers.tools.logger import logger from tests.st.test_multi_cards_cases.utils import TaskType @@ -52,7 +51,6 @@ class TestDeepseekV3: self.run_script_path = self.sh_path / "run_deepseek3.py" assert self.run_script_path.exists(), f"Run script not found: {self.run_script_path}" - @pytest.mark.level0 def test_eight_card_configurations(self): """Test eight cards for DeepseekV3.""" port_id = int(os.environ.get("ASCEND_PORT_ID", random.randint(50000, 65535)))