From f63f848ff25b533739e23f4eeb550a7e66e410c9 Mon Sep 17 00:00:00 2001
From: zhangyihuiben
Date: Tue, 7 Oct 2025 17:23:34 +0800
Subject: [PATCH] [master][mcore][bugfix] Fix the precision issue caused by cp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../parallel_core/training_graph/device_matrix.py     | 9 +++++----
 .../training_graph/tensor_parallel/layers.py           | 2 +-
 .../test_model/test_deepseek3/run_deepseek3.py         | 1 -
 .../test_model/test_deepseek3/test_deepseek3_train.py  | 2 --
 4 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/mindformers/parallel_core/training_graph/device_matrix.py b/mindformers/parallel_core/training_graph/device_matrix.py
index d1cb1bc0a..bbb1b3f71 100644
--- a/mindformers/parallel_core/training_graph/device_matrix.py
+++ b/mindformers/parallel_core/training_graph/device_matrix.py
@@ -170,7 +170,7 @@ class MoeLayoutManager(LayoutManager):
         moe_comm_group = {
             "cp_dp": ("cp", "dp_ex_ep", "ep"),
             "dp": ("dp_ex_ep", "ep"),
-            "dp_cp": ("dp_ex_ep", "ep", "cp"),
+            "dp_cp": ("dp_ex_ep", "ep"),
         }
 
         if self._layout is None:
@@ -223,10 +223,11 @@ class MoeLayoutManager(LayoutManager):
         parallel_config = self.get_parallel_config(config)
         dp = parallel_config['dp'] * parallel_config['tp']
         ep = parallel_config['ep']
-        dp_ex_ep = dp // ep
+        cp = parallel_config['cp']
+        dp_ex_ep = dp // ep * cp
 
-        dev_mat = (dp_ex_ep, ep, parallel_config['cp'])
-        self._layout = Layout(dev_mat, ("dp_ex_ep", "ep", "cp"))
+        dev_mat = (dp_ex_ep, ep)
+        self._layout = Layout(dev_mat, ("dp_ex_ep", "ep"))
         self._layout_type = "dp_cp_tp_ep"
 
         return self._layout
diff --git a/mindformers/parallel_core/training_graph/tensor_parallel/layers.py b/mindformers/parallel_core/training_graph/tensor_parallel/layers.py
index 5b1c4d913..76c3e45ef 100644
--- a/mindformers/parallel_core/training_graph/tensor_parallel/layers.py
+++ b/mindformers/parallel_core/training_graph/tensor_parallel/layers.py
@@ -117,7 +117,7 @@ class VocabParallelEmbedding(nn.Cell):
     def _init_embedding_rearrangement(self):
         """embedding rearrangement"""
         self.rank_id = get_rank()
-        self.tensor_model_parallel_rank = self.rank_id // self.cp % self.tp
+        self.tensor_model_parallel_rank = self.rank_id % self.tp
         (
             self.vocab_start_index,
             self.vocab_end_index,
diff --git a/tests/st/test_multi_cards_cases/test_model/test_deepseek3/run_deepseek3.py b/tests/st/test_multi_cards_cases/test_model/test_deepseek3/run_deepseek3.py
index 254e783a3..8ed557451 100644
--- a/tests/st/test_multi_cards_cases/test_model/test_deepseek3/run_deepseek3.py
+++ b/tests/st/test_multi_cards_cases/test_model/test_deepseek3/run_deepseek3.py
@@ -219,7 +219,6 @@ def parallel_train_pp2_mp2_ep2_zbv():
 
 
 TEST_MAP = {
-    'parallel_train_dp2_mp2_cp2_ep2': parallel_train_dp2_mp2_cp2_ep2,
     'parallel_train_dp2_pp2_ep2_tnd': parallel_train_dp2_pp2_ep2_tnd,
     "parallel_train_dp2_mp2_ep2_calculate_per_token_loss_and_print_seperate_loss":
         parallel_train_dp2_mp2_ep2_calculate_per_token_loss_and_print_seperate_loss,
diff --git a/tests/st/test_multi_cards_cases/test_model/test_deepseek3/test_deepseek3_train.py b/tests/st/test_multi_cards_cases/test_model/test_deepseek3/test_deepseek3_train.py
index b8c533b9d..7f015166e 100644
--- a/tests/st/test_multi_cards_cases/test_model/test_deepseek3/test_deepseek3_train.py
+++ b/tests/st/test_multi_cards_cases/test_model/test_deepseek3/test_deepseek3_train.py
@@ -17,7 +17,6 @@ import os
 from multiprocessing.pool import Pool
 from pathlib import Path
 import random
-import pytest
 
 from mindformers.tools.logger import logger
 from tests.st.test_multi_cards_cases.utils import TaskType
@@ -52,7 +51,6 @@ class TestDeepseekV3:
         self.run_script_path = self.sh_path / "run_deepseek3.py"
         assert self.run_script_path.exists(), f"Run script not found: {self.run_script_path}"
 
-    @pytest.mark.level0
     def test_eight_card_configurations(self):
         """Test eight cards for DeepseekV3."""
         port_id = int(os.environ.get("ASCEND_PORT_ID", random.randint(50000, 65535)))
-- 
Gitee
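
A minimal, self-contained sketch of the arithmetic this patch changes, assuming the 8-card dp=2, tp=2, cp=2, ep=2 configuration suggested by the removed parallel_train_dp2_mp2_cp2_ep2 case. The helper names below are hypothetical and do not reproduce the real Layout or communication-group APIs; they only contrast how the MoE device matrix and the embedding's tensor-parallel rank are derived before and after the fix.

# Hypothetical sketch (not MindFormers code): contrasts the old and new
# derivations touched by this patch for an assumed dp=2, tp=2, cp=2, ep=2 setup.

def moe_dev_mat_old(dp, tp, cp, ep):
    """Old 'dp_cp' layout: cp kept as a separate trailing axis."""
    merged_dp = dp * tp                       # mirrors dp = dp * tp in the patch
    return (merged_dp // ep, ep, cp)          # ("dp_ex_ep", "ep", "cp")

def moe_dev_mat_new(dp, tp, cp, ep):
    """New 'dp_cp' layout: cp folded into the dp_ex_ep axis."""
    merged_dp = dp * tp
    return (merged_dp // ep * cp, ep)         # ("dp_ex_ep", "ep")

def tp_rank_old(rank_id, tp, cp):
    """Old VocabParallelEmbedding formula: rank_id // cp % tp."""
    return rank_id // cp % tp

def tp_rank_new(rank_id, tp):
    """New VocabParallelEmbedding formula: rank_id % tp."""
    return rank_id % tp

if __name__ == "__main__":
    dp, tp, cp, ep = 2, 2, 2, 2                          # assumed 8-card example
    print(moe_dev_mat_old(dp, tp, cp, ep))               # (2, 2, 2) -> 8 devices
    print(moe_dev_mat_new(dp, tp, cp, ep))               # (4, 2)    -> 8 devices
    for rank in range(dp * tp * cp):
        print(rank, tp_rank_old(rank, tp, cp), tp_rank_new(rank, tp))

Both device matrices still cover all eight devices; the difference is that the cp sharding now lives on the merged dp_ex_ep axis instead of a dedicated trailing axis, and the embedding's tensor-parallel rank no longer depends on cp, which is what the dropped // self.cp in layers.py reflects.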