From cb490b7666765e29983685942855a98edacf5819 Mon Sep 17 00:00:00 2001
From: niujunhao
Date: Thu, 13 Nov 2025 21:03:20 +0800
Subject: [PATCH] fix expert init method.

---
 .../parallel_core/training_graph/transformer/moe/ffn.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mindformers/parallel_core/training_graph/transformer/moe/ffn.py b/mindformers/parallel_core/training_graph/transformer/moe/ffn.py
index e4a54710f..3559d6579 100644
--- a/mindformers/parallel_core/training_graph/transformer/moe/ffn.py
+++ b/mindformers/parallel_core/training_graph/transformer/moe/ffn.py
@@ -67,7 +67,6 @@ class FFNGroupedGEMM(nn.Cell):
         self.compute_dtype = config.compute_dtype
         self.param_init_type = config.params_dtype
         self.moe_token_dispatcher_type = config.moe_token_dispatcher_type
-        self.init_method = config.init_method
 
         self.ep = config.expert_model_parallel_size
         self.dp = config.data_parallel_size * config.tensor_model_parallel_size
@@ -76,10 +75,10 @@ class FFNGroupedGEMM(nn.Cell):
         # The weight2's shape in Megatron-LM GroupedMLP is (hidden_size, num_local_experts * moe_ffn_hidden_size)
         # To load same weight in Megatron-LM, we need to reshape the weight1 and weight2 to its shape.
         self.weight1 = Parameter(
-            self.init_method([self.num_local_experts * self.hidden_size, self.moe_ffn_hidden_size * 2]),
+            config.init_method([self.num_local_experts * self.hidden_size, self.moe_ffn_hidden_size * 2]),
             name='w1')
         self.weight2 = Parameter(
-            self.init_method([self.num_local_experts * self.moe_ffn_hidden_size, self.hidden_size]),
+            config.output_layer_init_method([self.num_local_experts * self.moe_ffn_hidden_size, self.hidden_size]),
             name='w2')
 
         # init token dispatcher
-- 
Gitee