diff --git a/mindformers/parallel_core/training_graph/transformer/moe/ffn.py b/mindformers/parallel_core/training_graph/transformer/moe/ffn.py
index e4a54710f55ac51eea06d666661624da7f8cf075..3559d65798a7f4a0f053dda74ac79ec5c392dbaa 100644
--- a/mindformers/parallel_core/training_graph/transformer/moe/ffn.py
+++ b/mindformers/parallel_core/training_graph/transformer/moe/ffn.py
@@ -67,7 +67,6 @@ class FFNGroupedGEMM(nn.Cell):
         self.compute_dtype = config.compute_dtype
         self.param_init_type = config.params_dtype
         self.moe_token_dispatcher_type = config.moe_token_dispatcher_type
-        self.init_method = config.init_method
         self.ep = config.expert_model_parallel_size
         self.dp = config.data_parallel_size * config.tensor_model_parallel_size
 
@@ -76,10 +75,10 @@ class FFNGroupedGEMM(nn.Cell):
         # The weight2's shape in Megatron-LM GroupedMLP is (hidden_size, num_local_experts * moe_ffn_hidden_size)
         # To load same weight in Megatron-LM, we need to reshape the weight1 and weight2 to its shape.
         self.weight1 = Parameter(
-            self.init_method([self.num_local_experts * self.hidden_size, self.moe_ffn_hidden_size * 2]),
+            config.init_method([self.num_local_experts * self.hidden_size, self.moe_ffn_hidden_size * 2]),
             name='w1')
         self.weight2 = Parameter(
-            self.init_method([self.num_local_experts * self.moe_ffn_hidden_size, self.hidden_size]),
+            config.output_layer_init_method([self.num_local_experts * self.moe_ffn_hidden_size, self.hidden_size]),
             name='w2')
 
         # init token dispatcher
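
Note for reviewers: with this change the class no longer caches `self.init_method`; both grouped weights read their initializers directly from `config` at construction time, and `weight2` switches from the shared `config.init_method` to `config.output_layer_init_method`. In Megatron-LM-style configs the output-layer initializer is typically the same normal initializer with its standard deviation scaled down by `1/sqrt(2 * num_layers)`. The snippet below is a minimal, hypothetical sketch of how such shape-taking initializer callables could be built and used with the same weight shapes as in the diff; `make_init_method`, the sigma values, and the sizes are assumptions for illustration, not MindFormers code.

```python
# Hypothetical sketch, not the MindFormers implementation: it assumes that
# config.init_method and config.output_layer_init_method are shape-taking
# callables, and that the output-layer initializer follows Megatron-LM's
# convention of scaling the standard deviation by 1/sqrt(2 * num_layers).
import math

from mindspore import Parameter
from mindspore.common import dtype as mstype
from mindspore.common.initializer import Normal, initializer


def make_init_method(sigma, param_dtype=mstype.float32):
    """Build a callable mapping a shape to a normally-initialized tensor."""
    def init(shape):
        return initializer(Normal(sigma=sigma), shape, param_dtype)
    return init


# Illustrative values only.
num_layers = 24
init_method = make_init_method(sigma=0.02)
output_layer_init_method = make_init_method(sigma=0.02 / math.sqrt(2.0 * num_layers))

# Shapes mirror the grouped weights in the diff (sizes are placeholders).
num_local_experts, hidden_size, moe_ffn_hidden_size = 4, 1024, 4096
w1 = Parameter(init_method([num_local_experts * hidden_size, moe_ffn_hidden_size * 2]), name='w1')
w2 = Parameter(output_layer_init_method([num_local_experts * moe_ffn_hidden_size, hidden_size]), name='w2')
```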