From 5f8a346172debd3969da7dc23e9f0e38bd73099f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=A7=BB=E5=8A=A8=E4=B9=9D=E5=A4=A9-=E7=8E=8B=E7=A3=8A?=
 <1025472991@qq.com>
Date: Wed, 15 Oct 2025 17:46:55 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0MaxVioglobal=E6=8C=87?=
 =?UTF-8?q?=E6=A0=87=E6=89=93=E5=8D=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mindformers/core/callback/callback.py | 28 ++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/mindformers/core/callback/callback.py b/mindformers/core/callback/callback.py
index fb39707b2..ef39bf58c 100644
--- a/mindformers/core/callback/callback.py
+++ b/mindformers/core/callback/callback.py
@@ -2446,6 +2446,19 @@ class TopkBiasBalanceCallback(Callback):
         self.topk_bias_update_rate = topk_bias_update_rate
         self.zeros_tensor = ms.Tensor(np.zeros([expert_num]), ms.float32)
 
+    def _calculate_max_vio_global(self, expert_load_data):
+        """Calculate MaxVioglobal metric"""
+        expert_load_np = expert_load_data.asnumpy()
+        avg_load = self.acc_step_over_expert_num.asnumpy()[0]
+
+        if avg_load > 0:  # Avoid division by zero error
+            relative_deviations = np.abs(expert_load_np - avg_load) / avg_load
+            max_vio_global = np.max(relative_deviations)
+        else:
+            max_vio_global = 0.0
+
+        return max_vio_global
+
     def _update_topk_bias(self, network):
         """update topk bias tensor during training."""
         while hasattr(network, "network"):
@@ -2463,11 +2476,14 @@
             )
             return
         if self.update_topk_bias_flag:
-            for layer in network.model.layers:
+            for layer_idx, layer in enumerate(network.model.layers):
                 if hasattr(layer.feed_forward, "routed_experts"):
+                    # Store current layer's expert load data for MaxVioglobal calculation
+                    current_layer_expert_load = None
                     if hasattr(layer.feed_forward.routed_experts, "router"):
                         expert_load_data = \
                             layer.feed_forward.routed_experts.router.router.expert_load.value()
+                        current_layer_expert_load = expert_load_data
                         if expert_load_data.sum() > 0:
                             err = self.afb_sub(self.acc_step_over_expert_num, expert_load_data)
                             topk_bias_new = self.afb_add(
@@ -2480,6 +2496,7 @@
                                 self.zeros_tensor)
                     else:
                         expert_load_data = layer.feed_forward.routed_experts.expert_load.value()
+                        current_layer_expert_load = expert_load_data
                         if expert_load_data.sum() > 0:
                             err = self.afb_sub(self.acc_step_over_expert_num, expert_load_data)
                             topk_bias_new = self.afb_add(
@@ -2489,6 +2506,15 @@
                             self.assign(layer.feed_forward.routed_experts.topk_bias, topk_bias_new)
                             self.assign(layer.feed_forward.routed_experts.expert_load,
                                         self.zeros_tensor)
+                    if current_layer_expert_load is not None and current_layer_expert_load.sum() > 0:
+                        # Calculate MaxVioglobal metric
+                        max_vio_global = self._calculate_max_vio_global(current_layer_expert_load)
+
+                        # Print MaxVioglobal metric for current layer
+                        print(f"Layer {layer_idx} - MaxVioglobal: {max_vio_global:.6f}, "
+                              f"Expert Load: {current_layer_expert_load.asnumpy()}, "
+                              f"Avg Load: {self.acc_step_over_expert_num.asnumpy()[0]:.2f}")
+
     def on_train_step_end(self, run_context):
         """update expert bias at the end of step."""
         cb_params = run_context.original_args()
-- 
Gitee