From 5089af0312a3b7674ddaca0a22111337167de337 Mon Sep 17 00:00:00 2001
From: yiyison
Date: Sat, 25 Oct 2025 17:00:16 +0800
Subject: [PATCH] Update weight test cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../test_gpt_weight_consistency.py            | 161 +--
 .../st/test_ut/test_models/weight_struct.json | 946 +++++++++++++-----
 2 files changed, 774 insertions(+), 333 deletions(-)

diff --git a/tests/st/test_ut/test_models/test_gpt_weight_consistency.py b/tests/st/test_ut/test_models/test_gpt_weight_consistency.py
index 802b859304..12beb5cf7b 100644
--- a/tests/st/test_ut/test_models/test_gpt_weight_consistency.py
+++ b/tests/st/test_ut/test_models/test_gpt_weight_consistency.py
@@ -20,14 +20,29 @@
 from datetime import datetime
 from typing import Dict, List, Tuple
 import pytest
+from mindformers import build_context
 from mindformers.parallel_core.training_graph.base_models.gpt.gpt_model import GPTModel as GPTModelTrain
-from mindformers.parallel_core.training_graph.base_models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec as get_gpt_decoder_block_spec_train
 from mindformers.parallel_core.training_graph.base_models.gpt.gpt_layer_specs import get_gpt_mtp_block_spec
+from mindformers.parallel_core.training_graph.base_models.gpt.gpt_layer_specs import \
+    get_gpt_decoder_block_spec as get_gpt_decoder_block_spec_train
+from mindformers.parallel_core.inference.base_models.gpt.gpt_layer_specs import \
+    get_gpt_decoder_block_spec as get_gpt_decoder_block_spec_infer
 from mindformers.parallel_core.inference.base_models.gpt.gpt_model import GPTModel as GPTModelInfer
-from mindformers.parallel_core.inference.base_models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec as get_gpt_decoder_block_spec_infer
-from mindformers.parallel_core.process_group_config import ModelCommProcessGroups
+from mindformers.parallel_core.inference.quantization.golden_stick.a8w8 import A8W8LinearMethod
+from mindformers.parallel_core.inference.quantization.golden_stick.a8dynw4 import A8W4DynamicLinearMethod
+from mindformers.parallel_core.inference.quantization.golden_stick.a8dynw8 import A8W8DynamicLinearMethod
+from mindformers.parallel_core.inference.quantization.golden_stick.config import GoldenStickConfig
 from mindformers.parallel_core.transformer_config import MLATransformerConfig
-from mindformers import build_context
+from mindformers.parallel_core.process_group_config import ModelCommProcessGroups
+
+
+class DummyGoldenStickConfig(GoldenStickConfig):
+    def get_quant_method(self, layer, prefix):
+        if "experts" in prefix:
+            return A8W4DynamicLinearMethod(self)
+        if "self_attn" in prefix:
+            return A8W8LinearMethod(self)
+        return A8W8DynamicLinearMethod(self)
 
 
 class GPTModelWeightConsistencyTest:
@@ -38,7 +53,7 @@ class GPTModelWeightConsistencyTest:
         self.golden_file = self.test_dir / "weight_struct.json"
 
         # Test configuration
-        self.config = {
+        self.base_config = {
             "hidden_size": 128,
             "ffn_hidden_size": 256,
             "num_attention_heads": 4,
@@ -51,29 +66,68 @@ class GPTModelWeightConsistencyTest:
             "data_parallel_size": 1,
             "normalization": "RMSNorm",
             "hidden_act": "silu",
-            "position_embedding_type": "rope",
+            "position_embedding_type": "learned_absolute",
             "add_bias_linear": False,
             "gated_linear_unit": True,
             "mla_qkv_concat": False,
-            "use_fused_mla": False,
+            "use_fused_mla": True,
             "use_flash_attention": True,
             "use_legacy": False,
-            "num_moe_experts": 2,
+            "shared_expert_num": 2,
             "moe_grouped_gemm": True,
+            "moe_router_score_function": "sigmoid",
+            "moe_router_enable_expert_bias": True,
+            "use_shared_expert_gating": False,
             "mtp_num_layers": 1,
             "first_k_dense_replace": 1,
         }
-        build_context(self.config)
-        self.config.pop("use_legacy")
-        self.config.pop("local_rank")
-        self.config.pop("device_num")
-
-    def _build_model(self, model_type: str) -> object:
+        build_context(self.base_config)
+        self.base_config.pop("use_legacy")
+        self.base_config.pop("local_rank")
+        self.base_config.pop("device_num")
+
+        self.train_configurations = [
+            {
+                "multi_latent_attention": False,
+                "mla_qkv_concat": False,
+                "num_moe_experts": None
+            },
+            {
+                "multi_latent_attention": True,
+                "mla_qkv_concat": False,
+                "num_moe_experts": 2
+            },
+            {
+                "multi_latent_attention": True,
+                "mla_qkv_concat": True,
+                "num_moe_experts": None
+            },
+        ]
+
+        self.infer_configurations = [
+            {
+                "multi_latent_attention": False,
+                "num_moe_experts": None
+            },
+            {
+                "multi_latent_attention": True,
+                "num_moe_experts": 2
+            },
+        ]
+
+    def _build_model(self,
+                     model_type: str,
+                     multi_latent_attention: bool = False,
+                     mla_qkv_concat: bool = False,
+                     num_moe_experts: int = None) -> object:
         """Build real GPTModel."""
         # pylint: disable=unexpected-keyword-arg
-        config = MLATransformerConfig(**self.config)
+        config = MLATransformerConfig(**self.base_config)
+        config.multi_latent_attention = multi_latent_attention
+        config.mla_qkv_concat = mla_qkv_concat
+        config.num_moe_experts = num_moe_experts
+
         if model_type == "train":
-            # Build training version model
             transformer_layer_spec = get_gpt_decoder_block_spec_train(config)
             mtp_block_spec = get_gpt_mtp_block_spec(config, transformer_layer_spec)
             model = GPTModelTrain(
@@ -86,8 +140,13 @@ class GPTModelWeightConsistencyTest:
             )
         elif model_type == "infer":
             # Build inference version model
+            config.position_embedding_type = "rope"
             transformer_layer_spec = get_gpt_decoder_block_spec_infer(config, config.normalization)
             model_comm_pgs = ModelCommProcessGroups.get_default_model_comm_pgs()
+            quant_config = DummyGoldenStickConfig.from_config({})
+            quant_config.fa3_quant = True
+            quant_config.fa3_quant_layer = {0}
+            quant_config.full_config = {"group_size": 128}
             model = GPTModelInfer(
                 config=config,
                 transformer_layer_spec=transformer_layer_spec,
@@ -95,6 +154,7 @@ class GPTModelWeightConsistencyTest:
                 max_sequence_length=config.max_position_embeddings,
                 position_embedding_type=config.position_embedding_type,
                 model_comm_pgs=model_comm_pgs,
+                quant_config=quant_config
             )
         else:
             raise ValueError("model_type only support `train` or `infer`")
@@ -103,14 +163,20 @@ class GPTModelWeightConsistencyTest:
     def _extract_weight_structure(self, model: object) -> Dict[str, List[int]]:
         """Extract weight structure from model."""
         weights = {}
-
         if hasattr(model, 'parameters_and_names'):
             for name, param in model.parameters_and_names():
                 if param.data is not None:
                     weights[name] = list(param.data.shape)
-
         return weights
 
+    def _generate_golden_standard(self, model_type: str, golden_data, configurations):
+        for i, cf in enumerate(configurations):
+            model = self._build_model(model_type=model_type, **cf)
+            weights = self._extract_weight_structure(model)
+            golden_data[f"{model_type}_version"][f"configuration_{i}"] = {}
+            golden_data[f"{model_type}_version"][f"configuration_{i}"]["weights"] = weights
+            golden_data[f"{model_type}_version"][f"configuration_{i}"]["weight_count"] = len(weights)
+
     def generate_golden_standard(self) -> Dict:
         """Generate golden weight standard."""
         print("Generating golden weight standard...")
@@ -119,32 +185,30 @@ class GPTModelWeightConsistencyTest:
             "train_version": {},
             "infer_version": {},
             "metadata": {
-                "config": self.config,
                 "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
             }
         }

         # Training version
-        train_model = self._build_model("train")
-        train_weights = self._extract_weight_structure(train_model)
-
-        golden_data["train_version"]["weights"] = train_weights
-        golden_data["train_version"]["weight_count"] = len(train_weights)
+        self._generate_golden_standard("train", golden_data, self.train_configurations)

         # Inference version
-        infer_model = self._build_model("infer")
-        infer_weights = self._extract_weight_structure(infer_model)
-
-        golden_data["infer_version"]["weights"] = infer_weights
-        golden_data["infer_version"]["weight_count"] = len(infer_weights)
+        self._generate_golden_standard("infer", golden_data, self.infer_configurations)

         # Save to file
         with open(self.golden_file, 'w', encoding='utf-8') as f:
             json.dump(golden_data, f, indent=2)

-        print(f"Golden standard saved: {len(train_weights)} train weights, {len(infer_weights)} infer weights")
         return golden_data

+    def _test_weight_consistency(self, model_type: str, all_differences, golden_data, configurations):
+        for i, cf in enumerate(configurations):
+            model = self._build_model(model_type=model_type, **cf)
+            current_weights = self._extract_weight_structure(model)
+            golden_weights = golden_data[f"{model_type}_version"][f"configuration_{i}"]["weights"]
+            differences = self._compare_weights(current_weights, golden_weights, model_type)
+            all_differences.extend(differences)
+
     def test_weight_consistency(self) -> Tuple[bool, List[str]]:
         """Test weight consistency against golden standard."""
         if not self.golden_file.exists():
@@ -160,20 +224,10 @@ class GPTModelWeightConsistencyTest:
         all_differences = []

         # Test training version
-        train_model = self._build_model("train")
-        current_train_weights = self._extract_weight_structure(train_model)
-
-        golden_train_weights = golden_data["train_version"]["weights"]
-        differences = self._compare_weights(current_train_weights, golden_train_weights, "TRAIN")
-        all_differences.extend(differences)
+        self._test_weight_consistency("train", all_differences, golden_data, self.train_configurations)

         # Test inference version
-        infer_model = self._build_model("infer")
-        current_infer_weights = self._extract_weight_structure(infer_model)
-
-        golden_infer_weights = golden_data["infer_version"]["weights"]
-        differences = self._compare_weights(current_infer_weights, golden_infer_weights, "INFER")
-        all_differences.extend(differences)
+        self._test_weight_consistency("infer", all_differences, golden_data, self.infer_configurations)

         return all_differences

@@ -223,7 +277,7 @@ def golden_data(weight_tester):
 @pytest.mark.level0
 @pytest.mark.platform_arm_ascend910b_training
 @pytest.mark.env_onecard
-def test_golden_standard(weight_tester, golden_data):
+def test_golden_standard(weight_tester):
     """
     Feature: Test golden standard
     Description: Verify that the golden standard file exists and contains complete and valid structure
@@ -237,27 +291,6 @@ def test_golden_standard(weight_tester, golden_data):
     assert "infer_version" in data, "Golden standard should contain infer version"
     assert "metadata" in data, "Golden standard should contain metadata"

-    # Test training version weight structure
-    assert "train_version" in golden_data, "Golden data should contain train version"
-    train_data = golden_data["train_version"]
-    assert "weights" in train_data, "Train version should contain weights"
-    assert "weight_count" in train_data,
"Train version should have weight count" - weights = train_data["weights"] - assert weights, "Training version should have weights" - - # Test inference version weight structure. - assert "infer_version" in golden_data, "Golden data should contain infer version" - infer_data = golden_data["infer_version"] - assert "weights" in infer_data, "Infer version should contain weights" - assert "weight_count" in infer_data, "Infer version should have weight count" - weights = infer_data["weights"] - assert weights, "Inference version should have weights" - - # Test that metadata is complete - metadata = golden_data["metadata"] - assert "config" in metadata, "Metadata should contain config" - assert "generated_at" in metadata, "Metadata should contain generation timestamp" - @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training diff --git a/tests/st/test_ut/test_models/weight_struct.json b/tests/st/test_ut/test_models/weight_struct.json index 91e87c47b4..8f4b58f440 100644 --- a/tests/st/test_ut/test_models/weight_struct.json +++ b/tests/st/test_ut/test_models/weight_struct.json @@ -1,279 +1,687 @@ { "train_version": { - "weights": { - "embedding.word_embeddings.weight": [ - 1000, - 128 - ], - "embedding.embedding_dropout.seed": [], - "embedding.embedding_dropout.offset": [], - "decoder.layers.0.input_layernorm.weight": [ - 128 - ], - "decoder.layers.0.self_attention.linear_proj.weight": [ - 128, - 512 - ], - "decoder.layers.0.self_attention.linear_q_down_proj.weight": [ - 512, - 128 - ], - "decoder.layers.0.self_attention.linear_q_up_proj.weight": [ - 768, - 512 - ], - "decoder.layers.0.self_attention.linear_kv_down_proj.weight": [ - 576, - 128 - ], - "decoder.layers.0.self_attention.linear_kv_up_proj.weight": [ - 1024, - 512 - ], - "decoder.layers.0.pre_mlp_layernorm.weight": [ - 128 - ], - "decoder.layers.0.mlp.linear_fc1.weight": [ - 512, - 128 - ], - "decoder.layers.0.mlp.linear_fc2.weight": [ - 128, - 256 - ], - "decoder.layers.0.hidden_states_dropout.seed": [], - "decoder.layers.0.hidden_states_dropout.offset": [], - "decoder.layers.1.input_layernorm.weight": [ - 128 - ], - "decoder.layers.1.self_attention.linear_proj.weight": [ - 128, - 512 - ], - "decoder.layers.1.self_attention.linear_q_down_proj.weight": [ - 512, - 128 - ], - "decoder.layers.1.self_attention.linear_q_up_proj.weight": [ - 768, - 512 - ], - "decoder.layers.1.self_attention.linear_kv_down_proj.weight": [ - 576, - 128 - ], - "decoder.layers.1.self_attention.linear_kv_up_proj.weight": [ - 1024, - 512 - ], - "decoder.layers.1.pre_mlp_layernorm.weight": [ - 128 - ], - "decoder.layers.1.mlp.router.weight": [ - 2, - 128 - ], - "decoder.layers.1.mlp.router.fi_accu": [ - 2 - ], - "decoder.layers.1.mlp.experts.weight1": [ - 256, - 512 - ], - "decoder.layers.1.mlp.experts.weight2": [ - 512, - 128 - ], - "decoder.layers.1.hidden_states_dropout.seed": [], - "decoder.layers.1.hidden_states_dropout.offset": [], - "decoder.final_layernorm.weight": [ - 128 - ], - "mtp.layers.0.enorm.weight": [ - 128 - ], - "mtp.layers.0.hnorm.weight": [ - 128 - ], - "mtp.layers.0.eh_proj.weight": [ - 128, - 256 - ], - "mtp.layers.0.transformer_layer.input_layernorm.weight": [ - 128 - ], - "mtp.layers.0.transformer_layer.self_attention.linear_proj.weight": [ - 128, - 512 - ], - "mtp.layers.0.transformer_layer.self_attention.linear_q_down_proj.weight": [ - 512, - 128 - ], - "mtp.layers.0.transformer_layer.self_attention.linear_q_up_proj.weight": [ - 768, - 512 - ], - "mtp.layers.0.transformer_layer.self_attention.linear_kv_down_proj.weight": [ - 
576, - 128 - ], - "mtp.layers.0.transformer_layer.self_attention.linear_kv_up_proj.weight": [ - 1024, - 512 - ], - "mtp.layers.0.transformer_layer.pre_mlp_layernorm.weight": [ - 128 - ], - "mtp.layers.0.transformer_layer.mlp.router.weight": [ - 2, - 128 - ], - "mtp.layers.0.transformer_layer.mlp.router.fi_accu": [ - 2 - ], - "mtp.layers.0.transformer_layer.mlp.experts.weight1": [ - 256, - 512 - ], - "mtp.layers.0.transformer_layer.mlp.experts.weight2": [ - 512, - 128 - ], - "mtp.layers.0.transformer_layer.hidden_states_dropout.seed": [], - "mtp.layers.0.transformer_layer.hidden_states_dropout.offset": [], - "mtp.layers.0.final_layernorm.weight": [ - 128 - ], - "mtp.embedding.embedding_dropout.seed": [], - "mtp.embedding.embedding_dropout.offset": [], - "output_layer.weight": [ - 1000, - 128 - ] + "configuration_0": { + "weights": { + "embedding.word_embeddings.weight": [ + 1000, + 128 + ], + "embedding.position_embeddings.weight": [ + 128, + 128 + ], + "embedding.embedding_dropout.seed": [], + "embedding.embedding_dropout.offset": [], + "decoder.layers.0.input_layernorm.weight": [ + 128 + ], + "decoder.layers.0.self_attention.linear_proj.weight": [ + 128, + 128 + ], + "decoder.layers.0.self_attention.linear_qkv.weight": [ + 384, + 128 + ], + "decoder.layers.0.pre_mlp_layernorm.weight": [ + 128 + ], + "decoder.layers.0.mlp.linear_fc1.weight": [ + 512, + 128 + ], + "decoder.layers.0.mlp.linear_fc2.weight": [ + 128, + 256 + ], + "decoder.layers.0.hidden_states_dropout.seed": [], + "decoder.layers.0.hidden_states_dropout.offset": [], + "decoder.layers.1.input_layernorm.weight": [ + 128 + ], + "decoder.layers.1.self_attention.linear_proj.weight": [ + 128, + 128 + ], + "decoder.layers.1.self_attention.linear_qkv.weight": [ + 384, + 128 + ], + "decoder.layers.1.pre_mlp_layernorm.weight": [ + 128 + ], + "decoder.layers.1.mlp.linear_fc1.weight": [ + 512, + 128 + ], + "decoder.layers.1.mlp.linear_fc2.weight": [ + 128, + 256 + ], + "decoder.layers.1.hidden_states_dropout.seed": [], + "decoder.layers.1.hidden_states_dropout.offset": [], + "decoder.final_layernorm.weight": [ + 128 + ], + "mtp.layers.0.enorm.weight": [ + 128 + ], + "mtp.layers.0.hnorm.weight": [ + 128 + ], + "mtp.layers.0.eh_proj.weight": [ + 128, + 256 + ], + "mtp.layers.0.transformer_layer.input_layernorm.weight": [ + 128 + ], + "mtp.layers.0.transformer_layer.self_attention.linear_proj.weight": [ + 128, + 128 + ], + "mtp.layers.0.transformer_layer.self_attention.linear_qkv.weight": [ + 384, + 128 + ], + "mtp.layers.0.transformer_layer.pre_mlp_layernorm.weight": [ + 128 + ], + "mtp.layers.0.transformer_layer.mlp.linear_fc1.weight": [ + 512, + 128 + ], + "mtp.layers.0.transformer_layer.mlp.linear_fc2.weight": [ + 128, + 256 + ], + "mtp.layers.0.transformer_layer.hidden_states_dropout.seed": [], + "mtp.layers.0.transformer_layer.hidden_states_dropout.offset": [], + "mtp.layers.0.final_layernorm.weight": [ + 128 + ], + "mtp.embedding.embedding_dropout.seed": [], + "mtp.embedding.embedding_dropout.offset": [], + "output_layer.weight": [ + 1000, + 128 + ] + }, + "weight_count": 36 }, - "weight_count": 48 + "configuration_1": { + "weights": { + "embedding.word_embeddings.weight": [ + 1000, + 128 + ], + "embedding.position_embeddings.weight": [ + 128, + 128 + ], + "embedding.embedding_dropout.seed": [], + "embedding.embedding_dropout.offset": [], + "decoder.layers.0.input_layernorm.weight": [ + 128 + ], + "decoder.layers.0.self_attention.linear_proj.weight": [ + 128, + 512 + ], + "decoder.layers.0.self_attention.linear_q_down_proj.weight": [ 
+ 512, + 128 + ], + "decoder.layers.0.self_attention.linear_q_up_proj.weight": [ + 768, + 512 + ], + "decoder.layers.0.self_attention.linear_kv_down_proj.weight": [ + 576, + 128 + ], + "decoder.layers.0.self_attention.linear_kv_up_proj.weight": [ + 1024, + 512 + ], + "decoder.layers.0.pre_mlp_layernorm.weight": [ + 128 + ], + "decoder.layers.0.mlp.linear_fc1.weight": [ + 512, + 128 + ], + "decoder.layers.0.mlp.linear_fc2.weight": [ + 128, + 256 + ], + "decoder.layers.0.hidden_states_dropout.seed": [], + "decoder.layers.0.hidden_states_dropout.offset": [], + "decoder.layers.1.input_layernorm.weight": [ + 128 + ], + "decoder.layers.1.self_attention.linear_proj.weight": [ + 128, + 512 + ], + "decoder.layers.1.self_attention.linear_q_down_proj.weight": [ + 512, + 128 + ], + "decoder.layers.1.self_attention.linear_q_up_proj.weight": [ + 768, + 512 + ], + "decoder.layers.1.self_attention.linear_kv_down_proj.weight": [ + 576, + 128 + ], + "decoder.layers.1.self_attention.linear_kv_up_proj.weight": [ + 1024, + 512 + ], + "decoder.layers.1.pre_mlp_layernorm.weight": [ + 128 + ], + "decoder.layers.1.mlp.router.weight": [ + 2, + 128 + ], + "decoder.layers.1.mlp.router.expert_bias": [ + 2 + ], + "decoder.layers.1.mlp.router.expert_load": [ + 2 + ], + "decoder.layers.1.mlp.router.fi_accu": [ + 2 + ], + "decoder.layers.1.mlp.experts.weight1": [ + 256, + 512 + ], + "decoder.layers.1.mlp.experts.weight2": [ + 512, + 128 + ], + "decoder.layers.1.mlp.shared_experts.linear_fc1.weight": [ + 1024, + 128 + ], + "decoder.layers.1.mlp.shared_experts.linear_fc2.weight": [ + 128, + 512 + ], + "decoder.layers.1.hidden_states_dropout.seed": [], + "decoder.layers.1.hidden_states_dropout.offset": [], + "decoder.final_layernorm.weight": [ + 128 + ], + "mtp.layers.0.enorm.weight": [ + 128 + ], + "mtp.layers.0.hnorm.weight": [ + 128 + ], + "mtp.layers.0.eh_proj.weight": [ + 128, + 256 + ], + "mtp.layers.0.transformer_layer.input_layernorm.weight": [ + 128 + ], + "mtp.layers.0.transformer_layer.self_attention.linear_proj.weight": [ + 128, + 512 + ], + "mtp.layers.0.transformer_layer.self_attention.linear_q_down_proj.weight": [ + 512, + 128 + ], + "mtp.layers.0.transformer_layer.self_attention.linear_q_up_proj.weight": [ + 768, + 512 + ], + "mtp.layers.0.transformer_layer.self_attention.linear_kv_down_proj.weight": [ + 576, + 128 + ], + "mtp.layers.0.transformer_layer.self_attention.linear_kv_up_proj.weight": [ + 1024, + 512 + ], + "mtp.layers.0.transformer_layer.pre_mlp_layernorm.weight": [ + 128 + ], + "mtp.layers.0.transformer_layer.mlp.router.weight": [ + 2, + 128 + ], + "mtp.layers.0.transformer_layer.mlp.router.expert_bias": [ + 2 + ], + "mtp.layers.0.transformer_layer.mlp.router.expert_load": [ + 2 + ], + "mtp.layers.0.transformer_layer.mlp.router.fi_accu": [ + 2 + ], + "mtp.layers.0.transformer_layer.mlp.experts.weight1": [ + 256, + 512 + ], + "mtp.layers.0.transformer_layer.mlp.experts.weight2": [ + 512, + 128 + ], + "mtp.layers.0.transformer_layer.mlp.shared_experts.linear_fc1.weight": [ + 1024, + 128 + ], + "mtp.layers.0.transformer_layer.mlp.shared_experts.linear_fc2.weight": [ + 128, + 512 + ], + "mtp.layers.0.transformer_layer.hidden_states_dropout.seed": [], + "mtp.layers.0.transformer_layer.hidden_states_dropout.offset": [], + "mtp.layers.0.final_layernorm.weight": [ + 128 + ], + "mtp.embedding.embedding_dropout.seed": [], + "mtp.embedding.embedding_dropout.offset": [], + "output_layer.weight": [ + 1000, + 128 + ] + }, + "weight_count": 57 + }, + "configuration_2": { + "weights": { + 
"embedding.word_embeddings.weight": [ + 1000, + 128 + ], + "embedding.position_embeddings.weight": [ + 128, + 128 + ], + "embedding.embedding_dropout.seed": [], + "embedding.embedding_dropout.offset": [], + "decoder.layers.0.input_layernorm.weight": [ + 128 + ], + "decoder.layers.0.self_attention.linear_proj.weight": [ + 128, + 512 + ], + "decoder.layers.0.self_attention.linear_qb.weight": [ + 768, + 512 + ], + "decoder.layers.0.self_attention.linear_qkv.weight": [ + 1088, + 128 + ], + "decoder.layers.0.self_attention.linear_kvb.weight": [ + 1024, + 512 + ], + "decoder.layers.0.pre_mlp_layernorm.weight": [ + 128 + ], + "decoder.layers.0.mlp.linear_fc1.weight": [ + 512, + 128 + ], + "decoder.layers.0.mlp.linear_fc2.weight": [ + 128, + 256 + ], + "decoder.layers.0.hidden_states_dropout.seed": [], + "decoder.layers.0.hidden_states_dropout.offset": [], + "decoder.layers.1.input_layernorm.weight": [ + 128 + ], + "decoder.layers.1.self_attention.linear_proj.weight": [ + 128, + 512 + ], + "decoder.layers.1.self_attention.linear_qb.weight": [ + 768, + 512 + ], + "decoder.layers.1.self_attention.linear_qkv.weight": [ + 1088, + 128 + ], + "decoder.layers.1.self_attention.linear_kvb.weight": [ + 1024, + 512 + ], + "decoder.layers.1.pre_mlp_layernorm.weight": [ + 128 + ], + "decoder.layers.1.mlp.linear_fc1.weight": [ + 512, + 128 + ], + "decoder.layers.1.mlp.linear_fc2.weight": [ + 128, + 256 + ], + "decoder.layers.1.hidden_states_dropout.seed": [], + "decoder.layers.1.hidden_states_dropout.offset": [], + "decoder.final_layernorm.weight": [ + 128 + ], + "mtp.layers.0.enorm.weight": [ + 128 + ], + "mtp.layers.0.hnorm.weight": [ + 128 + ], + "mtp.layers.0.eh_proj.weight": [ + 128, + 256 + ], + "mtp.layers.0.transformer_layer.input_layernorm.weight": [ + 128 + ], + "mtp.layers.0.transformer_layer.self_attention.linear_proj.weight": [ + 128, + 512 + ], + "mtp.layers.0.transformer_layer.self_attention.linear_qb.weight": [ + 768, + 512 + ], + "mtp.layers.0.transformer_layer.self_attention.linear_qkv.weight": [ + 1088, + 128 + ], + "mtp.layers.0.transformer_layer.self_attention.linear_kvb.weight": [ + 1024, + 512 + ], + "mtp.layers.0.transformer_layer.pre_mlp_layernorm.weight": [ + 128 + ], + "mtp.layers.0.transformer_layer.mlp.linear_fc1.weight": [ + 512, + 128 + ], + "mtp.layers.0.transformer_layer.mlp.linear_fc2.weight": [ + 128, + 256 + ], + "mtp.layers.0.transformer_layer.hidden_states_dropout.seed": [], + "mtp.layers.0.transformer_layer.hidden_states_dropout.offset": [], + "mtp.layers.0.final_layernorm.weight": [ + 128 + ], + "mtp.embedding.embedding_dropout.seed": [], + "mtp.embedding.embedding_dropout.offset": [], + "output_layer.weight": [ + 1000, + 128 + ] + }, + "weight_count": 42 + } }, "infer_version": { - "weights": { - "embedding.word_embeddings.weight": [ - 1000, - 128 - ], - "decoder.layers.0.input_layernorm.weight": [ - 128 - ], - "decoder.layers.0.self_attention.linear_proj.weight": [ - 128, - 512 - ], - "decoder.layers.0.self_attention.linear_qkv_down_proj.weight": [ - 1088, - 128 - ], - "decoder.layers.0.self_attention.linear_q_up_proj.weight": [ - 768, - 512 - ], - "decoder.layers.0.self_attention.linear_kv_up_proj.weight": [ - 1024, - 512 - ], - "decoder.layers.0.self_attention.q_layernorm.weight": [ - 512 - ], - "decoder.layers.0.self_attention.kv_layernorm.weight": [ - 512 - ], - "decoder.layers.0.pre_mlp_layernorm.weight": [ - 128 - ], - "decoder.layers.0.mlp.linear_fc1.weight": [ - 512, - 128 - ], - "decoder.layers.0.mlp.linear_fc2.weight": [ - 128, - 256 - ], - 
"decoder.layers.1.input_layernorm.weight": [ - 128 - ], - "decoder.layers.1.self_attention.linear_proj.weight": [ - 128, - 512 - ], - "decoder.layers.1.self_attention.linear_qkv_down_proj.weight": [ - 1088, - 128 - ], - "decoder.layers.1.self_attention.linear_q_up_proj.weight": [ - 768, - 512 - ], - "decoder.layers.1.self_attention.linear_kv_up_proj.weight": [ - 1024, - 512 - ], - "decoder.layers.1.self_attention.q_layernorm.weight": [ - 512 - ], - "decoder.layers.1.self_attention.kv_layernorm.weight": [ - 512 - ], - "decoder.layers.1.pre_mlp_layernorm.weight": [ - 128 - ], - "decoder.layers.1.mlp.router.weight": [ - 2, - 128 - ], - "decoder.layers.1.mlp.experts.weight1": [ - 2, - 128, - 512 - ], - "decoder.layers.1.mlp.experts.weight2": [ - 2, - 256, - 128 - ], - "decoder.final_layernorm.weight": [ - 128 - ], - "output_layer.weight": [ - 1000, - 128 - ] + "configuration_0": { + "weights": { + "embedding.word_embeddings.weight": [ + 1000, + 128 + ], + "decoder.layers.0.input_layernorm.weight": [ + 128 + ], + "decoder.layers.0.self_attention.linear_proj.weight": [ + 128, + 128 + ], + "decoder.layers.0.self_attention.linear_proj.w_scale": [ + 128 + ], + "decoder.layers.0.self_attention.linear_qkv.weight": [ + 384, + 128 + ], + "decoder.layers.0.self_attention.linear_qkv.w_scale": [ + 384 + ], + "decoder.layers.0.pre_mlp_layernorm.weight": [ + 128 + ], + "decoder.layers.0.mlp.linear_fc1.weight": [ + 512, + 128 + ], + "decoder.layers.0.mlp.linear_fc1.w_scale": [ + 512 + ], + "decoder.layers.0.mlp.linear_fc2.weight": [ + 128, + 256 + ], + "decoder.layers.0.mlp.linear_fc2.w_scale": [ + 128 + ], + "decoder.layers.1.input_layernorm.weight": [ + 128 + ], + "decoder.layers.1.self_attention.linear_proj.weight": [ + 128, + 128 + ], + "decoder.layers.1.self_attention.linear_proj.w_scale": [ + 128 + ], + "decoder.layers.1.self_attention.linear_qkv.weight": [ + 384, + 128 + ], + "decoder.layers.1.self_attention.linear_qkv.w_scale": [ + 384 + ], + "decoder.layers.1.pre_mlp_layernorm.weight": [ + 128 + ], + "decoder.layers.1.mlp.linear_fc1.weight": [ + 512, + 128 + ], + "decoder.layers.1.mlp.linear_fc1.w_scale": [ + 512 + ], + "decoder.layers.1.mlp.linear_fc2.weight": [ + 128, + 256 + ], + "decoder.layers.1.mlp.linear_fc2.w_scale": [ + 128 + ], + "decoder.final_layernorm.weight": [ + 128 + ], + "output_layer.weight": [ + 1000, + 128 + ] + }, + "weight_count": 23 }, - "weight_count": 24 + "configuration_1": { + "weights": { + "embedding.word_embeddings.weight": [ + 1000, + 128 + ], + "decoder.layers.0.self_attention.qnope_scale": [ + 4 + ], + "decoder.layers.0.self_attention.ctkv_scale": [ + 1 + ], + "decoder.layers.0.self_attention.linear_proj.weight": [ + 128, + 512 + ], + "decoder.layers.0.self_attention.linear_proj.w_scale": [ + 128 + ], + "decoder.layers.0.self_attention.input_layernorm.weight": [ + 128 + ], + "decoder.layers.0.self_attention.linear_qkv_down_proj.weight": [ + 1088, + 128 + ], + "decoder.layers.0.self_attention.linear_qkv_down_proj.w_scale": [ + 1088 + ], + "decoder.layers.0.self_attention.linear_q_up_proj.weight": [ + 768, + 512 + ], + "decoder.layers.0.self_attention.linear_q_up_proj.w_scale": [ + 768 + ], + "decoder.layers.0.self_attention.linear_kv_up_proj.weight": [ + 1024, + 512 + ], + "decoder.layers.0.self_attention.linear_kv_up_proj.w_scale": [ + 1024 + ], + "decoder.layers.0.self_attention.q_layernorm.weight": [ + 512 + ], + "decoder.layers.0.self_attention.kv_layernorm.weight": [ + 512 + ], + "decoder.layers.0.pre_mlp_layernorm.weight": [ + 128 + ], + 
"decoder.layers.0.mlp.linear_fc1.weight": [ + 512, + 128 + ], + "decoder.layers.0.mlp.linear_fc1.w_scale": [ + 512 + ], + "decoder.layers.0.mlp.linear_fc2.weight": [ + 128, + 256 + ], + "decoder.layers.0.mlp.linear_fc2.w_scale": [ + 128 + ], + "decoder.layers.1.self_attention.linear_proj.weight": [ + 128, + 512 + ], + "decoder.layers.1.self_attention.linear_proj.w_scale": [ + 128 + ], + "decoder.layers.1.self_attention.input_layernorm.weight": [ + 128 + ], + "decoder.layers.1.self_attention.linear_qkv_down_proj.weight": [ + 1088, + 128 + ], + "decoder.layers.1.self_attention.linear_qkv_down_proj.w_scale": [ + 1088 + ], + "decoder.layers.1.self_attention.linear_q_up_proj.weight": [ + 768, + 512 + ], + "decoder.layers.1.self_attention.linear_q_up_proj.w_scale": [ + 768 + ], + "decoder.layers.1.self_attention.linear_kv_up_proj.weight": [ + 1024, + 512 + ], + "decoder.layers.1.self_attention.linear_kv_up_proj.w_scale": [ + 1024 + ], + "decoder.layers.1.self_attention.q_layernorm.weight": [ + 512 + ], + "decoder.layers.1.self_attention.kv_layernorm.weight": [ + 512 + ], + "decoder.layers.1.pre_mlp_layernorm.weight": [ + 128 + ], + "decoder.layers.1.mlp.router.weight": [ + 2, + 128 + ], + "decoder.layers.1.mlp.router.expert_bias": [ + 2 + ], + "decoder.layers.1.mlp.experts.weight1": [ + 2, + 128, + 256 + ], + "decoder.layers.1.mlp.experts.weight2": [ + 2, + 256, + 64 + ], + "decoder.layers.1.mlp.experts.linear_fc1.gmm_bias": [ + 2, + 512 + ], + "decoder.layers.1.mlp.experts.linear_fc1.w_scale": [ + 2, + 1, + 512 + ], + "decoder.layers.1.mlp.experts.linear_fc2.gmm_bias": [ + 2, + 128 + ], + "decoder.layers.1.mlp.experts.linear_fc2.w_scale": [ + 2, + 2, + 128 + ], + "decoder.layers.1.mlp.shared_experts.linear_fc1.weight": [ + 512, + 128 + ], + "decoder.layers.1.mlp.shared_experts.linear_fc1.w_scale": [ + 512 + ], + "decoder.layers.1.mlp.shared_experts.linear_fc2.weight": [ + 128, + 512 + ], + "decoder.layers.1.mlp.shared_experts.linear_fc2.w_scale": [ + 128 + ], + "decoder.final_layernorm.weight": [ + 128 + ], + "output_layer.weight": [ + 1000, + 128 + ] + }, + "weight_count": 45 + } }, "metadata": { - "config": { - "hidden_size": 128, - "ffn_hidden_size": 256, - "num_attention_heads": 4, - "num_layers": 2, - "seq_length": 32, - "vocab_size": 1000, - "max_position_embeddings": 128, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "data_parallel_size": 1, - "normalization": "RMSNorm", - "hidden_act": "silu", - "position_embedding_type": "rope", - "add_bias_linear": false, - "gated_linear_unit": true, - "mla_qkv_concat": false, - "use_fused_mla": false, - "use_flash_attention": true, - "num_moe_experts": 2, - "moe_grouped_gemm": true, - "mtp_num_layers": 1, - "first_k_dense_replace": 1, - "parallel_config": {} - }, - "generated_at": "2025-10-17 22:34:54" + "generated_at": "2025-10-25 16:28:42" } } \ No newline at end of file -- Gitee