MindSpore / mindformers (https://gitee.com/mindspore/mindformers)
HF config migration: copy the HF model-config .py file, use a decorator mechanism to dynamically register MF-specific parameters and intercept HF parameters that MF does not use
#ICF30C · Question · Status: DONE
Lin (Member) · Created 2025-06-13 17:52
Design approach:
1. Copy the HF ModelConfig source, update the PretrainedConfig and logger imports, and delete unnecessary code logic; all parameters and their definitions stay unchanged.
2. Explicitly define a decorator that dynamically registers the custom parameters supported by MLATransformerConfig in MF.
3. Explicitly define a decorator that uniformly intercepts and prints the parameters MF does not use. The set of intercepted parameters is configurable: default interception lists can be preset per model, and custom interception parameters can be added.

Requirements:
1. Explicitly print the unsupported HF Config parameters. A preset model filter type, custom filter parameters, and the corresponding filter descriptions can be passed in; when nothing is passed, the HF parameters already known to be unsupported are filtered by default.
2. Dynamically register MFModelConfig, explicitly spelling out the additional MF parameters that are supported.
3. Both decorators are applied on `__init__`. Note that even when no arguments are passed, the parentheses are required, e.g. `@register_mf_parameter()` or `@IgnoreAndDeleteParameterDecorator()`.

Decorator implementation demo, named model_config_demo.py. Code sample:

```
import inspect
from dataclasses import dataclass, asdict
from typing import List, Tuple
from functools import wraps

# The warning table below is printed through the MindFormers logger.
from mindformers.tools.logger import logger


def scatter_multi_mapping_keys_to_mapping(mapping):
    """
    Expand multiple mapping relationships contained in `convert_map`.

    Args:
        mapping (dict): A dict containing the mapping used to convert the keys of `model_config`.

    Returns:
        A dict with all mappings expanded.
    """
    new_mapping = {}
    for k, v in mapping.items():
        if not isinstance(k, tuple):
            new_mapping[k] = v
            continue
        for multi_key in k:
            new_mapping[multi_key] = v
    return new_mapping


def validate_ignore_parameter_format(ignore_list: List[Tuple[str, str]] = None):
    """
    Validate that the ignore-parameter list matches the expected format.

    Args:
        ignore_list: The ignore-parameter list to validate.

    Raises:
        TypeError: If an element has the wrong data type.
        ValueError: If the data structure is malformed.
    """
    if ignore_list is None:
        return []
    # 1. Must be a list.
    if not isinstance(ignore_list, list):
        raise TypeError(f"IGNORE_COMMON_HF_PARAMETER must be a list, got {type(ignore_list).__name__}")
    # 2. Every element must be a tuple.
    for i, item in enumerate(ignore_list):
        if not isinstance(item, tuple):
            raise TypeError(
                f"Item at index {i} must be a tuple, got {type(item).__name__}. "
                f"Expected format: [('param_name', 'reason'), ...]"
            )
    # 3. Every tuple must have exactly two elements.
    for i, item in enumerate(ignore_list):
        if len(item) != 2:
            raise ValueError(
                f"Tuple at index {i} must have exactly 2 elements, got {len(item)}. "
                f"Expected format: ('param_name', 'reason')"
            )
    # 4. The first element must be a string (the parameter name).
    for i, (param_name, _) in enumerate(ignore_list):
        if not isinstance(param_name, str):
            raise TypeError(
                f"First element in tuple at index {i} must be a string (parameter name), "
                f"got {type(param_name).__name__}"
            )
    # 5. The second element must be a string (the reason).
    for i, (_, reason) in enumerate(ignore_list):
        if not isinstance(reason, str):
            raise TypeError(
                f"Second element in tuple at index {i} must be a string or have a string representation, "
                f"got {type(reason).__name__}"
            )
    return ignore_list


@dataclass
class NOT_SUPPORTED_INFO:
    useless = "Useless"                 # useless for MF
    not_implemented = "NotImplemented"  # not supported for now


# Filter parameters written ad hoc; the final lists still need to be sorted out.
IGNORE_COMMON_HF_PARAMETER = [
    ('torch_dtype', 'Useless, replace by compute_dtype'),
    ('use_cache', 'Useless, enable kv_cache by default'),
    ('transformers_version', NOT_SUPPORTED_INFO.useless),
    ('use_sliding_window', NOT_SUPPORTED_INFO.not_implemented),
]

# Filter parameters written ad hoc; the final lists still need to be sorted out.
IGNORE_HF_MODEL_CONFIG_MAPPING = scatter_multi_mapping_keys_to_mapping({
    ("qwen2", "qwen2.5", "qwen3"): [
        ('decoder_sparse_step', NOT_SUPPORTED_INFO.not_implemented),
        ('mlp_only_layers', NOT_SUPPORTED_INFO.not_implemented),
        ('norm_topk_prob', NOT_SUPPORTED_INFO.not_implemented),
        ('output_router_logits', NOT_SUPPORTED_INFO.not_implemented),
        ('router_aux_loss_coef', NOT_SUPPORTED_INFO.not_implemented),
        ("max_window_layers", NOT_SUPPORTED_INFO.not_implemented),
    ],
    "deepseek-v3": [],
})


@dataclass
class MFModelConfig:
    """Parameters used for MF training and inference, as distinct from the HuggingFace parameter set."""
    seq_length: int = None
    """Model Seq Length"""
    is_dynamic: bool = False
    """Whether model is dynamic shape."""
    pad_token_id: int = 0
    """Model pad token id."""
    ignore_token_id: int = -100
    """Model ignore token id when training."""
    compute_dtype: str = "bfloat16"
    """Linear layer compute dtype."""
    layernorm_compute_dtype: str = "float32"
    """LayerNorm compute dtype."""
    rotary_dtype: str = "float32"
    """Custom rotary position embedding compute dtype."""
    use_eod_reset: bool = False
    """Whether to use eod reset."""
    use_flash_attention: bool = True
    """If true, use flash attention for the attention layer."""


class IgnoreAndDeleteParameterDecorator:
    """Decorator class that intercepts and prints the unsupported parameters passed to __init__."""

    def __init__(self, ignore_type: str = None, extra_ignore_param: List[Tuple[str, str]] = None):
        self.ignore_type = ignore_type
        self.extra_ignore_param = validate_ignore_parameter_format(extra_ignore_param)
        # Class-level marker so the warning table is printed only once per class.
        self.printed_classes = set()

    def __call__(self, init_func):
        # Common parameters ignored by default.
        ignore_info = IGNORE_COMMON_HF_PARAMETER.copy()
        if self.ignore_type:
            model_ignore = IGNORE_HF_MODEL_CONFIG_MAPPING.get(self.ignore_type, [])
            ignore_info.extend(model_ignore)
        ignore_info.extend(self.extra_ignore_param)
        # Names of all parameters to ignore.
        ignore_param_names = [item[0] for item in ignore_info]

        # Collect the named parameters (and their defaults) from the original signature.
        sig = inspect.signature(init_func)
        all_parameters_kwargs = dict()
        for name, param in sig.parameters.items():
            if param.kind in (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD):
                continue
            if name in ('self', 'kwargs'):
                continue
            all_parameters_kwargs.setdefault(name, param.default)

        @wraps(init_func)
        def wrapper(self_instance, *args, **kwargs):
            # Merge the __init__ signature defaults with the caller's kwargs; the caller's
            # values are spread last so explicitly passed arguments are not reset to defaults.
            merge_kwargs = {**all_parameters_kwargs, **kwargs}
            # Remove any ignored parameters that were passed in.
            for param_name in ignore_param_names:
                if param_name in merge_kwargs:
                    merge_kwargs.pop(param_name)
            class_name = self_instance.__class__.__name__
            # If the table was already printed for this class, run __init__ directly.
            if class_name in self.printed_classes:
                result = init_func(self_instance, *args, **merge_kwargs)
                # Delete the attributes of ignored parameters.
                for param_name in ignore_param_names:
                    if hasattr(self_instance, param_name):
                        delattr(self_instance, param_name)
                return result
            # Mark this class as printed.
            self.printed_classes.add(class_name)
            # Print the table of unsupported parameters.
            logger.warning(f"Found unsupported huggingface arguments in {self_instance.__class__.__name__}:")
            # Column widths.
            max_key_len = max(len(str(item[0])) for item in ignore_info) + 2
            max_val_len = max(len(str(item[1])) for item in ignore_info) + 2
            # Table border.
            border = f"+{'-' * (max_key_len + 2)}+{'-' * (max_val_len + 2)}+"
            # Header.
            logger.warning(border)
            logger.warning(f"| {'Argument'.ljust(max_key_len)} | {'Status-Info'.ljust(max_val_len)} |")
            logger.warning(f"|:{'-' * (max_key_len + 1)}|:{'-' * (max_val_len + 1)}|")
            # Parameter rows.
            for arg, value in ignore_info:
                arg_str = str(arg).ljust(max_key_len)
                val_str = str(value).ljust(max_val_len)
                logger.warning(f"| {arg_str} | {val_str} |")
            logger.warning(border)
            # Call the original __init__.
            result = init_func(self_instance, *args, **merge_kwargs)
            # Delete the attributes of ignored parameters.
            for param_name in ignore_param_names:
                if hasattr(self_instance, param_name):
                    delattr(self_instance, param_name)
            return result

        return wrapper


def register_mf_parameter(mf_model_kwargs=None):
    """
    Decorator factory used to customize the kwargs of an __init__ method.

    Args:
        mf_model_kwargs: Default keyword arguments merged into the original kwargs
            (caller-supplied values take precedence).

    Returns:
        The decorator function.
    """
    mf_model_kwargs = asdict(mf_model_kwargs) \
        if mf_model_kwargs is not None else asdict(MFModelConfig())

    def decorator(init_func):
        @wraps(init_func)
        def wrapper(self, *args, **kwargs):
            # Merge the defaults with the passed-in kwargs; passed-in values win.
            merged_kwargs = {**mf_model_kwargs, **kwargs}
            return init_func(self, *args, **merged_kwargs)
        return wrapper
    return decorator
```
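To make the intended mechanics concrete, here is a minimal self-contained sketch of the same two-decorator pattern on a toy class. This is an illustration only, not part of the proposal: `ToyConfig`, `register_defaults`, and `drop_ignored` are hypothetical names, and the standard `logging` module stands in for the MindFormers logger.

```
# Toy sketch of the register + intercept decorator pattern described above.
import logging
from functools import wraps

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

IGNORED = {"use_cache": "Useless, enable kv_cache by default"}
MF_DEFAULTS = {"compute_dtype": "bfloat16"}


def register_defaults(defaults):
    """Inject default kwargs; caller-supplied values take precedence."""
    def decorator(init_func):
        @wraps(init_func)
        def wrapper(self, *args, **kwargs):
            return init_func(self, *args, **{**defaults, **kwargs})
        return wrapper
    return decorator


def drop_ignored(init_func):
    """Strip ignored parameters from kwargs and log a warning."""
    @wraps(init_func)
    def wrapper(self, *args, **kwargs):
        for name, reason in IGNORED.items():
            if name in kwargs:
                kwargs.pop(name)
                logger.warning("ignoring %s: %s", name, reason)
        return init_func(self, *args, **kwargs)
    return wrapper


class ToyConfig:
    @register_defaults(MF_DEFAULTS)
    @drop_ignored
    def __init__(self, vocab_size=100, **kwargs):
        self.vocab_size = vocab_size
        for key, value in kwargs.items():  # unknown kwargs become attributes
            setattr(self, key, value)


cfg = ToyConfig(use_cache=True, use_flash_attention=True)
print(cfg.compute_dtype)          # bfloat16 (injected default)
print(cfg.use_flash_attention)    # True (registered/passed through)
print(hasattr(cfg, "use_cache"))  # False (intercepted and dropped)
```

The decorator order mirrors the proposal: the registration decorator sits outermost so MF defaults are merged first, then the interception decorator strips unsupported names before `__init__` runs.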
The model configuration code copied verbatim from HuggingFace, configuration_deepseek.py, with the following incremental standard modifications:

```
# from transformers.configuration_utils import PretrainedConfig
# Modification 1: replace with the PretrainedConfig defined by the MindFormers framework.
from mindformers.models.configuration_utils import PretrainedConfig

# from transformers.utils import logging
# Modification 2: replace with the MindFormers logger.
from mindformers.tools.logger import logger

# logger = logging.get_logger(__name__)
# Modification 3: delete this logger definition; it is not needed in MF.

DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class DeepseekV3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to
    instantiate a DeepSeek model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of DeepSeek-V3.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
    Read the documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 129280):
            Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be
            represented by the `inputs_ids` passed when calling [`DeepseekV3Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1407):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Number of nextn predict layers in the DeepSeekV3 Model.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        n_shared_experts (`int`, *optional*, defaults to None):
            Number of shared experts, None means dense model.
        n_routed_experts (`int`, *optional*, defaults to None):
            Number of routed experts, None means dense model.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for routed experts.
        topk_method (`str`, *optional*, defaults to `greedy`):
            Topk method used in routed gate.
        n_group (`int`, *optional*, defaults to None):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to None):
            Number of selected groups for each token (for each token, ensuring the selected experts are
            only within `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to None):
            Number of selected experts, None means dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 0):
            Number of dense layers in shallow layers (embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                             \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to False):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to 'softmax'):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux (`bool`, *optional*, defaults to True):
            Whether to compute the auxiliary loss for each individual sample.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention.
            If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used.
            When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should
            be constructed by meanpooling all the original heads within that group. For more details check out
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
            Only relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to
            [this document](https://huggingface.co/docs/transformers/parallelism) to understand more about it.
            This value is necessary to ensure exact reproducibility of the pretraining results. Please refer
            to [this issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two
            scaling strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The
            expected format is `{"type": strategy name, "factor": scaling factor}`. When using this flag,
            don't update `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    ```python
    >>> from transformers import DeepseekV3Model, DeepseekV3Config
    >>> # Initializing a Deepseek-V3 style configuration
    >>> configuration = DeepseekV3Config()
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Modification 5: use the decorators provided by MF to intercept unsupported
    # parameters and register the MF custom parameters. MFModelConfig and
    # NOT_SUPPORTED_INFO are imported as well, since the decorator arguments below
    # reference them.
    from .model_config_demo import (IgnoreAndDeleteParameterDecorator, MFModelConfig,
                                    NOT_SUPPORTED_INFO, register_mf_parameter)

    @register_mf_parameter(mf_model_kwargs=MFModelConfig(compute_dtype='bf16',
                                                         layernorm_compute_dtype="fp32"))
    @IgnoreAndDeleteParameterDecorator(ignore_type='qwen2',
                                       extra_ignore_param=[('n_shared_experts',
                                                            NOT_SUPPORTED_INFO.useless)])
    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method='noaux_tc',
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func='sigmoid',
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
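        # Note: the intercepted parameters (e.g. use_cache) were already stripped from
        # the merged kwargs before __init__ ran, so the assignments above saw their
        # signature defaults; IgnoreAndDeleteParameterDecorator additionally deletes
        # the corresponding attributes once __init__ returns.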
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
```

Execution effect:

```
config = DeepseekV3Config(use_flash_attention=True)
print(config.to_dict())
config = DeepseekV3Config(use_cache=True)
print(config.use_cache)
```
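For reference, a rough sketch of what the snippet above is expected to show, assuming MF's `PretrainedConfig` behaves like the HuggingFace one and stores unrecognized kwargs as instance attributes (an assumption on my part, not something the proposal states):

```
# Assumes PretrainedConfig keeps unknown kwargs as attributes (HF-like behaviour).
config = DeepseekV3Config(use_flash_attention=True)
d = config.to_dict()
assert d.get("use_flash_attention") is True   # registered MF parameter flows through
assert "compute_dtype" in d                   # MF default injected by register_mf_parameter

config = DeepseekV3Config(use_cache=True)
# use_cache is on IGNORE_COMMON_HF_PARAMETER: the decorator drops the passed value
# and deletes the attribute after __init__, so print(config.use_cache) reflects
# whatever (if anything) the base PretrainedConfig defines, not the True passed in.
```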
Comments (0)
Assignee: hsshuai (hss-shuai)