From 51c208caaec3269058a4285f6a8370e7b79af905 Mon Sep 17 00:00:00 2001
From: mr-lin314 <798948055@qq.com>
Date: Wed, 28 May 2025 10:46:36 +0800
Subject: [PATCH 1/4] Add qwenvl resharding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../vllm_adapter/megatron_weight_loaders.py   |  35 ++
 .../resharding/vllm_weight_container.py       |   8 +-
 .../workers/resharding/weight_adaptor.py      | 307 ++++++++++++++++++
 3 files changed, 348 insertions(+), 2 deletions(-)

diff --git a/mindspeed_rl/models/rollout/vllm_adapter/megatron_weight_loaders.py b/mindspeed_rl/models/rollout/vllm_adapter/megatron_weight_loaders.py
index a6264f29..22a1b90d 100644
--- a/mindspeed_rl/models/rollout/vllm_adapter/megatron_weight_loaders.py
+++ b/mindspeed_rl/models/rollout/vllm_adapter/megatron_weight_loaders.py
@@ -72,6 +72,38 @@ def qwen_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module,
     return vllm_model
 
 
+def qwen_vl_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module,
+                                   infer_paralle_config: InferParallelConfig, hf_config: PretrainedConfig) -> nn.Module:
+    params_dict = dict(vllm_model.named_parameters())
+    vision_config = type('obj', (object,), {
+        'num_attention_heads': hf_config.vision_config.num_heads,
+        'num_key_value_heads': hf_config.vision_config.num_heads,
+    })
+
+    for name, loaded_weight in actor_weights.items():
+        if name not in params_dict.keys():
+            continue
+        if "qkv" in name:
+            if 'visual' in name:
+                if name.endswith('.bias'):
+                    q_weight, k_weight, v_weight = qkv_split_bias(loaded_weight, infer_paralle_config, vision_config)
+                    loaded_weight.copy_(torch.cat([q_weight, k_weight, v_weight], dim=0))
+                else:
+                    q_weight, k_weight, v_weight = qkv_split_weight(loaded_weight, infer_paralle_config, vision_config)
+                    loaded_weight.copy_(torch.cat([q_weight, k_weight, v_weight], dim=0))
+            else:
+                if name.endswith('.bias'):
+                    q_weight, k_weight, v_weight = qkv_split_bias(loaded_weight, infer_paralle_config, hf_config)
+                    loaded_weight.copy_(torch.cat([q_weight, k_weight, v_weight], dim=0))
+                else:
+                    q_weight, k_weight, v_weight = qkv_split_weight(loaded_weight, infer_paralle_config, hf_config)
+                    loaded_weight.copy_(torch.cat([q_weight, k_weight, v_weight], dim=0))
+
+        load_single_weight(params_dict, name, loaded_weight)
+
+    return vllm_model
+
+
 def deepseek_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module,
                                     infer_paralle_config: InferParallelConfig, hf_config: PretrainedConfig
                                     ) -> nn.Module:
@@ -176,6 +208,9 @@ MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY = {
     "LlamaForCausalLM": llama_megatron_core_weight_loader,
     "Qwen2ForCausalLM": qwen_megatron_weight_loader,
     "DeepseekV3ForCausalLM": deepseek_megatron_weight_loader,
+    "Qwen2VLForConditionalGeneration": qwen_vl_megatron_weight_loader,
+    "CustomQwen2VLForConditionalGeneration": qwen_vl_megatron_weight_loader,
+    "Qwen2_5_VLForConditionalGeneration": qwen_vl_megatron_weight_loader,
 }
 
 
diff --git a/mindspeed_rl/workers/resharding/vllm_weight_container.py b/mindspeed_rl/workers/resharding/vllm_weight_container.py
index a783e906..48c41b75 100644
--- a/mindspeed_rl/workers/resharding/vllm_weight_container.py
+++ b/mindspeed_rl/workers/resharding/vllm_weight_container.py
@@ -19,6 +19,7 @@ This file contains a Megatron style Hybrid Model that shares the weights of the
 import hashlib
 import re
 from functools import partial
+from typing import List
 
 import torch
 import torch.distributed as dist
@@ -156,6 +157,9 @@ class MegatronStyleVllmWeightContainer:
 
     def 
_build_num_layer_list(self, num_layer_list): if num_layer_list: + # multimodal num_layer_list is a list of lists, including vit and llm layers + if isinstance(num_layer_list[0], List): + return num_layer_list return [int(num_layers) for num_layers in num_layer_list.split(',')] if self._num_hidden_layers % self._pp_size != 0: raise ValueError("num_layers % pp_size == 0, please specify num_layer_list") @@ -290,7 +294,7 @@ class MegatronStyleVllmWeightContainer: self.origin_params_for_md5 = hashlib.md5() self.infer_params_for_md5 = [hashlib.md5() for _ in range(get_tp_allgather_world_size())] for hf_name, megatron_name in name_pairs: - if megatron_name.endswith("linear_fc1.weight"): + if not megatron_name.startswith("image_encoder") and megatron_name.endswith("linear_fc1.weight"): fc2_name = megatron_name.replace("linear_fc1", "linear_fc2") megatron_param_fc1 = dict(true_megatron_model.named_parameters())[megatron_name] megatron_param_fc2 = dict(true_megatron_model.named_parameters())[fc2_name] @@ -405,7 +409,7 @@ class MegatronStyleVllmWeightContainer: we can throw an error to force user disable TP HybridEngine. """ - if "linear_fc1.weight" in name: + if 'projector' not in name and 'linear_fc1' in name: # if the tensor is gate and proj gate_lst = [] up_lst = [] diff --git a/mindspeed_rl/workers/resharding/weight_adaptor.py b/mindspeed_rl/workers/resharding/weight_adaptor.py index 19ea46c2..178af327 100644 --- a/mindspeed_rl/workers/resharding/weight_adaptor.py +++ b/mindspeed_rl/workers/resharding/weight_adaptor.py @@ -235,10 +235,317 @@ class QwenMVWeightAdaptor(MegatronVLLMWeightAdaptor): super(QwenMVWeightAdaptor, self).__init__(model_config) +class Qwen2VLWeightAdaptor(MegatronVLLMWeightAdaptor): + """ + Megatron-vLLM WeightAdaptor for Qwen2VL model architectures. 
+ """ + def __init__(self, model_config): + super(Qwen2VLWeightAdaptor, self).__init__(model_config) + self.params_mapping = [ + ("text_decoder.embedding.word_embeddings", "language_model.model.embed_tokens"), + ("text_decoder.decoder.layers.*.self_attention.linear_qkv", "language_model.model.layers.*.self_attn.qkv_proj"), + ("text_decoder.decoder.layers.*.self_attention.linear_proj", "language_model.model.layers.*.self_attn.o_proj"), + ("text_decoder.decoder.layers.*.input_layernorm", "language_model.model.layers.*.input_layernorm"), + ("text_decoder.decoder.layers.*.pre_mlp_layernorm", "language_model.model.layers.*.post_attention_layernorm"), + ("text_decoder.decoder.layers.*.mlp.linear_fc1", "language_model.model.layers.*.mlp.gate_up_proj"), + ("text_decoder.decoder.layers.*.mlp.linear_fc2", "language_model.model.layers.*.mlp.down_proj"), + ("text_decoder.decoder.final_layernorm", "language_model.model.norm"), + ("text_decoder.output_layer", "language_model.lm_head"), + ("image_encoder.encoder.patch_embed.proj", "visual.patch_embed.proj"), + ("image_encoder.encoder.blocks.layers.*.self_attention.linear_qkv", "visual.blocks.*.attn.qkv"), + ("image_encoder.encoder.blocks.layers.*.self_attention.linear_proj", "visual.blocks.*.attn.proj"), + ("image_encoder.encoder.blocks.layers.*.input_layernorm", "visual.blocks.*.norm1"), + ("image_encoder.encoder.blocks.layers.*.pre_mlp_layernorm", "visual.blocks.*.norm2"), + ("image_encoder.encoder.blocks.layers.*.mlp.linear_fc1", "visual.blocks.*.mlp.fc1"), + ("image_encoder.encoder.blocks.layers.*.mlp.linear_fc2", "visual.blocks.*.mlp.fc2"), + ("image_encoder.projector.layernorm", "visual.merger.ln_q"), + ("image_encoder.projector.encoder.linear_fc1", "visual.merger.mlp.0"), + ("image_encoder.projector.encoder.linear_fc2", "visual.merger.mlp.2"), + ] + + def replace_name_i2t(self, inference_name): + weight_suffix = "" + if inference_name.endswith(".weight"): + weight_suffix = ".weight" + base_name = inference_name[:-7] + elif inference_name.endswith(".bias"): + weight_suffix = ".bias" + base_name = inference_name[:-5] + else: + base_name = inference_name + + for megatron_pattern, vllm_pattern in self.params_mapping: + vllm_regex = vllm_pattern.replace("*", "(\\d+)") + match = re.match(f"^{vllm_regex}(.*)?$", base_name) + if match: + digits = match.groups() + extra_suffix = match.groups()[-1] if match.groups() and match.groups()[-1] is not None else "" + + megatron_result = megatron_pattern + for i, digit in enumerate(digits[:-1] if extra_suffix else digits): + if digit is not None: + megatron_result = megatron_result.replace("*", digit, 1) + + return megatron_result + extra_suffix + weight_suffix + + return inference_name + + @staticmethod + def _convert_global_to_local_index(name, layer_keyword, pp_layers): + """ + Convert global layer index to local layer index for a given layer type. 
+ + Args: + name: Weight name containing layer information + layer_keyword: Layer type keyword ('blocks' for visual, 'layers' for language model) + pp_layers: List of layer counts per pipeline parallel rank + + Returns: + Updated weight name with local layer index + """ + split_name = name.split('.') + + # Find the position of layer keyword + layer_keyword_idx = -1 + for i, name_part in enumerate(split_name): + if name_part == layer_keyword: + layer_keyword_idx = i + break + + if layer_keyword_idx == -1: + return name + + layer_num_idx = layer_keyword_idx + 1 + if len(split_name) < layer_num_idx + 1 or not split_name[layer_num_idx].isdigit(): + raise ValueError(f'Invalid {layer_keyword} name: {split_name}') + + global_idx = int(split_name[layer_num_idx]) + + # Calculate local index + cumulative_layers = 0 + for layers_in_pp_rank in pp_layers: + if layers_in_pp_rank == 0: + continue + if cumulative_layers <= global_idx < cumulative_layers + layers_in_pp_rank: + local_index = global_idx - cumulative_layers + split_name[layer_num_idx] = str(local_index) + return '.'.join(split_name) + cumulative_layers += layers_in_pp_rank + + raise ValueError(f'Could not map {layer_keyword} {global_idx} to a local index with distribution {pp_layers}') + + @staticmethod + def global2local_layer(name, num_layer_list): + """ + Transform layer names from global space to local space for Qwen2VL models. + Supports both visual blocks and language model layers. + + Args: + name: Weight name to transform + num_layer_list: [img_pp_layers, llm_pp_layers] distribution + + Returns: + Transformed weight name with local layer indices + """ + img_pp_layers, llm_pp_layers = num_layer_list + + if name.startswith('visual') and 'blocks' in name: + return Qwen2VLWeightAdaptor._convert_global_to_local_index(name, 'blocks', img_pp_layers) + elif name.startswith('language_model') and 'layers' in name: + return Qwen2VLWeightAdaptor._convert_global_to_local_index(name, 'layers', llm_pp_layers) + + return name + + @staticmethod + def _categorize_weights(vllm_names): + """ + Categorize weight names by their types for easier processing. + + Args: + vllm_names: List of vLLM weight names + + Returns: + Dictionary containing categorized weight names + """ + visual_weights = [] + lang_weights = [] + visual_pre_layer_weights = [] + visual_post_layer_weights = [] + lang_pre_layer_weights = [] + lang_post_layer_weights = [] + + for name in vllm_names: + if name.startswith('visual'): + if 'blocks' not in name: + if 'patch_embed' in name: + visual_pre_layer_weights.append(name) + elif 'merger' in name: + visual_post_layer_weights.append(name) + else: + visual_weights.append(name) + elif name.startswith('language_model'): + if 'layers' not in name: + if 'embed_tokens' in name: + lang_pre_layer_weights.append(name) + else: + lang_post_layer_weights.append(name) + else: + lang_weights.append(name) + + return { + 'visual_weights': visual_weights, + 'lang_weights': lang_weights, + 'visual_pre_layer_weights': visual_pre_layer_weights, + 'visual_post_layer_weights': visual_post_layer_weights, + 'lang_pre_layer_weights': lang_pre_layer_weights, + 'lang_post_layer_weights': lang_post_layer_weights + } + + @staticmethod + def _calculate_layer_ranges(pp_layers): + """ + Calculate layer ranges for each pipeline parallel stage. 
+ + Args: + pp_layers: List of layer counts per pipeline parallel rank + + Returns: + List of (start_layer, end_layer) tuples for each rank + """ + layer_ranges = [] + start_layer = 0 + for layers_in_pp_rank in pp_layers: + if layers_in_pp_rank > 0: + layer_ranges.append((start_layer, start_layer + layers_in_pp_rank - 1)) + start_layer += layers_in_pp_rank + else: + layer_ranges.append((-1, -1)) + return layer_ranges + + @staticmethod + def _find_last_rank(pp_layers): + """ + Find the last pipeline parallel rank that has non-zero layers. + """ + for i in range(len(pp_layers) - 1, -1, -1): + if pp_layers[i] > 0: + return i + return -1 + + @staticmethod + def _find_first_rank(pp_layers): + """ + Find the first pipeline parallel rank that has non-zero layers. + """ + for i in range(len(pp_layers)): + if pp_layers[i] > 0: + return i + return -1 + + @staticmethod + def _assign_layer_weights(weight_names_per_pp, weights, layer_ranges, layer_keyword): + """ + Assign layer weights to their corresponding pipeline parallel stages. + """ + for pp_rank, (start_layer, end_layer) in enumerate(layer_ranges): + if start_layer >= 0 and end_layer >= 0: + for name in weights: + match = re.match(rf'.*\.{layer_keyword}\.(\d+)', name) + if match: + layer_num = int(match.group(1)) + if start_layer <= layer_num <= end_layer: + weight_names_per_pp[pp_rank].append(name) + + @staticmethod + def get_weight_names_per_pp(layer_list, vllm_names): + """ + Get weight names for each pipeline parallel stage optimized for Qwen2VL models. + + Args: + layer_list: [img_pp_layers, llm_pp_layers] distribution + vllm_names: List of vLLM weight names + + Returns: + List of weight names for each pipeline parallel rank + """ + img_pp_layers, llm_pp_layers = layer_list + pp_size = len(img_pp_layers) + + weight_categories = Qwen2VLWeightAdaptor._categorize_weights(vllm_names) + + img_blocks_range = Qwen2VLWeightAdaptor._calculate_layer_ranges(img_pp_layers) + llm_layers_range = Qwen2VLWeightAdaptor._calculate_layer_ranges(llm_pp_layers) + + weight_names_per_pp = [[] for _ in range(pp_size)] + + last_img_rank = Qwen2VLWeightAdaptor._find_last_rank(img_pp_layers) + first_llm_rank = Qwen2VLWeightAdaptor._find_first_rank(llm_pp_layers) + last_llm_rank = Qwen2VLWeightAdaptor._find_last_rank(llm_pp_layers) + + # Process visual weights + for pp_rank in range(pp_size): + start_layer, end_layer = img_blocks_range[pp_rank] + + if start_layer == 0 and end_layer >= 0: + weight_names_per_pp[pp_rank].extend(weight_categories['visual_pre_layer_weights']) + + if pp_rank == last_img_rank: + weight_names_per_pp[pp_rank].extend(weight_categories['visual_post_layer_weights']) + + # Assign visual layer weights + Qwen2VLWeightAdaptor._assign_layer_weights( + weight_names_per_pp, weight_categories['visual_weights'], img_blocks_range, 'blocks' + ) + + # Process language model weights + for pp_rank in range(pp_size): + if pp_rank == first_llm_rank: + weight_names_per_pp[pp_rank].extend(weight_categories['lang_pre_layer_weights']) + + if pp_rank == last_llm_rank: + weight_names_per_pp[pp_rank].extend(weight_categories['lang_post_layer_weights']) + + # Assign language model layer weights + Qwen2VLWeightAdaptor._assign_layer_weights( + weight_names_per_pp, weight_categories['lang_weights'], llm_layers_range, 'layers' + ) + + return weight_names_per_pp + + +class Qwen2_5_VLWeightAdaptor(Qwen2VLWeightAdaptor): + def __init__(self, model_config): + super(Qwen2_5_VLWeightAdaptor, self).__init__(model_config) + self.params_mapping = [ + 
("text_decoder.embedding.word_embeddings", "language_model.model.embed_tokens"), + ("text_decoder.decoder.layers.*.self_attention.linear_qkv", "language_model.model.layers.*.self_attn.qkv_proj"), + ("text_decoder.decoder.layers.*.self_attention.linear_proj", "language_model.model.layers.*.self_attn.o_proj"), + ("text_decoder.decoder.layers.*.input_layernorm", "language_model.model.layers.*.input_layernorm"), + ("text_decoder.decoder.layers.*.pre_mlp_layernorm", "language_model.model.layers.*.post_attention_layernorm"), + ("text_decoder.decoder.layers.*.mlp.linear_fc1", "language_model.model.layers.*.mlp.gate_up_proj"), + ("text_decoder.decoder.layers.*.mlp.linear_fc2", "language_model.model.layers.*.mlp.down_proj"), + ("text_decoder.decoder.final_layernorm", "language_model.model.norm"), + ("text_decoder.output_layer", "language_model.lm_head"), + ("image_encoder.encoder.patch_embed.proj", "visual.patch_embed.proj"), + ("image_encoder.encoder.blocks.layers.*.self_attention.linear_qkv", "visual.blocks.*.attn.qkv"), + ("image_encoder.encoder.blocks.layers.*.self_attention.linear_proj", "visual.blocks.*.attn.proj"), + ("image_encoder.encoder.blocks.layers.*.input_layernorm", "visual.blocks.*.norm1"), + ("image_encoder.encoder.blocks.layers.*.pre_mlp_layernorm", "visual.blocks.*.norm2"), + ("image_encoder.encoder.blocks.layers.*.mlp.linear_fc1", "visual.blocks.*.mlp.gate_up_proj"), + ("image_encoder.encoder.blocks.layers.*.mlp.linear_fc2", "visual.blocks.*.mlp.down_proj"), + ("image_encoder.projector.layernorm", "visual.merger.ln_q"), + ("image_encoder.projector.encoder.linear_fc1", "visual.merger.mlp.0"), + ("image_encoder.projector.encoder.linear_fc2", "visual.merger.mlp.2"), + ] + + WEIGHT_ADAPTOR_REGISTRY = { "Qwen2ForCausalLM": QwenMVWeightAdaptor, "DeepseekV3ForCausalLM": DeepSeekMVWeightAdaptor, "DeepseekV2ForCausalLM": DeepSeekMVWeightAdaptor, + "Qwen2VLForConditionalGeneration": Qwen2VLWeightAdaptor, + "CustomQwen2VLForConditionalGeneration": Qwen2VLWeightAdaptor, + "Qwen2_5_VLForConditionalGeneration": Qwen2_5_VLWeightAdaptor, } -- Gitee From 250fcc3363a79e647f7dc476f5751ce29f030f5b Mon Sep 17 00:00:00 2001 From: mr-lin314 <798948055@qq.com> Date: Wed, 28 May 2025 17:05:47 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E6=B7=BB=E5=8A=A0qwenvl=20resharding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../vllm_adapter/megatron_weight_loaders.py | 2 - .../workers/resharding/weight_adaptor.py | 59 +++++-------------- 2 files changed, 14 insertions(+), 47 deletions(-) diff --git a/mindspeed_rl/models/rollout/vllm_adapter/megatron_weight_loaders.py b/mindspeed_rl/models/rollout/vllm_adapter/megatron_weight_loaders.py index 22a1b90d..ee4262d4 100644 --- a/mindspeed_rl/models/rollout/vllm_adapter/megatron_weight_loaders.py +++ b/mindspeed_rl/models/rollout/vllm_adapter/megatron_weight_loaders.py @@ -208,8 +208,6 @@ MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY = { "LlamaForCausalLM": llama_megatron_core_weight_loader, "Qwen2ForCausalLM": qwen_megatron_weight_loader, "DeepseekV3ForCausalLM": deepseek_megatron_weight_loader, - "Qwen2VLForConditionalGeneration": qwen_vl_megatron_weight_loader, - "CustomQwen2VLForConditionalGeneration": qwen_vl_megatron_weight_loader, "Qwen2_5_VLForConditionalGeneration": qwen_vl_megatron_weight_loader, } diff --git a/mindspeed_rl/workers/resharding/weight_adaptor.py b/mindspeed_rl/workers/resharding/weight_adaptor.py index 178af327..fc6e85fc 100644 --- a/mindspeed_rl/workers/resharding/weight_adaptor.py +++ 
b/mindspeed_rl/workers/resharding/weight_adaptor.py @@ -235,12 +235,9 @@ class QwenMVWeightAdaptor(MegatronVLLMWeightAdaptor): super(QwenMVWeightAdaptor, self).__init__(model_config) -class Qwen2VLWeightAdaptor(MegatronVLLMWeightAdaptor): - """ - Megatron-vLLM WeightAdaptor for Qwen2VL model architectures. - """ +class Qwen2_5_VLWeightAdaptor(MegatronVLLMWeightAdaptor): def __init__(self, model_config): - super(Qwen2VLWeightAdaptor, self).__init__(model_config) + super(Qwen2_5_VLWeightAdaptor, self).__init__(model_config) self.params_mapping = [ ("text_decoder.embedding.word_embeddings", "language_model.model.embed_tokens"), ("text_decoder.decoder.layers.*.self_attention.linear_qkv", "language_model.model.layers.*.self_attn.qkv_proj"), @@ -256,8 +253,8 @@ class Qwen2VLWeightAdaptor(MegatronVLLMWeightAdaptor): ("image_encoder.encoder.blocks.layers.*.self_attention.linear_proj", "visual.blocks.*.attn.proj"), ("image_encoder.encoder.blocks.layers.*.input_layernorm", "visual.blocks.*.norm1"), ("image_encoder.encoder.blocks.layers.*.pre_mlp_layernorm", "visual.blocks.*.norm2"), - ("image_encoder.encoder.blocks.layers.*.mlp.linear_fc1", "visual.blocks.*.mlp.fc1"), - ("image_encoder.encoder.blocks.layers.*.mlp.linear_fc2", "visual.blocks.*.mlp.fc2"), + ("image_encoder.encoder.blocks.layers.*.mlp.linear_fc1", "visual.blocks.*.mlp.gate_up_proj"), + ("image_encoder.encoder.blocks.layers.*.mlp.linear_fc2", "visual.blocks.*.mlp.down_proj"), ("image_encoder.projector.layernorm", "visual.merger.ln_q"), ("image_encoder.projector.encoder.linear_fc1", "visual.merger.mlp.0"), ("image_encoder.projector.encoder.linear_fc2", "visual.merger.mlp.2"), @@ -350,9 +347,9 @@ class Qwen2VLWeightAdaptor(MegatronVLLMWeightAdaptor): img_pp_layers, llm_pp_layers = num_layer_list if name.startswith('visual') and 'blocks' in name: - return Qwen2VLWeightAdaptor._convert_global_to_local_index(name, 'blocks', img_pp_layers) + return Qwen2_5_VLWeightAdaptor._convert_global_to_local_index(name, 'blocks', img_pp_layers) elif name.startswith('language_model') and 'layers' in name: - return Qwen2VLWeightAdaptor._convert_global_to_local_index(name, 'layers', llm_pp_layers) + return Qwen2_5_VLWeightAdaptor._convert_global_to_local_index(name, 'layers', llm_pp_layers) return name @@ -471,16 +468,16 @@ class Qwen2VLWeightAdaptor(MegatronVLLMWeightAdaptor): img_pp_layers, llm_pp_layers = layer_list pp_size = len(img_pp_layers) - weight_categories = Qwen2VLWeightAdaptor._categorize_weights(vllm_names) + weight_categories = Qwen2_5_VLWeightAdaptor._categorize_weights(vllm_names) - img_blocks_range = Qwen2VLWeightAdaptor._calculate_layer_ranges(img_pp_layers) - llm_layers_range = Qwen2VLWeightAdaptor._calculate_layer_ranges(llm_pp_layers) + img_blocks_range = Qwen2_5_VLWeightAdaptor._calculate_layer_ranges(img_pp_layers) + llm_layers_range = Qwen2_5_VLWeightAdaptor._calculate_layer_ranges(llm_pp_layers) weight_names_per_pp = [[] for _ in range(pp_size)] - last_img_rank = Qwen2VLWeightAdaptor._find_last_rank(img_pp_layers) - first_llm_rank = Qwen2VLWeightAdaptor._find_first_rank(llm_pp_layers) - last_llm_rank = Qwen2VLWeightAdaptor._find_last_rank(llm_pp_layers) + last_img_rank = Qwen2_5_VLWeightAdaptor._find_last_rank(img_pp_layers) + first_llm_rank = Qwen2_5_VLWeightAdaptor._find_first_rank(llm_pp_layers) + last_llm_rank = Qwen2_5_VLWeightAdaptor._find_last_rank(llm_pp_layers) # Process visual weights for pp_rank in range(pp_size): @@ -493,7 +490,7 @@ class Qwen2VLWeightAdaptor(MegatronVLLMWeightAdaptor): 
weight_names_per_pp[pp_rank].extend(weight_categories['visual_post_layer_weights']) # Assign visual layer weights - Qwen2VLWeightAdaptor._assign_layer_weights( + Qwen2_5_VLWeightAdaptor._assign_layer_weights( weight_names_per_pp, weight_categories['visual_weights'], img_blocks_range, 'blocks' ) @@ -506,45 +503,17 @@ class Qwen2VLWeightAdaptor(MegatronVLLMWeightAdaptor): weight_names_per_pp[pp_rank].extend(weight_categories['lang_post_layer_weights']) # Assign language model layer weights - Qwen2VLWeightAdaptor._assign_layer_weights( + Qwen2_5_VLWeightAdaptor._assign_layer_weights( weight_names_per_pp, weight_categories['lang_weights'], llm_layers_range, 'layers' ) return weight_names_per_pp -class Qwen2_5_VLWeightAdaptor(Qwen2VLWeightAdaptor): - def __init__(self, model_config): - super(Qwen2_5_VLWeightAdaptor, self).__init__(model_config) - self.params_mapping = [ - ("text_decoder.embedding.word_embeddings", "language_model.model.embed_tokens"), - ("text_decoder.decoder.layers.*.self_attention.linear_qkv", "language_model.model.layers.*.self_attn.qkv_proj"), - ("text_decoder.decoder.layers.*.self_attention.linear_proj", "language_model.model.layers.*.self_attn.o_proj"), - ("text_decoder.decoder.layers.*.input_layernorm", "language_model.model.layers.*.input_layernorm"), - ("text_decoder.decoder.layers.*.pre_mlp_layernorm", "language_model.model.layers.*.post_attention_layernorm"), - ("text_decoder.decoder.layers.*.mlp.linear_fc1", "language_model.model.layers.*.mlp.gate_up_proj"), - ("text_decoder.decoder.layers.*.mlp.linear_fc2", "language_model.model.layers.*.mlp.down_proj"), - ("text_decoder.decoder.final_layernorm", "language_model.model.norm"), - ("text_decoder.output_layer", "language_model.lm_head"), - ("image_encoder.encoder.patch_embed.proj", "visual.patch_embed.proj"), - ("image_encoder.encoder.blocks.layers.*.self_attention.linear_qkv", "visual.blocks.*.attn.qkv"), - ("image_encoder.encoder.blocks.layers.*.self_attention.linear_proj", "visual.blocks.*.attn.proj"), - ("image_encoder.encoder.blocks.layers.*.input_layernorm", "visual.blocks.*.norm1"), - ("image_encoder.encoder.blocks.layers.*.pre_mlp_layernorm", "visual.blocks.*.norm2"), - ("image_encoder.encoder.blocks.layers.*.mlp.linear_fc1", "visual.blocks.*.mlp.gate_up_proj"), - ("image_encoder.encoder.blocks.layers.*.mlp.linear_fc2", "visual.blocks.*.mlp.down_proj"), - ("image_encoder.projector.layernorm", "visual.merger.ln_q"), - ("image_encoder.projector.encoder.linear_fc1", "visual.merger.mlp.0"), - ("image_encoder.projector.encoder.linear_fc2", "visual.merger.mlp.2"), - ] - - WEIGHT_ADAPTOR_REGISTRY = { "Qwen2ForCausalLM": QwenMVWeightAdaptor, "DeepseekV3ForCausalLM": DeepSeekMVWeightAdaptor, "DeepseekV2ForCausalLM": DeepSeekMVWeightAdaptor, - "Qwen2VLForConditionalGeneration": Qwen2VLWeightAdaptor, - "CustomQwen2VLForConditionalGeneration": Qwen2VLWeightAdaptor, "Qwen2_5_VLForConditionalGeneration": Qwen2_5_VLWeightAdaptor, } -- Gitee From 85164ce5a3de117744ba985823fb7239628f09cd Mon Sep 17 00:00:00 2001 From: mr-lin314 <798948055@qq.com> Date: Thu, 29 May 2025 11:48:00 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E6=B7=BB=E5=8A=A0qwenvl=20resharding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../workers/resharding/weight_adaptor.py | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/mindspeed_rl/workers/resharding/weight_adaptor.py b/mindspeed_rl/workers/resharding/weight_adaptor.py index fc6e85fc..86a60d3d 100644 --- 
a/mindspeed_rl/workers/resharding/weight_adaptor.py +++ b/mindspeed_rl/workers/resharding/weight_adaptor.py @@ -240,21 +240,21 @@ class Qwen2_5_VLWeightAdaptor(MegatronVLLMWeightAdaptor): super(Qwen2_5_VLWeightAdaptor, self).__init__(model_config) self.params_mapping = [ ("text_decoder.embedding.word_embeddings", "language_model.model.embed_tokens"), - ("text_decoder.decoder.layers.*.self_attention.linear_qkv", "language_model.model.layers.*.self_attn.qkv_proj"), - ("text_decoder.decoder.layers.*.self_attention.linear_proj", "language_model.model.layers.*.self_attn.o_proj"), - ("text_decoder.decoder.layers.*.input_layernorm", "language_model.model.layers.*.input_layernorm"), - ("text_decoder.decoder.layers.*.pre_mlp_layernorm", "language_model.model.layers.*.post_attention_layernorm"), - ("text_decoder.decoder.layers.*.mlp.linear_fc1", "language_model.model.layers.*.mlp.gate_up_proj"), - ("text_decoder.decoder.layers.*.mlp.linear_fc2", "language_model.model.layers.*.mlp.down_proj"), + ("text_decoder.decoder.layers.{layer_num}.self_attention.linear_qkv", "language_model.model.layers.{layer_num}.self_attn.qkv_proj"), + ("text_decoder.decoder.layers.{layer_num}.self_attention.linear_proj", "language_model.model.layers.{layer_num}.self_attn.o_proj"), + ("text_decoder.decoder.layers.{layer_num}.input_layernorm", "language_model.model.layers.{layer_num}.input_layernorm"), + ("text_decoder.decoder.layers.{layer_num}.pre_mlp_layernorm", "language_model.model.layers.{layer_num}.post_attention_layernorm"), + ("text_decoder.decoder.layers.{layer_num}.mlp.linear_fc1", "language_model.model.layers.{layer_num}.mlp.gate_up_proj"), + ("text_decoder.decoder.layers.{layer_num}.mlp.linear_fc2", "language_model.model.layers.{layer_num}.mlp.down_proj"), ("text_decoder.decoder.final_layernorm", "language_model.model.norm"), ("text_decoder.output_layer", "language_model.lm_head"), ("image_encoder.encoder.patch_embed.proj", "visual.patch_embed.proj"), - ("image_encoder.encoder.blocks.layers.*.self_attention.linear_qkv", "visual.blocks.*.attn.qkv"), - ("image_encoder.encoder.blocks.layers.*.self_attention.linear_proj", "visual.blocks.*.attn.proj"), - ("image_encoder.encoder.blocks.layers.*.input_layernorm", "visual.blocks.*.norm1"), - ("image_encoder.encoder.blocks.layers.*.pre_mlp_layernorm", "visual.blocks.*.norm2"), - ("image_encoder.encoder.blocks.layers.*.mlp.linear_fc1", "visual.blocks.*.mlp.gate_up_proj"), - ("image_encoder.encoder.blocks.layers.*.mlp.linear_fc2", "visual.blocks.*.mlp.down_proj"), + ("image_encoder.encoder.blocks.layers.{layer_num}.self_attention.linear_qkv", "visual.blocks.{layer_num}.attn.qkv"), + ("image_encoder.encoder.blocks.layers.{layer_num}.self_attention.linear_proj", "visual.blocks.{layer_num}.attn.proj"), + ("image_encoder.encoder.blocks.layers.{layer_num}.input_layernorm", "visual.blocks.{layer_num}.norm1"), + ("image_encoder.encoder.blocks.layers.{layer_num}.pre_mlp_layernorm", "visual.blocks.{layer_num}.norm2"), + ("image_encoder.encoder.blocks.layers.{layer_num}.mlp.linear_fc1", "visual.blocks.{layer_num}.mlp.gate_up_proj"), + ("image_encoder.encoder.blocks.layers.{layer_num}.mlp.linear_fc2", "visual.blocks.{layer_num}.mlp.down_proj"), ("image_encoder.projector.layernorm", "visual.merger.ln_q"), ("image_encoder.projector.encoder.linear_fc1", "visual.merger.mlp.0"), ("image_encoder.projector.encoder.linear_fc2", "visual.merger.mlp.2"), @@ -272,16 +272,16 @@ class Qwen2_5_VLWeightAdaptor(MegatronVLLMWeightAdaptor): base_name = inference_name for megatron_pattern, vllm_pattern 
in self.params_mapping:
-            vllm_regex = vllm_pattern.replace("*", "(\\d+)")
-            match = re.match(f"^{vllm_regex}(.*)?$", base_name)
+            vllm_regex = vllm_pattern.replace("{layer_num}", r"(\d+)")
+            match = re.match(f"^{vllm_regex}(.*)$", base_name)
             if match:
-                digits = match.groups()
-                extra_suffix = match.groups()[-1] if match.groups() and match.groups()[-1] is not None else ""
+                groups = match.groups()
+                layer_nums = [g for g in groups[:-1] if g is not None and g.isdigit()]
+                extra_suffix = groups[-1] if groups and groups[-1] is not None else ""
 
                 megatron_result = megatron_pattern
-                for i, digit in enumerate(digits[:-1] if extra_suffix else digits):
-                    if digit is not None:
-                        megatron_result = megatron_result.replace("*", digit, 1)
+                for layer_num in layer_nums:
+                    megatron_result = megatron_result.replace("{layer_num}", layer_num, 1)
 
                 return megatron_result + extra_suffix + weight_suffix
 
@@ -434,8 +434,8 @@ class Qwen2_5_VLWeightAdaptor(MegatronVLLMWeightAdaptor):
         """
         Find the first pipeline parallel rank that has non-zero layers.
         """
-        for i in range(len(pp_layers)):
-            if pp_layers[i] > 0:
+        for i, layers in enumerate(pp_layers):
+            if layers > 0:
                 return i
         return -1
 
-- 
Gitee

From 28293737c9f133c292135e234cc90d9f248b6bb5 Mon Sep 17 00:00:00 2001
From: mr-lin314 <798948055@qq.com>
Date: Mon, 9 Jun 2025 14:37:41 +0800
Subject: [PATCH 4/4] Resolve conflicts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../rollout/vllm_adapter/megatron_weight_loaders.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/mindspeed_rl/models/rollout/vllm_adapter/megatron_weight_loaders.py b/mindspeed_rl/models/rollout/vllm_adapter/megatron_weight_loaders.py
index fcb10eab..5ace26b0 100644
--- a/mindspeed_rl/models/rollout/vllm_adapter/megatron_weight_loaders.py
+++ b/mindspeed_rl/models/rollout/vllm_adapter/megatron_weight_loaders.py
@@ -223,15 +223,3 @@ MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY = {
     "CustomDeepseekV3ForCausalLM": deepseek_megatron_weight_loader,
     "Qwen2_5_VLForConditionalGeneration": qwen_vl_megatron_weight_loader
 }
-
-
-LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY = {
-    ColumnParallelLinear: parallel_weight_loader,
-    MergedColumnParallelLinear: parallel_weight_loader,
-    QKVParallelLinear: parallel_weight_loader,
-    RowParallelLinear: parallel_weight_loader,
-    VocabParallelEmbedding: parallel_weight_loader,
-    ParallelLMHead: parallel_weight_loader,
-    ReplicatedLinear: parallel_weight_loader,
-    FusedMoE: parallel_weight_loader
-}
-- 
Gitee
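
The notes below are editorial sketches of the mechanisms this series introduces; they restate the patched code in isolation, and every layer count, weight name or helper used purely for illustration is hypothetical rather than taken from the repository.

For multimodal models, the num_layer_list consumed by _build_num_layer_list (PATCH 1/4) is a pair of per-rank layer counts, one list for the ViT blocks and one for the LLM layers, instead of the comma-separated string used by text-only models. A minimal sketch of that branching (the error message is reworded here for clarity):

    # Sketch, not the repository code; layer counts are made up for the example.
    def build_num_layer_list(num_layer_list, num_hidden_layers, pp_size):
        if num_layer_list:
            # Multimodal form: [vit_layers_per_pp_rank, llm_layers_per_pp_rank].
            if isinstance(num_layer_list[0], list):
                return num_layer_list
            # Text-only form: comma-separated string such as "8,8,8,8".
            return [int(n) for n in num_layer_list.split(',')]
        if num_hidden_layers % pp_size != 0:
            raise ValueError("num_hidden_layers is not divisible by pp_size; "
                             "please specify num_layer_list explicitly")
        return [num_hidden_layers // pp_size] * pp_size

    print(build_num_layer_list([[16, 16, 0, 0], [0, 0, 18, 18]], 36, 4))  # multimodal
    print(build_num_layer_list("8,8,8,8", 32, 4))                         # text-only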
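
global2local_layer then renames weights from the global HuggingFace layer numbering to the local numbering on whichever pipeline rank owns the layer: visual "blocks" names are resolved against the ViT distribution and language_model "layers" names against the LLM distribution. The arithmetic inside _convert_global_to_local_index reduces to the following sketch; the [16, 16, 0, 0] / [0, 0, 18, 18] split is again only an example:

    # Sketch of the global-to-local index shift; distributions are hypothetical.
    def global_to_local(global_idx, pp_layers):
        cumulative = 0
        for layers_in_rank in pp_layers:
            if layers_in_rank and cumulative <= global_idx < cumulative + layers_in_rank:
                return global_idx - cumulative
            cumulative += layers_in_rank
        raise ValueError(f"layer {global_idx} not covered by distribution {pp_layers}")

    # visual.blocks.20.* with a ViT split of [16, 16, 0, 0] lands on pp rank 1 as local block 4.
    assert global_to_local(20, [16, 16, 0, 0]) == 4
    # language_model.model.layers.19.* with an LLM split of [0, 0, 18, 18] becomes local layer 1.
    assert global_to_local(19, [0, 0, 18, 18]) == 1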
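
replace_name_i2t maps a vLLM parameter name back to its Megatron name by turning each mapping entry into a regular expression. PATCH 3/4 changes the placeholder from "*" to "{layer_num}" so the substitution cannot collide with a literal asterisk and the captured layer numbers stay unambiguous. A reduced, self-contained version with just two of the mapping entries:

    import re

    # Sketch with two entries from the Qwen2.5-VL mapping; the full table is in the patch.
    PARAMS_MAPPING = [
        ("text_decoder.decoder.layers.{layer_num}.self_attention.linear_qkv",
         "language_model.model.layers.{layer_num}.self_attn.qkv_proj"),
        ("image_encoder.encoder.blocks.layers.{layer_num}.mlp.linear_fc1",
         "visual.blocks.{layer_num}.mlp.gate_up_proj"),
    ]

    def vllm_to_megatron(name):
        suffix = ""
        for ending in (".weight", ".bias"):
            if name.endswith(ending):
                name, suffix = name[:-len(ending)], ending
                break
        for megatron_pattern, vllm_pattern in PARAMS_MAPPING:
            regex = vllm_pattern.replace("{layer_num}", r"(\d+)")
            match = re.match(f"^{regex}(.*)$", name)
            if match:
                groups = match.groups()
                result = megatron_pattern
                for layer_num in groups[:-1]:
                    result = result.replace("{layer_num}", layer_num, 1)
                return result + groups[-1] + suffix
        return name + suffix

    assert vllm_to_megatron("visual.blocks.3.mlp.gate_up_proj.weight") == \
        "image_encoder.encoder.blocks.layers.3.mlp.linear_fc1.weight"
    assert vllm_to_megatron("language_model.model.layers.7.self_attn.qkv_proj.bias") == \
        "text_decoder.decoder.layers.7.self_attention.linear_qkv.bias"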
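
get_weight_names_per_pp then buckets every vLLM weight name onto a pipeline rank: layer weights go to the rank whose range covers their global index, the patch_embed and embed_tokens weights go to the first rank that holds any layer of the corresponding tower, and the merger, final norm and lm_head weights go to the last such rank. Stripped of the name parsing, the placement rule looks like this (per-rank counts are hypothetical):

    # Sketch of where the non-layer ("pre"/"post") weights of each tower are placed.
    def first_rank(pp_layers):
        return next((i for i, n in enumerate(pp_layers) if n > 0), -1)

    def last_rank(pp_layers):
        return next((i for i in range(len(pp_layers) - 1, -1, -1) if pp_layers[i] > 0), -1)

    img_pp_layers = [16, 16, 0, 0]   # ViT blocks per pp rank (example only)
    llm_pp_layers = [0, 0, 18, 18]   # LLM layers per pp rank (example only)

    placement = {
        "visual.patch_embed.*":                   first_rank(img_pp_layers),  # rank 0
        "visual.merger.*":                        last_rank(img_pp_layers),   # rank 1
        "language_model.model.embed_tokens.*":    first_rank(llm_pp_layers),  # rank 2
        "language_model.model.norm.*, lm_head.*": last_rank(llm_pp_layers),   # rank 3
    }
    print(placement)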
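
Finally, the loader added in PATCH 1/4 (and later trimmed to the Qwen2.5-VL entry only) reuses the language-model qkv split helpers for the vision tower by handing them a small ad-hoc config whose key/value head count equals its attention head count, since the ViT uses plain multi-head attention rather than grouped-query attention. For attribute access, the type('obj', (object,), {...}) construct behaves like this more explicit sketch, where hf_config stands for a loaded HuggingFace Qwen2-VL config:

    from types import SimpleNamespace

    # Rough equivalent of the ad-hoc vision attention config built in the loader.
    def make_vision_attn_config(hf_config):
        return SimpleNamespace(
            num_attention_heads=hf_config.vision_config.num_heads,
            num_key_value_heads=hf_config.vision_config.num_heads,
        )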