diff --git a/0001-suit-vllm-0.9.1-qwen3-omni.patch b/0001-suit-vllm-0.9.1-qwen3-omni.patch
new file mode 100644
index 0000000000000000000000000000000000000000..636eef474d906e836e8e648bbcedf5ea04b48b3f
--- /dev/null
+++ b/0001-suit-vllm-0.9.1-qwen3-omni.patch
@@ -0,0 +1,57 @@
+From 22ea5d67f8f1ff13e27d4ea3fde3bc93b40b5327 Mon Sep 17 00:00:00 2001
+From: lvhaoyu
+Date: Sat, 25 Oct 2025 10:55:46 +0800
+Subject: [PATCH] suit vllm 0.9.1 qwen3-omni
+
+---
+ vllm/entrypoints/chat_utils.py          | 6 ++++++
+ vllm/transformers_utils/configs/ovis.py | 2 +-
+ 2 files changed, 7 insertions(+), 1 deletion(-)
+
+diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
+index 95c806c22..b54649d5a 100644
+--- a/vllm/entrypoints/chat_utils.py
++++ b/vllm/entrypoints/chat_utils.py
+@@ -543,6 +543,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
+                 return ""
+             if model_type == "kimi_vl":
+                 return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>"  # noqa: E501
++            if model_type == "qwen3_omni_moe":
++                return "<|vision_start|><|image_pad|><|vision_end|>"
+ 
+             raise TypeError(f"Unknown {modality} model type: {model_type}")
+         elif modality == "audio":
+@@ -555,6 +557,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
+                         f"<|audio_bos|><|AUDIO|><|audio_eos|>")
+             if model_type == "minicpmo":
+                 return "(<audio>./</audio>)"
++            if model_type == "qwen3_omni_moe":
++                return "<|audio_start|><|audio_pad|><|audio_end|>"
+             raise TypeError(f"Unknown model type: {model_type}")
+         elif modality == "video":
+@@ -568,6 +572,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
+             if model_type.startswith("llava"):
+                 return self._cached_token_str(self._tokenizer,
+                                               hf_config.video_token_index)
++            if model_type == "qwen3_omni_moe":
++                return "<|vision_start|><|video_pad|><|vision_end|>"
+             raise TypeError(f"Unknown {modality} model type: {model_type}")
+         else:
+             raise TypeError(f"Unknown modality: {modality}")
+diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py
+index c2728f0ed..db6050fac 100644
+--- a/vllm/transformers_utils/configs/ovis.py
++++ b/vllm/transformers_utils/configs/ovis.py
+@@ -73,7 +73,7 @@ IMAGE_TOKEN = "<image>"
+ IMAGE_ATOM_ID = -300
+ IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305]
+ 
+-AutoConfig.register("aimv2", AIMv2Config)
++# AutoConfig.register("aimv2", AIMv2Config)
+ 
+ 
+ # ----------------------------------------------------------------------
+-- 
+2.25.1
+
diff --git a/transformer_4_57_1.patch b/transformer_4_57_1.patch
new file mode 100644
index 0000000000000000000000000000000000000000..e4fb96c34200d388af3e063402754fe304250f44
--- /dev/null
+++ b/transformer_4_57_1.patch
@@ -0,0 +1,65 @@
+diff --git a/src/transformers/generation/__init__.py b/src/transformers/generation/__init__.py
+index 4fb3d32213..e41c6e4b53 100644
+--- a/src/transformers/generation/__init__.py
++++ b/src/transformers/generation/__init__.py
+@@ -30,8 +30,9 @@ _import_structure = {
+ }
+ 
+ try:
+-    if not is_torch_available():
+-        raise OptionalDependencyNotAvailable()
++    # if not is_torch_available():
++    #     raise OptionalDependencyNotAvailable()
++    ...
+ except OptionalDependencyNotAvailable:
+     pass
+ else:
+diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
+index 4b71712dfc..2c9cec58e3 100644
+--- a/src/transformers/models/auto/image_processing_auto.py
++++ b/src/transformers/models/auto/image_processing_auto.py
+@@ -49,7 +49,8 @@ from .configuration_auto import (
+ logger = logging.get_logger(__name__)
+ 
+ 
+-FORCE_FAST_IMAGE_PROCESSOR = ["Qwen2VLImageProcessor"]
++# FORCE_FAST_IMAGE_PROCESSOR = ["Qwen2VLImageProcessor"]
++FORCE_FAST_IMAGE_PROCESSOR = []
+ 
+ 
+ if TYPE_CHECKING:
+diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
+index 8b40b6535f..a892aaf0e2 100644
+--- a/src/transformers/processing_utils.py
++++ b/src/transformers/processing_utils.py
+@@ -531,7 +531,7 @@ class ProcessorMixin(PushToHubMixin):
+ 
+         # Check each arg is of the proper class (this will also catch a user initializing in the wrong order)
+         for attribute_name, arg in kwargs.items():
+-            self.check_argument_for_proper_class(attribute_name, arg)
++            # self.check_argument_for_proper_class(attribute_name, arg)
+             setattr(self, attribute_name, arg)
+ 
+     def __call__(
+@@ -1430,6 +1430,8 @@ class ProcessorMixin(PushToHubMixin):
+         args = []
+         for attribute_name in cls.attributes:
+             class_name = getattr(cls, f"{attribute_name}_class")
++            if class_name == "AutoVideoProcessor":
++                class_name = "AutoImageProcessor"
+             if isinstance(class_name, tuple):
+                 classes = tuple(cls.get_possibly_dynamic_module(n) if n is not None else None for n in class_name)
+                 if attribute_name == "image_processor":
+diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
+index 2bf0464dbf..1e709c77dd 100644
+--- a/src/transformers/utils/import_utils.py
++++ b/src/transformers/utils/import_utils.py
+@@ -2154,7 +2154,7 @@ class DummyObject(type):
+     def __getattribute__(cls, key):
+         if (key.startswith("_") and key != "_from_config") or key == "is_dummy" or key == "mro" or key == "call":
+             return super().__getattribute__(key)
+-        requires_backends(cls, cls._backends)
++        # requires_backends(cls, cls._backends)
+ 
+ 
+ def is_torch_fx_proxy(x):
diff --git a/vllm_mindspore/model_executor/layers/rotary_embedding.py b/vllm_mindspore/model_executor/layers/rotary_embedding.py
index 230cbeee34ae6679293bbd1ba883df427f737ec2..4519442de330a4676f6ca28d2da906a5a7e356b7 100644
--- a/vllm_mindspore/model_executor/layers/rotary_embedding.py
+++ b/vllm_mindspore/model_executor/layers/rotary_embedding.py
@@ -29,6 +29,7 @@
 from typing import Any, Optional, Union
 import mindspore
 import numpy as np
+import mindspore as ms
 from mindspore import Tensor, mint, nn, ops
 from mindspore.common import dtype as mstype
 from mindspore.ops.auto_generate.gen_ops_prim import SliceExt
@@ -36,7 +37,27 @@
 from transformers import PretrainedConfig
 from vllm.config import get_current_vllm_config
 from vllm_mindspore.model_executor.utils import get_model_context
-
+from vllm_mindspore.model_executor.models.vision import (
+    get_llm_pos_ids_for_vision
+)
+
+def _get_feat_extract_output_lengths(input_lengths: ms.Tensor):
+    input_lengths_leave = input_lengths % 100
+    feat_lengths = (input_lengths_leave - 1) // 2 + 1
+    output_lengths = (
+        ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
+    )
+    return feat_lengths, output_lengths
+
+def apply_interleaved_rope(x: Tensor, mrope_section: list[int]) -> Tensor:
+    """Apply interleaved MRoPE to 3D rotary embeddings.
+    Reorganizes frequency layout from chunked [TTT...HHH...WWW] to
+    interleaved [THTHWHTHW...TT], preserving frequency continuity.
+    """
+    x_t = x[0].clone()
+    x_t[..., 1 : mrope_section[1] * 3 : 3] = x[1, ..., 1 : mrope_section[1] * 3 : 3]
+    x_t[..., 2 : mrope_section[2] * 3 : 3] = x[2, ..., 2 : mrope_section[2] * 3 : 3]
+    return x_t
 def _apply_rotary_emb(
     x: Tensor,
@@ -270,6 +291,7 @@ class MRotaryEmbedding(RotaryEmbedding):
         is_neox_style: bool,
         dtype: mindspore.Type,
         mrope_section: Optional[list[int]] = None,
+        mrope_interleaved: bool = False,
     ) -> None:
         # In Qwen2.5-VL, the maximum index value is related to the duration of
         # the input video. We enlarge max_position_embeddings to 4 times to get
@@ -282,6 +304,8 @@ class MRotaryEmbedding(RotaryEmbedding):
         if self.mrope_section:
             assert sum(self.mrope_section) == rotary_dim // 2
 
+        self.mrope_interleaved = mrope_interleaved
+
     def construct(
         self,
         positions: mindspore.Tensor,
@@ -308,14 +332,18 @@ class MRotaryEmbedding(RotaryEmbedding):
         cos_sin = self.cos_sin_cache[positions]
         cos, sin = ops.chunk(cos_sin, 2, axis=-1)
         if positions.ndim == 2:
-            cos_l = mint.split(cos, self.mrope_section, dim=-1)
-            sin_l = mint.split(sin, self.mrope_section, dim=-1)
-            cos, sin = (), ()
-            for i in range(len(self.mrope_section)):  # type: ignore[arg-type]
-                cos += (cos_l[i][i], )
-                sin += (sin_l[i][i], )
-            cos = mint.cat(cos, dim=-1)
-            sin = mint.cat(sin, dim=-1)
+            if self.mrope_interleaved:
+                cos = apply_interleaved_rope(cos, self.mrope_section)
+                sin = apply_interleaved_rope(sin, self.mrope_section)
+            else:
+                cos_l = mint.split(cos, self.mrope_section, dim=-1)
+                sin_l = mint.split(sin, self.mrope_section, dim=-1)
+                cos, sin = (), ()
+                for i in range(len(self.mrope_section)):  # type: ignore[arg-type]
+                    cos += (cos_l[i][i], )
+                    sin += (sin_l[i][i], )
+                cos = mint.cat(cos, dim=-1)
+                sin = mint.cat(sin, dim=-1)
 
         query_shape = query.shape
         query = query.view(num_tokens, -1, self.head_size)
@@ -369,6 +397,21 @@ class MRotaryEmbedding(RotaryEmbedding):
         seq_len: Optional[int] = None,
     ) -> tuple[mindspore.Tensor, int]:
         """Get mrope input positions and delta value."""
+        from vllm.transformers_utils.config import thinker_uses_mrope
+        if thinker_uses_mrope(hf_config):
+            return cls._qwen3_omni_get_input_positions_tensor(
+                input_tokens=input_tokens,
+                hf_config=hf_config,
+                image_grid_thw=image_grid_thw,
+                video_grid_thw=video_grid_thw,
+                second_per_grid_ts=second_per_grid_ts,
+                context_len=context_len,
+                seq_len=seq_len,
+                # audio_feature_lengths=audio_feature_lengths,
+                # use_audio_in_video=use_audio_in_video,
+                audio_feature_lengths=None,
+                use_audio_in_video=False,
+            )
         return cls._vl_get_input_positions_tensor(
             input_tokens=input_tokens,
             hf_config=hf_config,
@@ -379,6 +422,314 @@ class MRotaryEmbedding(RotaryEmbedding):
             seq_len=seq_len,
         )
+    @classmethod
+    def _qwen3_omni_get_input_positions_tensor(
+        cls,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: list[list[int]] | Tensor | None,
+        video_grid_thw: list[list[int]] | Tensor | None,
+        second_per_grid_ts: list[float] | None = None,
+        context_len: int = 0,
+        seq_len: int | None = None,
+        audio_feature_lengths: Tensor | None = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[Tensor, int]:
+        config = hf_config.thinker_config
+        if isinstance(image_grid_thw, list):
+            image_grid_thw = Tensor(image_grid_thw)
+        if isinstance(video_grid_thw, list):
+            video_grid_thw = Tensor(video_grid_thw)
+        input_ids = Tensor(input_tokens)
+        if input_ids is None or input_ids.ndim != 1:
+            raise ValueError("_omni3_get_input_positions_tensor expects 1D input_ids")
+
+        seq_len = input_ids.shape[0]
+        if audio_feature_lengths is not None and not isinstance(
+            audio_feature_lengths, Tensor
+        ):
+            audio_feature_lengths = Tensor(
+                audio_feature_lengths, dtype=ms.int64
+            )
+        if second_per_grid_ts is None:
+            if video_grid_thw is not None and video_grid_thw.numel() > 0:
+                second_per_grids = mint.ones(
+                    video_grid_thw.shape[0], dtype=ms.float32
+                )
+            else:
+                second_per_grids = Tensor([], dtype=ms.float32)
+        else:
+            second_per_grids = Tensor(second_per_grid_ts, dtype=ms.float32)
+
+        spatial_merge_size = config.vision_config.spatial_merge_size
+        image_token_id = config.image_token_id
+        video_token_id = config.video_token_id
+        audio_token_id = config.audio_token_id
+        vision_start_token_id = config.vision_start_token_id
+        audio_start_token_id = config.audio_start_token_id
+        position_id_per_seconds = config.position_id_per_seconds
+
+        vision_start_indices = ops.argwhere(
+            input_ids == vision_start_token_id
+        ).squeeze(1)
+        if vision_start_indices.numel() > 0:
+            vision_tokens = input_ids[vision_start_indices + 1]
+        else:
+            vision_tokens = mint.empty((0,), dtype=input_ids.dtype)
+        audio_nums = mint.sum(input_ids == audio_start_token_id)
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (
+            (vision_tokens == audio_start_token_id).sum()
+            if use_audio_in_video
+            else (vision_tokens == video_token_id).sum()
+        )
+
+        llm_pos_ids_list: list[Tensor] = []
+        st = 0
+        image_idx = 0
+        video_idx = 0
+        audio_idx = 0
+        remain_images, remain_videos, remain_audios = image_nums, video_nums, audio_nums  # noqa: E501
+        multimodal_nums = (
+            image_nums + audio_nums
+            if use_audio_in_video
+            else image_nums + video_nums + audio_nums
+        )  # noqa: E501
+
+        for _ in range(multimodal_nums):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+            if (image_token_id in input_tokens or video_token_id in input_tokens) and (
+                remain_videos > 0 or remain_images > 0
+            ):
+                ed_vision_start = input_tokens.index(vision_start_token_id, st)
+            else:
+                ed_vision_start = len(input_tokens) + 1
+            if audio_token_id in input_tokens and remain_audios > 0:
+                ed_audio_start = input_tokens.index(audio_start_token_id, st)
+            else:
+                ed_audio_start = len(input_tokens) + 1
+            min_ed = min(ed_vision_start, ed_audio_start)
+
+            if min_ed == ed_audio_start:
+                text_len = min_ed - st
+                if text_len != 0:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    llm_pos_ids_list.append(
+                        mint.arange(text_len, dtype=ms.int64)
+                        .view(1, -1)
+                        .expand(3, -1)
+                        + st_idx
+                    )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                llm_pos_ids_list.append(
+                    mint.arange(bos_len, dtype=ms.int64).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                _, audio_len = _get_feat_extract_output_lengths(
+                    audio_feature_lengths[audio_idx]
+                )
+                llm_pos_ids = (
+                    mint.arange(audio_len, dtype=ms.int64).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                llm_pos_ids_list.append(llm_pos_ids)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                llm_pos_ids_list.append(
+                    mint.arange(eos_len, dtype=ms.int64).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st += text_len + bos_len + audio_len + eos_len
+                audio_idx += 1
+                remain_audios -= 1
+            elif (
+                min_ed == ed_vision_start
+                and input_ids[ed_vision_start + 1] == image_token_id
+            ):
+                text_len = min_ed - st
+                if text_len != 0:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    llm_pos_ids_list.append(
+                        mint.arange(text_len, dtype=ms.int64)
+                        .view(1, -1)
+                        .expand(3, -1)
+                        + st_idx
+                    )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                llm_pos_ids_list.append(
+                    mint.arange(bos_len, dtype=ms.int64).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                grid_t = image_grid_thw[image_idx][0]
+                grid_hs = image_grid_thw[:, 1]
+                grid_ws = image_grid_thw[:, 2]
+                t_index = mint.arange(grid_t.item()) * position_id_per_seconds
+                llm_pos_ids = get_llm_pos_ids_for_vision(
+                    st_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                image_len = image_grid_thw[image_idx].prod() // (spatial_merge_size**2)
+                llm_pos_ids_list.append(llm_pos_ids)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                llm_pos_ids_list.append(
+                    mint.arange(eos_len, dtype=ms.int64).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st += text_len + bos_len + image_len + eos_len
+                image_idx += 1
+                remain_images -= 1
+            elif (
+                min_ed == ed_vision_start
+                and input_ids[ed_vision_start + 1] == video_token_id
+                and not use_audio_in_video
+            ):
+                text_len = min_ed - st
+                if text_len != 0:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    llm_pos_ids_list.append(
+                        mint.arange(text_len, dtype=ms.int64)
+                        .view(1, -1)
+                        .expand(3, -1)
+                        + st_idx
+                    )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                llm_pos_ids_list.append(
+                    mint.arange(bos_len, dtype=ms.int64).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                grid_t = video_grid_thw[video_idx][0]
+                grid_hs = video_grid_thw[:, 1]
+                grid_ws = video_grid_thw[:, 2]
+                t_index = (
+                    mint.arange(grid_t.item())
+                    * float(second_per_grids[video_idx].item())
+                    * position_id_per_seconds
+                )
+                llm_pos_ids = get_llm_pos_ids_for_vision(
+                    st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2)
+                llm_pos_ids_list.append(llm_pos_ids)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                llm_pos_ids_list.append(
+                    mint.arange(eos_len, dtype=ms.int64).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st += text_len + bos_len + video_len + eos_len
+                video_idx += 1
+                remain_videos -= 1
+            elif (
+                min_ed == ed_vision_start
+                and ed_vision_start + 1 == ed_audio_start
+                and use_audio_in_video
+            ):
+                text_len = min_ed - st
+                if text_len != 0:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    llm_pos_ids_list.append(
+                        mint.arange(text_len, dtype=ms.int64)
+                        .view(1, -1)
+                        .expand(3, -1)
+                        + st_idx
+                    )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                bos_block = (
+                    mint.arange(bos_len, dtype=ms.int64).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                llm_pos_ids_list.append(bos_block)
+                llm_pos_ids_list.append(bos_block)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                _, audio_len = _get_feat_extract_output_lengths(
+                    audio_feature_lengths[audio_idx]
+                )
+                audio_llm_pos_ids = (
+                    mint.arange(audio_len, dtype=ms.int64).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                grid_t = video_grid_thw[video_idx][0]
+                grid_hs = video_grid_thw[:, 1]
+                grid_ws = video_grid_thw[:, 2]
+                t_index = (
+                    mint.arange(grid_t.item())
+                    * float(second_per_grids[video_idx].item())
+                    * position_id_per_seconds
+                )
+                video_llm_pos_ids = get_llm_pos_ids_for_vision(
+                    st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                video_data_index, audio_data_index = 0, 0
+                while (
+                    video_data_index < video_llm_pos_ids.shape[-1]
+                    and audio_data_index < audio_llm_pos_ids.shape[-1]
+                ):
+                    if (
+                        video_llm_pos_ids[0][video_data_index]
+                        <= audio_llm_pos_ids[0][audio_data_index]
+                    ):
+                        llm_pos_ids_list.append(
+                            video_llm_pos_ids[
+                                :, video_data_index : video_data_index + 1
+                            ]
+                        )
+                        video_data_index += 1
+                    else:
+                        llm_pos_ids_list.append(
+                            audio_llm_pos_ids[
+                                :, audio_data_index : audio_data_index + 1
+                            ]
+                        )
+                        audio_data_index += 1
+                if video_data_index < video_llm_pos_ids.shape[-1]:
+                    llm_pos_ids_list.append(
+                        video_llm_pos_ids[
+                            :, video_data_index : video_llm_pos_ids.shape[-1]
+                        ]
+                    )
+                if audio_data_index < audio_llm_pos_ids.shape[-1]:
+                    llm_pos_ids_list.append(
+                        audio_llm_pos_ids[
+                            :, audio_data_index : audio_llm_pos_ids.shape[-1]
+                        ]
+                    )
+                video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                eos_block = (
+                    mint.arange(eos_len, dtype=ms.int64).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                llm_pos_ids_list.append(eos_block)
+                llm_pos_ids_list.append(eos_block)
+                st += text_len + bos_len * 2 + audio_len + video_len + eos_len * 2  # noqa: E501
+                audio_idx += 1
+                video_idx += 1
+                remain_videos -= 1
+                remain_audios -= 1
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                mint.arange(text_len.item(), dtype=ms.int64).view(1, -1).expand(3, -1)
+                + st_idx
+            )
+
+        llm_positions = mint.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        if llm_positions.shape[1] != seq_len:
+            raise RuntimeError("Position ids length mismatch with input ids length")
+
+        mrope_position_delta = llm_positions.max() + 1 - seq_len
+        return llm_positions, mrope_position_delta.item()
+
     @classmethod
     def _vl_get_input_positions_tensor(
         cls,
@@ -801,26 +1152,38 @@ def get_rope(
                                           original_max_position)
         elif scaling_type == "default":
             if "mrope_section" in rope_scaling:
-                if is_neox_style:
-                    rotary_emb = InferMRotaryEmbedding(
-                        head_size,
-                        rotary_dim,
-                        max_position,
-                        base,
-                        is_neox_style,
-                        dtype,
-                        mrope_section=rope_scaling["mrope_section"],
-                    )
-                else:
-                    rotary_emb = MRotaryEmbedding(
-                        head_size,
-                        rotary_dim,
-                        max_position,
-                        base,
-                        is_neox_style,
-                        dtype,
-                        mrope_section=rope_scaling["mrope_section"],
-                    )
+                # if is_neox_style:
+                #     rotary_emb = InferMRotaryEmbedding(
+                #         head_size,
+                #         rotary_dim,
+                #         max_position,
+                #         base,
+                #         is_neox_style,
+                #         dtype,
+                #         mrope_section=rope_scaling["mrope_section"],
+                #         mrope_interleaved=rope_scaling.get("mrope_interleaved", False),
+                #     )
+                # else:
+                #     rotary_emb = MRotaryEmbedding(
+                #         head_size,
+                #         rotary_dim,
+                #         max_position,
+                #         base,
+                #         is_neox_style,
+                #         dtype,
+                #         mrope_section=rope_scaling["mrope_section"],
+                #         mrope_interleaved=rope_scaling.get("mrope_interleaved", False),
+                #     )
+                rotary_emb = MRotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    dtype,
+                    mrope_section=rope_scaling["mrope_section"],
+                    mrope_interleaved=rope_scaling.get("mrope_interleaved", False),
+                )
             else:
                 raise NotImplementedError
         elif scaling_type == "yarn":
diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py
index 4c0d729e0c27e0b3e217b87f81c4b343ac2b89dc..d39cb0a662121dfd7081a56637af8616e703acaf 100644
--- a/vllm_mindspore/model_executor/models/model_base.py
+++ b/vllm_mindspore/model_executor/models/model_base.py
@@ -429,13 +429,13 @@ class NativeModel(MsModelBase):
             dtype=self.model_config.dtype,
             max_model_len=self.model_config.max_model_len)
         self.kv_caches = [
-            AttentionWrapper() for i in range(self.config.num_hidden_layers)
+            AttentionWrapper() for i in range(self.config.text_config.num_hidden_layers)
         ]
         compilation_config = vllm_config.compilation_config
         if prefix in compilation_config.static_forward_context:
             raise ValueError(f"Duplicate layer name: {prefix}")
-        for i in range(self.config.num_hidden_layers):
+        for i in range(self.config.text_config.num_hidden_layers):
             compilation_config.static_forward_context[str(
                 i)] = self.kv_caches[i]
@@ -616,7 +616,7 @@ class NativeModel(MsModelBase):
         model_inputs, is_prefill = self.prepare_inputs(input_ids, positions,
                                                        intermediate_tensors,
                                                        inputs_embeds)
-
+        model_inputs.update(kwargs)  # for dummy_attention_metadata
         if is_prefill and not self.has_prefill_warmup:
             self.has_prefill_warmup = True
diff --git a/vllm_mindspore/model_executor/models/qwen2_5_omni_thinker.py b/vllm_mindspore/model_executor/models/qwen2_5_omni_thinker.py
index b86be1a4646640fec781669b118df46baa133dd7..776c19d5cb49019ea29523d50e4c794b0d12d64d 100644
--- a/vllm_mindspore/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm_mindspore/model_executor/models/qwen2_5_omni_thinker.py
@@ -28,6 +28,7 @@
 from typing import Annotated, Any, Literal, Optional, Union
 from functools import partial
 from mindspore import Tensor, mint
+import mindspore as ms
 from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
     Qwen2_5OmniConfig,
@@ -109,7 +110,8 @@ class Qwen2_5OmniThinkerProcessingInfo(
     def get_hf_processor(self, **kwargs: object) -> Qwen2_5OmniProcessor:
         return self.ctx.get_hf_processor(
             Qwen2_5OmniProcessor,
-            use_fast=kwargs.pop("use_fast", True),
+            # use_fast=kwargs.pop("use_fast", True),
+            use_fast=False,
             **kwargs,
         )
@@ -444,6 +446,8 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
         if ('input_audio_features' not in hf_inputs
                 and input_features is not None):
             if feature_attention_mask is not None:
+                input_features = ms.from_numpy(input_features)
+                feature_attention_mask = ms.from_numpy(feature_attention_mask)
                 input_features = input_features.permute(
                     0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
             hf_inputs['input_audio_features'] = input_features
diff --git a/vllm_mindspore/model_executor/models/qwen2_5_vl.py b/vllm_mindspore/model_executor/models/qwen2_5_vl.py
index 12589a9daa3b205955b689da11e749cb343a5f2a..e2d5030b0641c6618e42ead8773f6c7296953c49 100644
--- a/vllm_mindspore/model_executor/models/qwen2_5_vl.py
+++ b/vllm_mindspore/model_executor/models/qwen2_5_vl.py
@@ -204,7 +204,8 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
                 min_pixels=min_pixels,
                 max_pixels=max_pixels,
                 size=size,
-                use_fast=kwargs.get("use_fast")),
+                # use_fast=kwargs.get("use_fast")),
+                use_fast=False),
             **kwargs,
         )
@@ -428,7 +429,8 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo):
                 min_pixels=min_pixels,
                 max_pixels=max_pixels,
                 size=size,
-                use_fast=kwargs.get("use_fast")),
+                # use_fast=kwargs.get("use_fast")),
+                use_fast=False),
             **kwargs,
         )
diff --git a/vllm_mindspore/model_executor/models/qwen3_omni_moe_thinker.py b/vllm_mindspore/model_executor/models/qwen3_omni_moe_thinker.py
index 6684fd4b4fc93dddb036346abbfce6c00e0719fb..e09f2ad3b4ed7e7ad6a312b5efdc9e6992dccada 100644
--- a/vllm_mindspore/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm_mindspore/model_executor/models/qwen3_omni_moe_thinker.py
@@ -24,12 +24,13 @@
 from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import partial
-from typing import Any
+from typing import Any, Optional
 import math
 import numpy as np
 # import torch
-from mindspore import Parameter, Tensor, mint, nn
+from mindspore import Parameter, Tensor, mint, nn, mutable
+from mindspore.common import dtype as mstype
 import mindspore.mint.nn.functional as F
 import mindspore as ms
 from mindspore import ops
@@ -44,9 +45,9 @@ from transformers.models.qwen3_omni_moe.configuration_qwen3_omni_moe import (
     Qwen3OmniMoeConfig,
     Qwen3OmniMoeThinkerConfig,
 )
-from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import (
-    Qwen3OmniMoeAudioEncoder,
-)
+# from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import (
+#     Qwen3OmniMoeAudioEncoder,
+# )
 from transformers.models.qwen3_omni_moe.processing_qwen3_omni_moe import (
     Qwen3OmniMoeProcessor,
 )
@@ -91,7 +92,7 @@ from vllm_mindspore.model_executor.models.interfaces import (
     SupportsMultiModal,
 )
 from vllm_mindspore.model_executor.models.utils import (WeightsMapper,
-    maybe_prefix, merge_multimodal_embeddings)
+    maybe_prefix, _merge_multimodal_embeddings)
 from vllm_mindspore.model_executor.models.model_base import NativeModel, \
     AttentionWrapper
 from vllm_mindspore.model_executor.models.attention_mask import \
@@ -120,6 +121,9 @@ from vllm_mindspore.model_executor.models.qwen2_5_omni_thinker import (
     Qwen2_5OmniThinkerProcessingInfo,
 )
+from vllm_mindspore.model_executor.layers.rotary_embedding import _apply_rotary_emb
+from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE
+
 from mindspore.common.api import _pynative_executor
 _pynative_executor.set_enable_grad(False)
@@ -159,16 +163,22 @@ class Qwen3_VisionAttention(Qwen2_5_VisionAttention):
                 self.num_attention_heads_per_partition * self.head_dim), -1)
         # q/k reshape to BSND
-        q = q.reshape(1, seq_length, self.num_attention_heads_per_partition,
+        # q = q.reshape(1, seq_length, self.num_attention_heads_per_partition,
+        #               self.hidden_size_per_attention_head)
+        # k = k.reshape(1, seq_length, self.num_attention_heads_per_partition,
+        #               self.hidden_size_per_attention_head)
+        q = q.reshape(seq_length, self.num_attention_heads_per_partition,
                       self.hidden_size_per_attention_head)
-        k = k.reshape(1, seq_length, self.num_attention_heads_per_partition,
+        k = k.reshape(seq_length, self.num_attention_heads_per_partition,
                       self.hidden_size_per_attention_head)
         cos, sin = position_embeddings
         origin_dtype = q.dtype
-        q, k = ms_custom_ops.apply_rotary_pos_emb_ext(q.astype(ms.float32),
-                                                      k.astype(ms.float32),
-                                                      cos, sin, "BSND", "half")
+        # q, k = ms_custom_ops.apply_rotary_pos_emb_ext(q.astype(ms.float32),
+        #                                               k.astype(ms.float32),
+        #                                               cos, sin, "BSND", "half")
+        q = _apply_rotary_emb(q, cos, sin, True)
+        k = _apply_rotary_emb(k, cos, sin, True)
         # q/k reshape to TH
         q = q.astype(origin_dtype)
@@ -465,8 +475,9 @@ class Qwen3Omni_VisionTransformer(nn.Cell):
         # hidden_states = hidden_states.unsqueeze(1)
         seq_len, _ = x.shape
         rotary_pos_emb = rotary_pos_emb.astype(hidden_states.dtype)
-        rotary_pos_emb = rotary_pos_emb.reshape(1, seq_len, 1, -1)
-        emb = mint.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        # rotary_pos_emb = rotary_pos_emb.reshape(1, seq_len, 1, -1)
+        emb = rotary_pos_emb
+        # emb = mint.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
         position_embeddings = (mint.cos(emb), mint.sin(emb))
         hidden_states_list = []
@@ -522,17 +533,17 @@
                     weight_loader(param, loaded_weight, shard_id)
                     break
             else:
-                if name in params_dict:
-                    param = params_dict[name]
-                    if "patch_embed.proj.weight" in name:
-                        loaded_weight = loaded_weight[:]
-                        loaded_weight = loaded_weight.reshape(loaded_weight.shape[0],
-                                                              -1)
-                        param.set_data(ms.Tensor(loaded_weight, dtype=param.dtype))
-                    else:
-                        weight_loader = getattr(param, "weight_loader",
-                                                default_weight_loader)
-                        weight_loader(param, loaded_weight)
+                param = params_dict[name]
+                if "patch_embed.proj.weight" in name:
+                    loaded_weight = loaded_weight[:]
+                    loaded_weight = loaded_weight.reshape(loaded_weight.shape[0],
+                                                          -1)
+                    param.set_data(ms.Tensor(loaded_weight, dtype=param.dtype))
+                else:
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
+                loaded_params.add(name)
         return loaded_params
     def set_model_inputs(self):
@@ -560,11 +571,18 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
     def construct(
         self,
-        input_ids: ms.Tensor,
-        positions: ms.Tensor,
-        intermediate_tensors: IntermediateTensors | None = None,
-        inputs_embeds: ms.Tensor | None = None,
-        deepstack_input_embeds: IntermediateTensors | None = None,
+        input_ids: Tensor,
+        positions: Tensor,
+        key_caches: list[Tensor],
+        value_caches: list[Tensor],
+        slot_mapping: Tensor,
+        attn_mask: Tensor,
+        batch_valid_length: Tensor,
+        q_seq_lens: Tensor,
+        block_tables: Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[Tensor] = None,
+        deepstack_input_embeds: Optional[Mapping[str, Tensor]] = None,
     ) -> ms.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -576,23 +594,30 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
-        for layer_idx, layer in enumerate(
-            self.layers[self.start_layer : self.end_layer]
-        ):
-            layer_idx = layer_idx + self.start_layer
-
+        # for layer_idx, layer in enumerate(
+        #     self.layers[self.start_layer : self.end_layer]
+        # ):
+        #     layer_idx = layer_idx + self.start_layer
+
+        #     hidden_states, residual = layer(
+        #         positions,
+        #         hidden_states,
+        #         residual,
+        #     )
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
             hidden_states, residual = layer(
-                positions,
-                hidden_states,
-                residual,
-            )
+                positions, hidden_states, key_caches[i - self.start_layer],
+                value_caches[i - self.start_layer], slot_mapping, attn_mask,
+                batch_valid_length, q_seq_lens, block_tables, residual,
+                None, None, None, None)
-            if deepstack_input_embeds is not None and layer_idx in range(
+            if deepstack_input_embeds is not None and i in range(
                 0, len(deepstack_input_embeds)
             ):
                 hidden_states = (
                     hidden_states
-                    + deepstack_input_embeds[f"deepstack_input_embeds_{layer_idx}"]
+                    + deepstack_input_embeds[f"deepstack_input_embeds_{i}"]
                 )
         if not get_pp_group().is_last_rank:
@@ -605,7 +630,7 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
 class Qwen3MoeLLMForCausalLM(Qwen3MoeForCausalLM):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super(Qwen3MoeForCausalLM, self).__init__()
+        super(Qwen3MoeForCausalLM, self).__init__(vllm_config=vllm_config)
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
@@ -633,7 +658,8 @@ class Qwen3OmniMoeThinkerProcessingInfo(
     def get_hf_processor(self, **kwargs: object) -> Qwen3OmniMoeProcessor:
         processor = self.ctx.get_hf_processor(
             Qwen3OmniMoeProcessor,
-            use_fast=kwargs.pop("use_fast", True),
+            # use_fast=kwargs.pop("use_fast", True),
+            use_fast=False,
             **kwargs,
         )
         if not hasattr(processor, "audio_token"):
@@ -851,7 +877,7 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
                 mm_item_counts,
             )
         else:
-            prompt_ids, mm_placeholders = self._apply_prompt_updates(
+            prompt_ids, prompt, mm_placeholders = self._apply_prompt_updates(
                 prompt_ids,
                 mm_prompt_updates,
                 mm_item_counts
@@ -930,9 +956,8 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
         image_token_id = vocab[image_token]
         video_token_id = vocab[video_token]
-        out_mm_data = out_mm_kwargs.get_data()
-        audio_feature_lengths = out_mm_data.get("audio_feature_lengths")
-        feature_attention_mask = out_mm_data.get("feature_attention_mask")
+        audio_feature_lengths = out_mm_kwargs.get("audio_feature_lengths")
+        feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
         if audio_feature_lengths is None and feature_attention_mask is None:
             audio_output_lengths = []
         elif audio_feature_lengths is not None:
@@ -969,7 +994,7 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
             return [audio_token_id] * num_features
         def get_replacement_qwen2_vision(item_idx: int, modality: str):
-            grid_thw = out_mm_data[f"{modality}_grid_thw"][item_idx]
+            grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx]
             assert isinstance(grid_thw, ms.Tensor)
             merge_length = image_processor.merge_size**2
@@ -982,7 +1007,7 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
         def get_replacement_qwen2_use_audio_in_video(item_idx: int):
             nonlocal audio_in_video_item_idx
             audio_num_features = audio_output_lengths[audio_item_idx + item_idx]
-            video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
+            video_grid_thw = out_mm_kwargs["video_grid_thw"][item_idx]
             audio_in_video_item_idx += 1
@@ -1207,7 +1232,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         )
         self.visual.set_model_inputs()
         self.visual.construct = ms.jit(function=self.visual, jit_level='O0')
-
+        self.vision_config = thinker_config.vision_config
         self.quant_config = quant_config
         self.language_model = Qwen3MoeLLMForCausalLM(
@@ -1217,6 +1242,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
             prefix=maybe_prefix(prefix, "language_model"),
         )
         self.model = self.language_model.model
+        self.text_config = thinker_config.text_config
         self.lm_head = self.language_model.lm_head
         self.common_preprocess(vllm_config, prefix)
@@ -1236,8 +1262,9 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         self.deepstack_input_embeds = (
             [
                 mint.zeros(
-                    vllm_config.scheduler_config.max_num_batched_tokens,
-                    thinker_config.text_config.hidden_size,
+                    (vllm_config.scheduler_config.max_num_batched_tokens,
+                     thinker_config.text_config.hidden_size),
+                    dtype=self.model_config.dtype
                 )
                 for _ in range(self.deepstack_num_level)
             ]
@@ -1246,10 +1273,16 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         )
         self.visual_dim = thinker_config.vision_config.out_hidden_size
         self.multiscale_dim = self.visual_dim * self.deepstack_num_level
+        head_dim = (self.vision_config.hidden_size //
+                    self.vision_config.num_heads)
+        self.rotary_pos_emb_full = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)
+        # self.num_grid_per_side = (self.vision_config.image_size //
+        #                           self.vision_config.patch_size)
+        # self.spatial_merge_size = self.vision_config.spatial_merge_size
     def common_preprocess(self, vllm_config, prefix=""):
         self.set_modules({
-            "thinker.visual.model": self.visual,
+            "thinker.visual": self.visual,
             "thinker.model": self.language_model.model,
             "thinker.lm_head": self.language_model.lm_head
         })
@@ -1257,13 +1290,13 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
             dtype=self.model_config.dtype,
             max_model_len=self.model_config.max_model_len)
         self.kv_caches = [
-            AttentionWrapper() for i in range(self.config.num_hidden_layers)
+            AttentionWrapper() for i in range(self.text_config.num_hidden_layers)
         ]
         compilation_config = vllm_config.compilation_config
         if prefix in compilation_config.static_forward_context:
             raise ValueError(f"Duplicate layer name: {prefix}")
-        for i in range(self.config.num_hidden_layers):
+        for i in range(self.text_config.num_hidden_layers):
             compilation_config.static_forward_context[str(
                 i)] = self.kv_caches[i]
@@ -1284,8 +1317,8 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         if num_tokens > self.deepstack_input_embeds[0].shape[0]:
             self.deepstack_input_embeds = [
                 mint.zeros(
-                    num_tokens,
-                    self.config.text_config.hidden_size,
+                    (num_tokens,
+                     self.config.text_config.hidden_size),
                     dtype=self.deepstack_input_embeds[0].dtype,
                 )
                 for _ in range(self.deepstack_num_level)
@@ -1361,20 +1394,26 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         self,
         input_ids: ms.Tensor,
         multimodal_embeddings: MultiModalEmbeddings | None = None,
-        *,
-        is_multimodal: ms.Tensor | None = None,
-        handle_oov_mm_token: bool = False,
+        # *,
+        # is_multimodal: ms.Tensor | None = None,
+        # handle_oov_mm_token: bool = False,
     ) -> ms.Tensor:
-        inputs_embeds = self._get_text_embeddings(
-            input_ids,
-            self.language_model.get_input_embeddings,
-            is_multimodal=is_multimodal,
-            handle_oov_mm_token=handle_oov_mm_token,
-        )
+        # inputs_embeds = self._get_text_embeddings(
+        #     input_ids,
+        #     self.language_model.get_input_embeddings,
+        #     is_multimodal=is_multimodal,
+        #     handle_oov_mm_token=handle_oov_mm_token,
+        # )
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
             return inputs_embeds
+        placeholder_token_id = [self.config.image_token_id,
+                                self.config.video_token_id,
+                                self.config.audio_token_id]
+        is_multimodal = ms.numpy.isin(input_ids, placeholder_token_id)
+
         deepstack_input_embeds = None
         # TODO (ywang96): support overlapping modalitiy embeddings so that
         # `use_audio_in_video` will work on V1.
@@ -1414,10 +1453,9 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
                 mm_position_idx += num_tokens
-            deepstack_input_embeds = mint.zeros_like(
-                inputs_embeds.view(inputs_embeds.shape[0], multiscale_len * inputs_embeds.shape[1])
-            )
-            deepstack_input_embeds = merge_multimodal_embeddings(
+            deepstack_input_embeds = inputs_embeds.new_zeros(
+                inputs_embeds.shape[0], multiscale_len * inputs_embeds.shape[1])
+            deepstack_input_embeds = _merge_multimodal_embeddings(
                 inputs_embeds=deepstack_input_embeds,
                 multimodal_embeddings=multimodal_embeddings_multiscale,
                 is_multimodal=is_vision,
@@ -1430,7 +1468,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
             )
             self._set_deepstack_input_embeds(deepstack_input_embeds)
-        inputs_embeds = merge_multimodal_embeddings(
+        inputs_embeds = _merge_multimodal_embeddings(
             inputs_embeds=inputs_embeds,
             multimodal_embeddings=multimodal_embeddings,
             is_multimodal=is_multimodal,
@@ -1484,18 +1522,26 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
     def load_weights(self, weights: Iterable[tuple[str, ms.Tensor]]) -> set[str]:
         params_dict = self.get_params_dict()
+        loaded_param = set()
+        visual_load = set()
+        text_load = set()
         for name, weight in weights:
-            if "visual." in name:
-                self.visual.load_weights([(name, weight)], params_dict)
-            elif "language_model." in name:
-                self.language_model.load_weights([(name, weight)], params_dict)
+            if "thinker.visual." in name:
+                visual_load.update(
+                    self.visual.load_weights([(name, weight)], params_dict))
+            elif "thinker.model." in name:
+                text_load.update(
+                    self.model.load_weights([(name, weight)], params_dict))
             else:
                 # Handle other weights
                 if name in params_dict:
                     param = params_dict[name]
                     weight_loader = getattr(param, "weight_loader",
                                             default_weight_loader)
                     weight_loader(param, weight)
-        return set()
+                    loaded_param.add(name)
+        loaded_param.update(visual_load)
+        loaded_param.update(text_load)
+        return None  # loaded_param; talker not supported yet
     # def get_mrope_input_positions(
     #     self,
@@ -1828,14 +1874,14 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
                 mint.tile(mint.stack([hpos_ids, wpos_ids], dim=-1), (t, 1)))
         pos_ids = mint.cat(pos_ids, dim=0)
         max_grid_size = int(grid_thw[:, 1:].max().item())
-        rotary_pos_emb_full = self.rotary_pos_emb_total(max_grid_size)
+        rotary_pos_emb_full = self.rotary_pos_emb_full(max_grid_size)
         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
         return rotary_pos_emb
     def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> ms.Tensor:
-        num_grid_per_side = self.num_grid_per_side
-        m_size = self.spatial_merge_size
-        hidden_dim = self.pos_embed.embedding_dim
+        num_grid_per_side = self.visual.num_grid_per_side
+        m_size = self.visual.spatial_merge_size
+        hidden_dim = self.visual.pos_embed.embedding_dim
         outputs = []
         for t, h, w in grid_thw:
@@ -1880,9 +1926,9 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
             indices = mint.stack([idx00, idx01, idx10, idx11],
                                  dim=0).reshape(4, -1)
             weights = mint.stack([w00, w01, w10, w11],
                                  dim=0).reshape(4, -1, 1)
-            weights = weights.astype(self.dtype)
+            weights = weights.astype(self.visual.dtype)
 
-            embeds = self.pos_embed(indices)
+            embeds = self.visual.pos_embed(indices)
             weighted_embeds = embeds * weights
             p0, p1, p2, p3 = weighted_embeds.unbind(dim=0)
             combined = p0 + p1 + p2 + p3
@@ -1896,3 +1942,71 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
             outputs.append(repeated)
         return mint.cat(outputs, dim=0)
+
+    def set_model_inputs(self,
+                         input_ids=None,
+                         position_ids=None,
+                         intermediate_tensors=None,
+                         inputs_embeds=None):
+        if input_ids is None:
+            dyn_input_ids = None
+        else:
+            dyn_input_ids = ms.Tensor(shape=[None] * input_ids.ndim,
+                                      dtype=mstype.int32)
+
+        if position_ids is None:
+            dyn_position_ids = None
+        else:
+            dyn_position_ids = ms.Tensor(shape=[None] * position_ids.ndim,
+                                         dtype=mstype.int32)
+
+        if inputs_embeds is None:
+            dyn_inputs_embeds = None
+        else:
+            dyn_inputs_embeds = ms.Tensor(shape=[None] * inputs_embeds.ndim,
+                                          dtype=inputs_embeds.dtype)
+
+        if intermediate_tensors is None:
+            dyn_intermediate_tensors = None
+        else:
+            dyn_intermediate_tensors = ms.Tensor(
+                shape=[None] * intermediate_tensors.ndim,
+                dtype=intermediate_tensors.dtype)
+
+        block_size = self.cache_config.block_size
+        num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config)
+        head_size = self.model_config.get_head_size()
+        kv_cache_shape = (None, block_size, num_kv_heads, head_size)
+
+        kv_cache_dtype = (self.model_config.dtype
+                          if self.cache_config.cache_dtype == "auto" else
+                          self.cache_config.cache_dtype)
+        if kv_cache_dtype in STR_DTYPE_TO_MS_DTYPE:
+            kv_cache_dtype = STR_DTYPE_TO_MS_DTYPE[kv_cache_dtype]
+
+        num_layers = self.model_config.get_num_layers(self.parallel_config)
+
+        dyn_key_cache = Tensor(shape=kv_cache_shape, dtype=kv_cache_dtype)
+        dyn_value_cache = Tensor(shape=kv_cache_shape,
+                                 dtype=kv_cache_dtype)
+        dyn_key_caches = mutable([dyn_key_cache for _ in range(num_layers)])
+        dyn_value_caches = mutable(
+            [dyn_value_cache for _ in range(num_layers)])
+
+        dyn_slot_mapping = Tensor(shape=[None], dtype=mstype.int32)
+        dynamic_attention_mask = Tensor(shape=[None, None],
+                                        dtype=self.model_config.dtype)
+        dyn_batch_valid_length = Tensor(shape=[None], dtype=mstype.int32)
+        dyn_q_seq_lens = Tensor(shape=[None], dtype=mstype.int32)
+        dyn_block_tables = Tensor(shape=[None, None], dtype=mstype.int32)
+        dyn_deepstack_input_embeds = Tensor(shape=[None, None],
+                                            dtype=self.model_config.dtype)
+
+        self.ready_model.set_inputs(
+            dyn_input_ids, dyn_position_ids, dyn_key_caches,
+            dyn_value_caches, dyn_slot_mapping, dynamic_attention_mask,
+            dyn_batch_valid_length, dyn_q_seq_lens, dyn_block_tables,
+            dyn_intermediate_tensors, dyn_inputs_embeds, dyn_deepstack_input_embeds)
+
+        dynamic_hidden_states = Tensor(shape=[None, None],
+                                       dtype=self.model_config.dtype)
+        self.ready_lm_head.set_inputs(dynamic_hidden_states)
diff --git a/vllm_mindspore/model_executor/models/vision.py b/vllm_mindspore/model_executor/models/vision.py
index 11b84867bb725c69f05cd472395e425c72247022..ca37b45c57376f78c25f2f192c47f1d297db709c 100644
--- a/vllm_mindspore/model_executor/models/vision.py
+++ b/vllm_mindspore/model_executor/models/vision.py
@@ -9,8 +9,8 @@ def get_llm_pos_ids_for_vision(
     grid_ws: Tensor,
 ) -> Tensor:
     llm_pos_ids_list = []
-    llm_grid_h = grid_hs[vision_idx] // spatial_merge_size
-    llm_grid_w = grid_ws[vision_idx] // spatial_merge_size
+    llm_grid_h = grid_hs[vision_idx].item() // spatial_merge_size
+    llm_grid_w = grid_ws[vision_idx].item() // spatial_merge_size
     h_index = (
         mint.arange(llm_grid_h)
         .view(1, -1, 1)
@@ -25,7 +25,6 @@ def get_llm_pos_ids_for_vision(
     )
     t_index_tensor = (
         Tensor(t_index)
-        .to(llm_grid_h.device)
         .view(-1, 1)
         .expand(-1, llm_grid_h * llm_grid_w)
         .long()