From 89b3fface1f1a91602c94c22916b3018e7a8c6d9 Mon Sep 17 00:00:00 2001
From: tronzhang
Date: Tue, 30 Sep 2025 16:28:36 +0800
Subject: [PATCH] do not support structured output now

---
 vllm_mindspore/__init__.py            |  5 ++-
 vllm_mindspore/config.py              | 33 ------------------
 vllm_mindspore/v1/engine/processor.py | 50 +++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 34 deletions(-)
 create mode 100644 vllm_mindspore/v1/engine/processor.py

diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py
index efd917b0a..58c78e076 100644
--- a/vllm_mindspore/__init__.py
+++ b/vllm_mindspore/__init__.py
@@ -282,7 +282,7 @@ vllm.v1.utils.CoreEngineActorManager.__init__ = core_engine_actor_manager_init
 from .config import (_verify_quantization, _verify_args, vllm_config_post_init,
                      vllm_config_get_quantization_config, model_post_init,
                      _get_and_verify_dtype, stateless_init_dp_group,
-                     has_unfinished_dp, v1_process_validate_sampling_params)
+                     has_unfinished_dp)

 vllm.config.ModelConfig._verify_quantization = _verify_quantization
 vllm.config.VllmConfig.__post_init__ = vllm_config_post_init
@@ -571,8 +571,11 @@ DPAsyncMPClient.get_core_engine_for_request = get_core_engine_for_request
 DPAsyncMPClient.add_request_async = add_request_async
 DPAsyncMPClient.process_engine_outputs = staticmethod(process_engine_outputs)

+from vllm_mindspore.v1.engine.processor import (
+    v1_process_validate_sampling_params, v1_process_validate_structured_output)
 from vllm.v1.engine.processor import Processor
 Processor._validate_sampling_params = v1_process_validate_sampling_params
+Processor._validate_structured_output = v1_process_validate_structured_output

 check_ready()

diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py
index a8beae818..7f93164b6 100644
--- a/vllm_mindspore/config.py
+++ b/vllm_mindspore/config.py
@@ -34,8 +34,6 @@ from vllm.config import (_STR_DTYPE_TO_TORCH_DTYPE, CacheConfig,
                          CompilationConfig, CompilationLevel, VllmConfig,
                          _find_dtype, _resolve_auto_dtype, get_attr_docs)
 from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid

 from vllm_mindspore.utils import is_310p
@@ -465,34 +463,3 @@ class _CacheConfig(CacheConfig):
     """Data type for kv cache storage. If "auto", will use model data type.
     CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2.
     ROCm (AMD GPU) supports fp8 (=fp8_e4m3)."""
-
-
-def v1_process_validate_sampling_params(
-    self,
-    params: SamplingParams,
-    lora_request: Optional[LoRARequest],
-) -> None:
-
-    model_config = self.vllm_config.model_config
-    vocab_size = model_config.get_vocab_size()
-    if params.top_k > vocab_size:
-        raise ValueError(
-            f"top_k cannot be greater than vocabulary size({vocab_size}), "
-            f"but got {params.top_k}.")
-    scheduler_config = self.vllm_config.scheduler_config
-    max_num_seqs = scheduler_config.max_num_seqs
-    if params.n > max_num_seqs:
-        raise ValueError(f"SchedulerConfig.n cannot be greater than "
-                         f"max_num_seqs({max_num_seqs}), but got {params.n}.")
-
-    self._validate_structured_output(params)
-    self._validate_logit_bias(params)
-
-    if params.allowed_token_ids is None:
-        return
-    if not params.allowed_token_ids:
-        raise ValueError("allowed_token_ids is not None and empty!")
-    tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
-    vocab_size = len(tokenizer)
-    if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
-        raise ValueError("allowed_token_ids contains out-of-vocab token id!")
diff --git a/vllm_mindspore/v1/engine/processor.py b/vllm_mindspore/v1/engine/processor.py
new file mode 100644
index 000000000..16033bac2
--- /dev/null
+++ b/vllm_mindspore/v1/engine/processor.py
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright 2025 Huawei Technologies Co., Ltd.
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Functions are adapted from
+# https://github.com/vllm-project/vllm/blob/v0.9.1/vllm/v1/engine/processor.py
+
+from typing import Optional
+
+from vllm.lora.request import LoRARequest
+from vllm.sampling_params import SamplingParams
+
+
+def v1_process_validate_sampling_params(
+    self,
+    params: SamplingParams,
+    lora_request: Optional[LoRARequest],
+) -> None:
+
+    model_config = self.vllm_config.model_config
+    vocab_size = model_config.get_vocab_size()
+    if params.top_k > vocab_size:
+        raise ValueError(
+            f"top_k cannot be greater than vocabulary size({vocab_size}), "
+            f"but got {params.top_k}.")
+    scheduler_config = self.vllm_config.scheduler_config
+    max_num_seqs = scheduler_config.max_num_seqs
+    if params.n > max_num_seqs:
+        raise ValueError(f"SchedulerConfig.n cannot be greater than "
+                         f"max_num_seqs({max_num_seqs}), but got {params.n}.")
+
+    self._validate_structured_output(params)
+    self._validate_logit_bias(params)
+
+    if params.allowed_token_ids is None:
+        return
+    if not params.allowed_token_ids:
+        raise ValueError("allowed_token_ids is not None and empty!")
+    tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
+    vocab_size = len(tokenizer)
+    if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
+        raise ValueError("allowed_token_ids contains out-of-vocab token id!")
+
+
+def v1_process_validate_structured_output(self,
+                                          params: SamplingParams) -> None:
+    if not params.guided_decoding or not self.decoding_config:
+        return
+    raise ValueError(
+        "vLLM-MindSpore Plugin does not support structured output now.")
-- 
Gitee
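
Note (illustration only, not part of the patch): after this change, any request that sets guided decoding is rejected up front by the patched Processor._validate_structured_output instead of failing later in the engine. A minimal caller-side sketch of the expected behaviour follows; it assumes vLLM v0.9.1 with the vLLM-MindSpore plugin installed, and the model name is only a placeholder.

    from vllm import LLM, SamplingParams
    from vllm.sampling_params import GuidedDecodingParams

    # Placeholder model; any model served through the plugin behaves the same.
    llm = LLM(model="Qwen/Qwen2.5-7B-Instruct")

    # Plain sampling still passes the patched _validate_sampling_params checks.
    out = llm.generate(["Hello"], SamplingParams(max_tokens=16))
    print(out[0].outputs[0].text)

    # Structured output is now rejected during request validation.
    guided = SamplingParams(
        guided_decoding=GuidedDecodingParams(json={"type": "object"}))
    try:
        llm.generate(["Return a JSON object"], guided)
    except ValueError as err:
        # "vLLM-MindSpore Plugin does not support structured output now."
        print(err)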