From 4c017018c9dda18f80bd6f91444b6e0f03d5ec77 Mon Sep 17 00:00:00 2001 From: huangzhuo Date: Sat, 20 Sep 2025 10:15:58 +0800 Subject: [PATCH] fix CI --- tests/st/python/test_mcore_mix_parallel.py | 2 +- vllm_mindspore/__init__.py | 22 ++-- vllm_mindspore/config.py | 2 + vllm_mindspore/engine/arg_utils.py | 121 +----------------- .../model_executor/layers/logits_processor.py | 5 +- vllm_mindspore/model_executor/models/glm4.py | 23 ---- 6 files changed, 23 insertions(+), 152 deletions(-) diff --git a/tests/st/python/test_mcore_mix_parallel.py b/tests/st/python/test_mcore_mix_parallel.py index 52d8571c..740d8e58 100644 --- a/tests/st/python/test_mcore_mix_parallel.py +++ b/tests/st/python/test_mcore_mix_parallel.py @@ -32,7 +32,7 @@ def teardown_function(): env_manager = utils.EnvVarManager() env_vars = { "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), - "vLLM_MODEL_BACKEND": "MindFormers", + "vLLM_MS_MODEL_BACKEND": "MindFormers", "MS_ENABLE_LCCL": "off", "MS_ENABLE_TRACE_MEMORY": "off", "HCCL_OP_EXPANSION_MODE": "AIV", diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 959fff70..13da7bdd 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -17,11 +17,13 @@ # isort:skip_file +import os import sys import warnings import msadapter # noqa: F401 from vllm_mindspore.ray_patch import patch_ray +ms_backend = os.environ.get("VLLM_MS_MODEL_BACKEND") patch_ray() if "vllm" in sys.modules: @@ -106,14 +108,14 @@ from vllm_mindspore.utils import ( from vllm_mindspore.config import CacheDType, _CacheConfig, \ get_current_and_parent_class_attr_docs -# vllm.config.CacheConfig = _CacheConfig -# vllm.config.CacheDType = CacheDType -# vllm.config.get_attr_docs = get_current_and_parent_class_attr_docs +vllm.config.CacheConfig = _CacheConfig +vllm.config.CacheDType = CacheDType +vllm.config.get_attr_docs = get_current_and_parent_class_attr_docs import vllm.engine.arg_utils -# vllm.engine.arg_utils.CacheDType = CacheDType -# vllm.engine.arg_utils.CacheConfig = _CacheConfig -# vllm.engine.arg_utils.get_attr_docs = get_current_and_parent_class_attr_docs +vllm.engine.arg_utils.CacheDType = CacheDType +vllm.engine.arg_utils.CacheConfig = _CacheConfig +vllm.engine.arg_utils.get_attr_docs = get_current_and_parent_class_attr_docs vllm.utils.make_tensor_with_pad = make_tensor_with_pad vllm.utils.async_tensor_h2d = async_tensor_h2d @@ -281,6 +283,7 @@ from .config import ( _verify_quantization, _verify_args, vllm_config_post_init, + vllm_config_get_quantization_config, model_post_init, _get_and_verify_dtype, stateless_init_dp_group, @@ -289,6 +292,9 @@ from .config import ( vllm.config.ModelConfig._verify_quantization = _verify_quantization vllm.config.VllmConfig.__post_init__ = vllm_config_post_init +if ms_backend == "MindFormers": + vllm.config.VllmConfig._get_quantization_config = staticmethod( + vllm_config_get_quantization_config) vllm.config.SchedulerConfig._verify_args = _verify_args vllm.config.CompilationConfig.model_post_init = model_post_init vllm.config._get_and_verify_dtype = _get_and_verify_dtype @@ -580,10 +586,6 @@ from vllm_mindspore.model_executor.layers.quantization import ( vllm.model_executor.layers.quantization.QuantizationMethods = ( QuantizationMethods) -from vllm_mindspore.engine.arg_utils import get_kwargs - -vllm.engine.arg_utils.get_kwargs = get_kwargs - from vllm_mindspore.model_executor.model_loader.default_loader import ( _prepare_weights) from vllm.model_executor.model_loader.default_loader import DefaultModelLoader diff --git 
a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 4327d8c5..e7ac388c 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -45,6 +45,8 @@ def _verify_quantization(self) -> None: # Do not verify now. return +def vllm_config_get_quantization_config(model_config, load_config): + return None def vllm_config_post_init(self): """Verify configs are valid & consistent with each other.""" diff --git a/vllm_mindspore/engine/arg_utils.py b/vllm_mindspore/engine/arg_utils.py index 9803e158..4d6321c9 100644 --- a/vllm_mindspore/engine/arg_utils.py +++ b/vllm_mindspore/engine/arg_utils.py @@ -19,11 +19,8 @@ # limitations under the License. """Adaption for arguments utils.""" -import argparse -import json import threading -from dataclasses import MISSING, fields, is_dataclass -from typing import Any, Literal, get_origin +from typing import get_args import torch import vllm.envs as envs @@ -31,120 +28,10 @@ from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext logger = init_logger(__name__) -from pydantic import TypeAdapter, ValidationError -from vllm.config import (ConfigType, GuidedDecodingBackendV1, LoadFormat, +from vllm.config import (GuidedDecodingBackendV1, LoadFormat, ModelConfig, ParallelConfig, SchedulerConfig) -from vllm.engine.arg_utils import (EngineArgs, TypeHint, _raise_or_fallback, - _warn_or_fallback, contains_type, get_args, - get_attr_docs, get_type, get_type_hints, - human_readable_int, is_not_builtin, - literal_to_kwargs, optional_type, - parse_type, union_dict_and_str) - -from vllm_mindspore.model_executor.layers.quantization import ( - QUANTIZATION_METHODS) - - -def get_kwargs(cls: ConfigType) -> dict[str, Any]: - cls_docs = get_attr_docs(cls) - kwargs = {} - for field in fields(cls): - type_hints: set[TypeHint] = get_type_hints(field.type) - - # If the field is a dataclass, we can use the model_validate_json - generator = (th for th in type_hints if is_dataclass(th)) - dataclass_cls = next(generator, None) - - # Get the default value of the field - if field.default is not MISSING: - default = field.default - elif field.default_factory is not MISSING: - default = field.default_factory() - - # Get the help text for the field - name = field.name - help = cls_docs[name].strip() - # Escape % for argparse - help = help.replace("%", "%%") - - # Initialise the kwargs dictionary for the field - kwargs[name] = {"default": default, "help": help} - - # Set other kwargs based on the type hints - json_tip = """\n\nShould either be a valid JSON string or JSON keys - passed individually. 
For example, the following sets of arguments are - equivalent:\n\n - - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n - - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n\n""" - if dataclass_cls is not None: - - def parse_dataclass(val: str, cls=dataclass_cls) -> Any: - try: - if hasattr(cls, "from_cli"): - return cls.from_cli(val) - return TypeAdapter(cls).validate_json(val) - except ValidationError as e: - raise argparse.ArgumentTypeError(repr(e)) from e - - kwargs[name]["type"] = parse_dataclass - kwargs[name]["help"] += json_tip - elif contains_type(type_hints, bool): - # Creates --no- and -- flags - kwargs[name]["action"] = argparse.BooleanOptionalAction - elif contains_type(type_hints, Literal): - kwargs[name].update(literal_to_kwargs(type_hints)) - elif contains_type(type_hints, tuple): - type_hint = get_type(type_hints, tuple) - types = get_args(type_hint) - tuple_type = types[0] - assert all(t is tuple_type for t in types if t is not Ellipsis), ( - "All non-Ellipsis tuple elements must be of the same " - f"type. Got {types}.") - kwargs[name]["type"] = tuple_type - kwargs[name]["nargs"] = "+" if Ellipsis in types else len(types) - elif contains_type(type_hints, list): - type_hint = get_type(type_hints, list) - types = get_args(type_hint) - assert len(types) == 1, ( - "List type must have exactly one type. Got " - f"{type_hint} with types {types}") - kwargs[name]["type"] = types[0] - kwargs[name]["nargs"] = "+" - elif contains_type(type_hints, int): - kwargs[name]["type"] = int - # Special case for large integers - if name in {"max_model_len", "max_num_batched_tokens"}: - kwargs[name]["type"] = human_readable_int - elif contains_type(type_hints, float): - kwargs[name]["type"] = float - elif (contains_type(type_hints, dict) - and (contains_type(type_hints, str) - or any(is_not_builtin(th) for th in type_hints))): - kwargs[name]["type"] = union_dict_and_str - elif contains_type(type_hints, dict): - kwargs[name]["type"] = parse_type(json.loads) - kwargs[name]["help"] += json_tip - elif (contains_type(type_hints, str) - or any(is_not_builtin(th) for th in type_hints)): - kwargs[name]["type"] = str - else: - raise ValueError( - f"Unsupported type {type_hints} for argument {name}.") - - # If the type hint was a sequence of literals, use the helper function - # to update the type and choices - if get_origin(kwargs[name].get("type")) is Literal: - kwargs[name].update(literal_to_kwargs({kwargs[name]["type"]})) - - # If None is in type_hints, make the argument optional. - # But not if it's a bool, argparse will handle this better. - if type(None) in type_hints and not contains_type(type_hints, bool): - kwargs[name]["type"] = optional_type(kwargs[name]["type"]) - if kwargs[name].get("choices"): - kwargs[name]["choices"].append("None") - if field.name == "quantization": - kwargs[name]["choices"] = QUANTIZATION_METHODS - return kwargs +from vllm.engine.arg_utils import (EngineArgs, _raise_or_fallback, + _warn_or_fallback) def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index c14aac8f..a263ac6f 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -73,7 +73,10 @@ class LogitsProcessor(nn.Cell): # Soft cap the logits. Used in Gemma 2. self.soft_cap = soft_cap # Whether to use gather or all-gather to gather the logits. 
- self.use_all_gather = current_platform.use_all_gather() + if not envs.VLLM_USE_V1: + self.use_all_gather = True + else: + self.use_all_gather = current_platform.use_all_gather() if self.use_all_gather: self.tensor_model_parallel_all_gather = \ diff --git a/vllm_mindspore/model_executor/models/glm4.py b/vllm_mindspore/model_executor/models/glm4.py index 344e6c09..fb65a3ec 100644 --- a/vllm_mindspore/model_executor/models/glm4.py +++ b/vllm_mindspore/model_executor/models/glm4.py @@ -298,29 +298,6 @@ class Glm4Model(LlamaModel): weight_loader(param, loaded_weight) loaded_params.add(name) - def adjust_weight(params_dict): - if not is_310p(): - return - - target_keywords = [ - "qkv_proj.weight", - "o_proj.weight", - "gate_up_proj.weight", - "down_proj.weight", - "lm_head.weight", - ] - - for name, param in params_dict.items(): - if any(name.endswith(keyword) for keyword in target_keywords): - cast_weight = ops.auto_generate.format_cast(param, FORMAT_TYPE['nz']) - ms.runtime.synchronize() - param.set_data(cast_weight) - - if is_310p(): - ms.runtime.synchronize() - adjust_weight(params_dict) - ms.runtime.synchronize() - network_not_load = set(params_dict.keys()) - loaded_params print(f"These parameters are not loaded in the network: {network_not_load}") return loaded_params -- Gitee
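
Note (not part of the patch to apply): the central mechanism above is an environment-gated monkeypatch — when VLLM_MS_MODEL_BACKEND is "MindFormers", vllm_mindspore/__init__.py replaces vllm.config.VllmConfig._get_quantization_config with the stub added in vllm_mindspore/config.py. The following is a minimal, self-contained sketch of that pattern using stand-in names (FakeVllmConfig is an assumption, not the real vLLM class), intended only to illustrate why the staticmethod() wrapper is needed when assigning the replacement onto the class.

    # Hypothetical sketch; stand-in classes, not the real vLLM API.
    import os


    class FakeVllmConfig:
        """Stand-in for vllm.config.VllmConfig (assumption for illustration)."""

        @staticmethod
        def _get_quantization_config(model_config, load_config):
            # Pretend this is vLLM's normal quantization-config resolution.
            return {"method": "default"}


    def vllm_config_get_quantization_config(model_config, load_config):
        # Mirrors the stub added in vllm_mindspore/config.py: skip vLLM's
        # quantization-config resolution entirely for the MindFormers backend.
        return None


    # Gate the override on the backend env var, as __init__.py does.
    ms_backend = os.environ.get("VLLM_MS_MODEL_BACKEND")
    if ms_backend == "MindFormers":
        # staticmethod() keeps the replacement from being bound as an
        # instance method when it is assigned onto the class.
        FakeVllmConfig._get_quantization_config = staticmethod(
            vllm_config_get_quantization_config)


    if __name__ == "__main__":
        # Prints None when VLLM_MS_MODEL_BACKEND=MindFormers is set in the
        # environment, otherwise the original {"method": "default"} dict.
        print(FakeVllmConfig._get_quantization_config(None, None))

Running the sketch with and without VLLM_MS_MODEL_BACKEND=MindFormers shows the two code paths; in the real patch the same gate also decides whether vLLM's stock get_kwargs and CacheConfig overrides remain in place.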