From 4c017018c9dda18f80bd6f91444b6e0f03d5ec77 Mon Sep 17 00:00:00 2001 From: huangzhuo Date: Sat, 20 Sep 2025 10:15:58 +0800 Subject: [PATCH] fix CI --- tests/st/python/test_mcore_mix_parallel.py | 2 +- vllm_mindspore/__init__.py | 22 ++-- vllm_mindspore/config.py | 2 + vllm_mindspore/engine/arg_utils.py | 121 +----------------- .../model_executor/layers/logits_processor.py | 5 +- vllm_mindspore/model_executor/models/glm4.py | 23 ---- 6 files changed, 23 insertions(+), 152 deletions(-) diff --git a/tests/st/python/test_mcore_mix_parallel.py b/tests/st/python/test_mcore_mix_parallel.py index 52d8571c..740d8e58 100644 --- a/tests/st/python/test_mcore_mix_parallel.py +++ b/tests/st/python/test_mcore_mix_parallel.py @@ -32,7 +32,7 @@ def teardown_function(): env_manager = utils.EnvVarManager() env_vars = { "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), - "vLLM_MODEL_BACKEND": "MindFormers", + "vLLM_MS_MODEL_BACKEND": "MindFormers", "MS_ENABLE_LCCL": "off", "MS_ENABLE_TRACE_MEMORY": "off", "HCCL_OP_EXPANSION_MODE": "AIV", diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 959fff70..13da7bdd 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -17,11 +17,13 @@ # isort:skip_file +import os import sys import warnings import msadapter # noqa: F401 from vllm_mindspore.ray_patch import patch_ray +ms_backend = os.environ.get("VLLM_MS_MODEL_BACKEND") patch_ray() if "vllm" in sys.modules: @@ -106,14 +108,14 @@ from vllm_mindspore.utils import ( from vllm_mindspore.config import CacheDType, _CacheConfig, \ get_current_and_parent_class_attr_docs -# vllm.config.CacheConfig = _CacheConfig -# vllm.config.CacheDType = CacheDType -# vllm.config.get_attr_docs = get_current_and_parent_class_attr_docs +vllm.config.CacheConfig = _CacheConfig +vllm.config.CacheDType = CacheDType +vllm.config.get_attr_docs = get_current_and_parent_class_attr_docs import vllm.engine.arg_utils -# vllm.engine.arg_utils.CacheDType = CacheDType -# vllm.engine.arg_utils.CacheConfig = _CacheConfig -# vllm.engine.arg_utils.get_attr_docs = get_current_and_parent_class_attr_docs +vllm.engine.arg_utils.CacheDType = CacheDType +vllm.engine.arg_utils.CacheConfig = _CacheConfig +vllm.engine.arg_utils.get_attr_docs = get_current_and_parent_class_attr_docs vllm.utils.make_tensor_with_pad = make_tensor_with_pad vllm.utils.async_tensor_h2d = async_tensor_h2d @@ -281,6 +283,7 @@ from .config import ( _verify_quantization, _verify_args, vllm_config_post_init, + vllm_config_get_quantization_config, model_post_init, _get_and_verify_dtype, stateless_init_dp_group, @@ -289,6 +292,9 @@ from .config import ( vllm.config.ModelConfig._verify_quantization = _verify_quantization vllm.config.VllmConfig.__post_init__ = vllm_config_post_init +if ms_backend == "MindFormers": + vllm.config.VllmConfig._get_quantization_config = staticmethod( + vllm_config_get_quantization_config) vllm.config.SchedulerConfig._verify_args = _verify_args vllm.config.CompilationConfig.model_post_init = model_post_init vllm.config._get_and_verify_dtype = _get_and_verify_dtype @@ -580,10 +586,6 @@ from vllm_mindspore.model_executor.layers.quantization import ( vllm.model_executor.layers.quantization.QuantizationMethods = ( QuantizationMethods) -from vllm_mindspore.engine.arg_utils import get_kwargs - -vllm.engine.arg_utils.get_kwargs = get_kwargs - from vllm_mindspore.model_executor.model_loader.default_loader import ( _prepare_weights) from vllm.model_executor.model_loader.default_loader import DefaultModelLoader diff --git 
a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 4327d8c5..e7ac388c 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -45,6 +45,8 @@ def _verify_quantization(self) -> None: # Do not verify now. return +def vllm_config_get_quantization_config(model_config, load_config): + return None def vllm_config_post_init(self): """Verify configs are valid & consistent with each other.""" diff --git a/vllm_mindspore/engine/arg_utils.py b/vllm_mindspore/engine/arg_utils.py index 9803e158..4d6321c9 100644 --- a/vllm_mindspore/engine/arg_utils.py +++ b/vllm_mindspore/engine/arg_utils.py @@ -19,11 +19,8 @@ # limitations under the License. """Adaption for arguments utils.""" -import argparse -import json import threading -from dataclasses import MISSING, fields, is_dataclass -from typing import Any, Literal, get_origin +from typing import get_args import torch import vllm.envs as envs @@ -31,120 +28,10 @@ from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext logger = init_logger(__name__) -from pydantic import TypeAdapter, ValidationError -from vllm.config import (ConfigType, GuidedDecodingBackendV1, LoadFormat, +from vllm.config import (GuidedDecodingBackendV1, LoadFormat, ModelConfig, ParallelConfig, SchedulerConfig) -from vllm.engine.arg_utils import (EngineArgs, TypeHint, _raise_or_fallback, - _warn_or_fallback, contains_type, get_args, - get_attr_docs, get_type, get_type_hints, - human_readable_int, is_not_builtin, - literal_to_kwargs, optional_type, - parse_type, union_dict_and_str) - -from vllm_mindspore.model_executor.layers.quantization import ( - QUANTIZATION_METHODS) - - -def get_kwargs(cls: ConfigType) -> dict[str, Any]: - cls_docs = get_attr_docs(cls) - kwargs = {} - for field in fields(cls): - type_hints: set[TypeHint] = get_type_hints(field.type) - - # If the field is a dataclass, we can use the model_validate_json - generator = (th for th in type_hints if is_dataclass(th)) - dataclass_cls = next(generator, None) - - # Get the default value of the field - if field.default is not MISSING: - default = field.default - elif field.default_factory is not MISSING: - default = field.default_factory() - - # Get the help text for the field - name = field.name - help = cls_docs[name].strip() - # Escape % for argparse - help = help.replace("%", "%%") - - # Initialise the kwargs dictionary for the field - kwargs[name] = {"default": default, "help": help} - - # Set other kwargs based on the type hints - json_tip = """\n\nShould either be a valid JSON string or JSON keys - passed individually. 
For example, the following sets of arguments are - equivalent:\n\n - - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n - - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n\n""" - if dataclass_cls is not None: - - def parse_dataclass(val: str, cls=dataclass_cls) -> Any: - try: - if hasattr(cls, "from_cli"): - return cls.from_cli(val) - return TypeAdapter(cls).validate_json(val) - except ValidationError as e: - raise argparse.ArgumentTypeError(repr(e)) from e - - kwargs[name]["type"] = parse_dataclass - kwargs[name]["help"] += json_tip - elif contains_type(type_hints, bool): - # Creates --no- and -- flags - kwargs[name]["action"] = argparse.BooleanOptionalAction - elif contains_type(type_hints, Literal): - kwargs[name].update(literal_to_kwargs(type_hints)) - elif contains_type(type_hints, tuple): - type_hint = get_type(type_hints, tuple) - types = get_args(type_hint) - tuple_type = types[0] - assert all(t is tuple_type for t in types if t is not Ellipsis), ( - "All non-Ellipsis tuple elements must be of the same " - f"type. Got {types}.") - kwargs[name]["type"] = tuple_type - kwargs[name]["nargs"] = "+" if Ellipsis in types else len(types) - elif contains_type(type_hints, list): - type_hint = get_type(type_hints, list) - types = get_args(type_hint) - assert len(types) == 1, ( - "List type must have exactly one type. Got " - f"{type_hint} with types {types}") - kwargs[name]["type"] = types[0] - kwargs[name]["nargs"] = "+" - elif contains_type(type_hints, int): - kwargs[name]["type"] = int - # Special case for large integers - if name in {"max_model_len", "max_num_batched_tokens"}: - kwargs[name]["type"] = human_readable_int - elif contains_type(type_hints, float): - kwargs[name]["type"] = float - elif (contains_type(type_hints, dict) - and (contains_type(type_hints, str) - or any(is_not_builtin(th) for th in type_hints))): - kwargs[name]["type"] = union_dict_and_str - elif contains_type(type_hints, dict): - kwargs[name]["type"] = parse_type(json.loads) - kwargs[name]["help"] += json_tip - elif (contains_type(type_hints, str) - or any(is_not_builtin(th) for th in type_hints)): - kwargs[name]["type"] = str - else: - raise ValueError( - f"Unsupported type {type_hints} for argument {name}.") - - # If the type hint was a sequence of literals, use the helper function - # to update the type and choices - if get_origin(kwargs[name].get("type")) is Literal: - kwargs[name].update(literal_to_kwargs({kwargs[name]["type"]})) - - # If None is in type_hints, make the argument optional. - # But not if it's a bool, argparse will handle this better. - if type(None) in type_hints and not contains_type(type_hints, bool): - kwargs[name]["type"] = optional_type(kwargs[name]["type"]) - if kwargs[name].get("choices"): - kwargs[name]["choices"].append("None") - if field.name == "quantization": - kwargs[name]["choices"] = QUANTIZATION_METHODS - return kwargs +from vllm.engine.arg_utils import (EngineArgs, _raise_or_fallback, + _warn_or_fallback) def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index c14aac8f..a263ac6f 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -73,7 +73,10 @@ class LogitsProcessor(nn.Cell): # Soft cap the logits. Used in Gemma 2. self.soft_cap = soft_cap # Whether to use gather or all-gather to gather the logits. 
- self.use_all_gather = current_platform.use_all_gather() + if not envs.VLLM_USE_V1: + self.use_all_gather = True + else: + self.use_all_gather = current_platform.use_all_gather() if self.use_all_gather: self.tensor_model_parallel_all_gather = \ diff --git a/vllm_mindspore/model_executor/models/glm4.py b/vllm_mindspore/model_executor/models/glm4.py index 344e6c09..fb65a3ec 100644 --- a/vllm_mindspore/model_executor/models/glm4.py +++ b/vllm_mindspore/model_executor/models/glm4.py @@ -298,29 +298,6 @@ class Glm4Model(LlamaModel): weight_loader(param, loaded_weight) loaded_params.add(name) - def adjust_weight(params_dict): - if not is_310p(): - return - - target_keywords = [ - "qkv_proj.weight", - "o_proj.weight", - "gate_up_proj.weight", - "down_proj.weight", - "lm_head.weight", - ] - - for name, param in params_dict.items(): - if any(name.endswith(keyword) for keyword in target_keywords): - cast_weight = ops.auto_generate.format_cast(param, FORMAT_TYPE['nz']) - ms.runtime.synchronize() - param.set_data(cast_weight) - - if is_310p(): - ms.runtime.synchronize() - adjust_weight(params_dict) - ms.runtime.synchronize() - network_not_load = set(params_dict.keys()) - loaded_params print(f"These parameters are not loaded in the network: {network_not_load}") return loaded_params -- Gitee
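
Note (not part of the patch to apply): the central mechanism above is an environment-gated monkeypatch — when VLLM_MS_MODEL_BACKEND is "MindFormers", vllm_mindspore/__init__.py replaces vllm.config.VllmConfig._get_quantization_config with the stub added in vllm_mindspore/config.py. The following is a minimal, self-contained sketch of that pattern using stand-in names (FakeVllmConfig is an assumption, not the real vLLM class), intended only to illustrate why the staticmethod() wrapper is needed when assigning the replacement onto the class.

    # Hypothetical sketch; stand-in classes, not the real vLLM API.
    import os


    class FakeVllmConfig:
        """Stand-in for vllm.config.VllmConfig (assumption for illustration)."""

        @staticmethod
        def _get_quantization_config(model_config, load_config):
            # Pretend this is vLLM's normal quantization-config resolution.
            return {"method": "default"}


    def vllm_config_get_quantization_config(model_config, load_config):
        # Mirrors the stub added in vllm_mindspore/config.py: skip vLLM's
        # quantization-config resolution entirely for the MindFormers backend.
        return None


    # Gate the override on the backend env var, as __init__.py does.
    ms_backend = os.environ.get("VLLM_MS_MODEL_BACKEND")
    if ms_backend == "MindFormers":
        # staticmethod() keeps the replacement from being bound as an
        # instance method when it is assigned onto the class.
        FakeVllmConfig._get_quantization_config = staticmethod(
            vllm_config_get_quantization_config)


    if __name__ == "__main__":
        # Prints None when VLLM_MS_MODEL_BACKEND=MindFormers is set in the
        # environment, otherwise the original {"method": "default"} dict.
        print(FakeVllmConfig._get_quantization_config(None, None))

Running the sketch with and without VLLM_MS_MODEL_BACKEND=MindFormers shows the two code paths; in the real patch the same gate also decides whether vLLM's stock get_kwargs and CacheConfig overrides remain in place.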