From 1baa59e16bfce151630381ed514159e7b1d61843 Mon Sep 17 00:00:00 2001 From: HighCloud Date: Fri, 4 Jul 2025 15:04:51 +0800 Subject: [PATCH 01/14] support native qwq --- vllm_mindspore/__init__.py | 10 ++ .../distributed/communication_op.py | 10 ++ vllm_mindspore/distributed/parallel_state.py | 93 +++++++++++++++++++ .../model_executor/layers/linear.py | 1 + .../model_loader/weight_utils.py | 13 +-- .../model_executor/models/model_base.py | 55 +++++++++-- vllm_mindspore/model_executor/models/qwen2.py | 30 +++++- vllm_mindspore/utils.py | 93 +++++++++++++++++++ vllm_mindspore/v1/worker/gpu_model_runner.py | 14 ++- vllm_mindspore/worker/cache_engine.py | 18 +++- vllm_mindspore/worker/model_runner.py | 5 +- 11 files changed, 320 insertions(+), 22 deletions(-) create mode 100644 vllm_mindspore/distributed/parallel_state.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index dbd26f9b..31e84d38 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -351,6 +351,16 @@ RejectionSampler._smallest_positive_value = _smallest_positive_value RejectionSampler._smallest_positive_value.__set_name__( RejectionSampler, "_smallest_positive_value") +import vllm.distributed.communication_op +import vllm.worker.worker_base +from vllm_mindspore.distributed.communication_op import cpu_broadcast_tensor_dict +vllm.distributed.communication_op.broadcast_tensor_dict = cpu_broadcast_tensor_dict +vllm.worker.worker_base.broadcast_tensor_dict = cpu_broadcast_tensor_dict + +import vllm.distributed.parallel_state +from vllm_mindspore.distributed.parallel_state import gc_broadcast_tensor_dict +vllm.distributed.parallel_state.GroupCoordinator.broadcast_tensor_dict = gc_broadcast_tensor_dict + ######### for multi-model from vllm_mindspore.inputs.registry import call_hf_processor from vllm.inputs.registry import InputProcessingContext diff --git a/vllm_mindspore/distributed/communication_op.py b/vllm_mindspore/distributed/communication_op.py index c933dc4a..475a282d 100644 --- a/vllm_mindspore/distributed/communication_op.py +++ b/vllm_mindspore/distributed/communication_op.py @@ -21,10 +21,20 @@ Implement a unified communication interface for both graph and pynative mode. """ +from typing import Any, Dict, Optional, Union +import torch + from mindspore import nn, ops from vllm.distributed.parallel_state import ( get_tensor_model_parallel_world_size, get_tp_group) +def cpu_broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[torch.Tensor, + Any]]] = None, + src: int = 0): + if not torch.distributed.is_initialized(): + return tensor_dict + return get_tp_group().broadcast_tensor_dict(tensor_dict, src, group=get_tp_group().cpu_group) + class ReduceFromModelParallelRegion(nn.Cell): "All reduce the input from the model parallel region." 
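Note on the __init__.py hunk above: it relies on plain module-attribute rebinding, and it patches broadcast_tensor_dict both in vllm.distributed.communication_op (where the function is defined) and in vllm.worker.worker_base, presumably because the worker module imports the name directly and would otherwise keep a reference to the original device-group implementation, while the new cpu_broadcast_tensor_dict wrapper routes the broadcast through the TP group's CPU communicator. A minimal, self-contained sketch of why both rebinds are needed (toy stand-in modules and names, not vLLM or vllm_mindspore code):

    # Toy illustration of the double monkey-patch used above
    # (hypothetical stand-in modules; not vLLM code).
    import types

    comm = types.ModuleType("comm")      # stands in for vllm.distributed.communication_op
    worker = types.ModuleType("worker")  # stands in for vllm.worker.worker_base

    def device_broadcast(tensor_dict, src=0):
        return ("device", tensor_dict, src)

    comm.broadcast_tensor_dict = device_broadcast
    # `from comm import broadcast_tensor_dict` copies the binding, so the worker
    # module keeps its own reference to the original function.
    worker.broadcast_tensor_dict = comm.broadcast_tensor_dict

    def cpu_broadcast(tensor_dict, src=0):
        return ("cpu", tensor_dict, src)

    # Rebinding only the defining module leaves the worker's copy untouched...
    comm.broadcast_tensor_dict = cpu_broadcast
    print(worker.broadcast_tensor_dict({"a": 1}))  # -> ('device', {'a': 1}, 0)

    # ...which is why the patch also rebinds the name where it was imported.
    worker.broadcast_tensor_dict = cpu_broadcast
    print(worker.broadcast_tensor_dict({"a": 1}))  # -> ('cpu', {'a': 1}, 0)
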
diff --git a/vllm_mindspore/distributed/parallel_state.py b/vllm_mindspore/distributed/parallel_state.py new file mode 100644 index 00000000..697196fa --- /dev/null +++ b/vllm_mindspore/distributed/parallel_state.py @@ -0,0 +1,93 @@ +import torch +import torch.distributed +from torch.distributed import ProcessGroup + +from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union) +from vllm.distributed.parallel_state import _split_tensor_dict, TensorMetadata +from vllm_mindspore.utils import atlas_inference + +def gc_broadcast_tensor_dict( + self, + tensor_dict: Optional[Dict[str, Union[torch.Tensor, Any]]] = None, + src: int = 0, + group: Optional[ProcessGroup] = None, + metadata_group: Optional[ProcessGroup] = None + ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: + """Broadcast the input tensor dictionary. + NOTE: `src` is the local rank of the source rank. + """ + # Bypass the function if we are using only 1 GPU. + if (not torch.distributed.is_initialized() or self.world_size == 1): + return tensor_dict + + if not atlas_inference(): + group = self.device_group + metadata_group = self.cpu_group + assert src < self.world_size, f"Invalid src rank ({src})" + + rank_in_group = self.rank_in_group + if rank_in_group == src: + metadata_list: List[Tuple[Any, Any]] = [] + assert isinstance( + tensor_dict, + dict), (f"Expecting a dictionary, got {type(tensor_dict)}") + metadata_list, tensor_list = _split_tensor_dict(tensor_dict) + # `metadata_list` lives in CPU memory. + # `broadcast_object_list` has serialization & deserialization, + # all happening on CPU. Therefore, we can use the CPU group. + self.broadcast_object(metadata_list, src=src) + async_handles = [] + for tensor in tensor_list: + if tensor.numel() == 0: + # Skip broadcasting empty tensors. + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + handle = torch.distributed.broadcast(tensor, + src=self.ranks[src], + group=metadata_group, + async_op=True) + else: + # use group for GPU tensors + handle = torch.distributed.broadcast(tensor, + src=self.ranks[src], + group=group, + async_op=True) + async_handles.append(handle) + for async_handle in async_handles: + async_handle.wait() + + else: + metadata_list = self.broadcast_object(None, src=src) + tensor_dict = {} + async_handles = [] + for key, value in metadata_list: + if isinstance(value, TensorMetadata): + tensor = torch.empty(value.size, + dtype=value.dtype, + device=value.device) + if tensor.numel() == 0: + # Skip broadcasting empty tensors. 
+ tensor_dict[key] = tensor + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + handle = torch.distributed.broadcast( + tensor, + src=self.ranks[src], + group=metadata_group, + async_op=True) + else: + # use group for GPU tensors + handle = torch.distributed.broadcast( + tensor, + src=self.ranks[src], + group=group, + async_op=True) + async_handles.append(handle) + tensor_dict[key] = tensor + else: + tensor_dict[key] = value + for async_handle in async_handles: + async_handle.wait() + return tensor_dict diff --git a/vllm_mindspore/model_executor/layers/linear.py b/vllm_mindspore/model_executor/layers/linear.py index c81f0e32..fc56df74 100644 --- a/vllm_mindspore/model_executor/layers/linear.py +++ b/vllm_mindspore/model_executor/layers/linear.py @@ -607,6 +607,7 @@ class RowParallelLinear(LinearBase): def weight_loader(self, param, loaded_weight): tp_rank = get_tensor_model_parallel_rank() + param_data = param.data input_dim = getattr(param, "input_dim", None) shard_size = self.input_size_per_partition start_idx = tp_rank * shard_size diff --git a/vllm_mindspore/model_executor/model_loader/weight_utils.py b/vllm_mindspore/model_executor/model_loader/weight_utils.py index 6bf2dd4c..e02de0ab 100644 --- a/vllm_mindspore/model_executor/model_loader/weight_utils.py +++ b/vllm_mindspore/model_executor/model_loader/weight_utils.py @@ -25,6 +25,8 @@ import mindspore as ms from mindspore import Parameter from safetensors import safe_open from tqdm.auto import tqdm +from vllm_mindspore.utils import atlas_inference +import numpy as np from vllm.model_executor.model_loader.weight_utils import (_BAR_FORMAT, enable_tqdm) @@ -66,12 +68,11 @@ def safetensors_weights_iterator( ): with safe_open(st_file, framework="np") as f: for name in f.keys(): # noqa: SIM118 - # Return a lightweight PySafeSlice object that uses file - # pointer offset internally to read Safetensor on demand, - # avoiding memory explosion. 
Actual data can be obtained - # through slicing operation like param[start:end] - param = f.get_slice(name) - yield name, param + # TODO: use slice + x = f.get_tensor(name) + x = x.astype(np.float16) \ + if (str(x.dtype) == 'bfloat16' and atlas_inference()) else x + yield name, ms.tensor(x) def default_weight_loader(param: Parameter, loaded_weight: Any) -> None: diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index f0e5621e..5fc07fd2 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -36,7 +36,7 @@ from vllm_mindspore.model_executor.models.utils import is_use_ringmla from vllm_mindspore.model_executor.utils import set_model_context from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE, create_kv_cache from vllm_mindspore.v1.attention.backends.ms_attn import MsAttentionMetadata - +from vllm_mindspore.utils import atlas_inference, FORMAT_TYPE class AttentionWrapper: @@ -47,12 +47,33 @@ class AttentionWrapper: vllm_config.parallel_config) head_size = vllm_config.model_config.get_head_size() num_block = 0 - self.kv_shape = [num_block, block_size, num_kv_heads, head_size] - self.kv_cache = [ - (create_kv_cache(self.kv_shape, vllm_config.model_config.dtype), - create_kv_cache(self.kv_shape, vllm_config.model_config.dtype)) - for _ in range(vllm_config.parallel_config.pipeline_parallel_size) - ] + + if atlas_inference(): + self.kv_shape = [num_block, block_size, num_kv_heads * head_size] + self.kv_cache = [ + ( + ops.auto_generate.format_cast( + ms.mint.zeros( + self.kv_shape, dtype=vllm_config.model_config.dtype + ), + FORMAT_TYPE['nz'], + ), + ops.auto_generate.format_cast( + ms.mint.zeros( + self.kv_shape, dtype=vllm_config.model_config.dtype + ), + FORMAT_TYPE['nz'], + ), + ) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + else: + self.kv_shape = [num_block, block_size, num_kv_heads, head_size] + self.kv_cache = [ + (create_kv_cache(self.kv_shape, vllm_config.model_config.dtype), + create_kv_cache(self.kv_shape, vllm_config.model_config.dtype)) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] self.attn_type = AttentionType.DECODER @@ -74,7 +95,22 @@ class MLAAttentionWrapper(AttentionWrapper): if kv_cache_dtype is None: kv_cache_dtype = vllm_config.model_config.dtype self.dtype = kv_cache_dtype - if not self.use_ringmla: + self.use_mla_op = bool( + vllm_config.additional_config + and vllm_config.additional_config.get('use_mla_op') == 1) + if atlas_inference(): + self.kv_cache = [ + ( + ops.auto_generate.format_cast( + ms.mint.zeros( + self.kv_shape, dtype=vllm_config.model_config.dtype + ), + FORMAT_TYPE['nz'], + ), + ) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + elif not self.use_mla_op or not self.use_ringmla: self.kv_cache = [ ( create_kv_cache( @@ -445,7 +481,8 @@ class NativeModel(MsModelBase): block_size = self.cache_config.block_size num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() - kv_cache_shape = (None, block_size, num_kv_heads, head_size) + kv_cache_shape = (None, block_size, num_kv_heads * head_size) if atlas_inference() \ + else (None, block_size, num_kv_heads, head_size) kv_cache_dtype = (self.model_config.dtype if self.cache_config.cache_dtype == "auto" else diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index db8e31c0..04115596 
100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -35,7 +35,8 @@ if TYPE_CHECKING: else: Qwen2Config = None -from mindspore import Parameter, Tensor, mint, nn +from mindspore import Parameter, Tensor, mint, nn, ops +import mindspore as ms from vllm.attention.backends.abstract import AttentionType from vllm.config import CacheConfig, VllmConfig @@ -45,6 +46,7 @@ from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.sequence import IntermediateTensors from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm_mindspore.utils import atlas_inference, FORMAT_TYPE from vllm_mindspore.attention import Attention from vllm_mindspore.model_executor.layers.activation import SiluAndMul from vllm_mindspore.model_executor.layers.layernorm import RMSNorm @@ -408,9 +410,35 @@ class Qwen2Model(nn.Cell): param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) + # Norm type in weights may be f32 + if(loaded_weight.dtype != param.dtype): + loaded_weight = loaded_weight.to(dtype=param.dtype) weight_loader(param, loaded_weight) loaded_params.add(name) + def adjust_weight(params_dict): + if not atlas_inference(): + return + + target_keywords = [ + "qkv_proj.weight", + "o_proj.weight", + "gate_up_proj.weight", + "down_proj.weight", + # "lm_head.weight", + ] + + for name, param in params_dict.items(): + if any(name.endswith(keyword) for keyword in target_keywords): + cast_weight = ops.auto_generate.format_cast(param, FORMAT_TYPE['nz']) + ms.runtime.synchronize() + param.set_data(cast_weight) + + if atlas_inference(): + ms.runtime.synchronize() + adjust_weight(params_dict) + ms.runtime.synchronize() + return loaded_params diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index f3a9adbe..4da8b795 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -436,3 +436,96 @@ def ms_memory_profiling( result.non_torch_increase = diff_from_create.non_torch_memory result.profile_time = diff_profile.timestamp result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa + + +def view(self, *shape_or_dtype): + from mindspore._c_expression import typing + if len(shape_or_dtype) == 1 and isinstance(shape_or_dtype[0], typing.Type): + target_dtype = shape_or_dtype[0] + ori_shape = self.shape + target_shape = (-1, ) + if len(ori_shape) > 1: + target_shape = ori_shape[:-1] + target_shape + out = np.frombuffer( + self.numpy(), + torch.ops.creation._TypeDict.get(target_dtype, np.float32)) + if not out.flags.aligned: + out = np.require(out, requirements=["ALIGNED"]) + if target_dtype == ms.bfloat16: + return ms.Tensor.from_numpy(out.astype( + np.float32)).astype(target_dtype).reshape(target_shape) + return ms.Tensor.from_numpy(out).reshape(target_shape) + result = [] + if type(shape_or_dtype) is tuple: + for items in shape_or_dtype: + if not isinstance(items, int): + for item in items: + if not isinstance(item, int): + result.append(item.item()) + else: + result.append(item) + else: + result.append(items) + return ms.ops.reshape(self, result) + +def is_version_ge(current_version, base_version): + """ + return current_version >= base_version. + Check whether the current version is higher than or equal to the base version. + for current_version: 1.8.1, base_version: 1.11.0, it return False. + """ + version_split_char = '.' 
+ if version_split_char not in base_version or version_split_char not in current_version: + raise ValueError("The version string must contain the `.` separator. " + "For example, current_version: 1.8.1, base_version: 1.11.0.") + for x, y in zip(current_version.split(version_split_char), base_version.split(version_split_char)): + if not x.isdigit() or not y.isdigit(): + continue + if int(x) != int(y): + return int(x) >= int(y) + return True + +def get_ascend_soc_version(): + """Get the Ascend SoC version.""" + if is_version_ge(ms.__version__, "2.2.0"): + from mindspore._c_expression import MSContext + return MSContext.get_instance().get_ascend_soc_version() + ascend_chip_type = os.getenv("ASCEND_CHIP_TYPE", "UNSET") + if ascend_chip_type not in ["910a", "910b", "UNSET"]: + raise EnvironmentError(f"ASCEND_CHIP_TYPE should be in ['910a', '910b'], but got {ascend_chip_type}") + if ascend_chip_type == "UNSET": + logger.info("Environment variables need to be set manually to obtain the chip type, " + "which can be set as follows:\n" + "For Atlas 800, run 'export ASCEND_CHIP_TYPE=910a' before the program runs.\n" + "For Atlas 800T A2, run 'export ASCEND_CHIP_TYPE=910b' before the program runs.\n" + "To detect the chip type automatically, MindSpore 2.2 or later is recommended.") + return ascend_chip_type + +def atlas_inference(): + device = get_ascend_soc_version() + return device in ['310p', 'ascend310p'] + +def check_ready(): + from mindspore import set_context + + # Common environment variables for prediction. + set_context(jit_config={"jit_level": "O0", "infer_boost": "on"}) + default_env = { + "MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST": + "FlashAttentionScore,PagedAttention", + } + if atlas_inference(): + default_env["MS_ENABLE_INTERNAL_BOOST"] = "off" + env_setup(default_env) + + if os.getenv("MS_MEMPOOL_BLOCK_SIZE"): + set_context( + mempool_block_size=f"{os.environ['MS_MEMPOOL_BLOCK_SIZE']}GB") + + if is_mindformers_model_backend(): + logger.info("Run with MindFormers backend!") + elif is_mindone_model_backend(): + logger.info("Run with MindONE backend!") + else: + logger.info("Run with native model backend!") + register_connector() diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index 54fb277c..2ee11fde 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -41,9 +41,10 @@ from vllm_mindspore.model_executor.layers.rotary_embedding import ( InferMRotaryEmbedding as MRotaryEmbedding) from vllm_mindspore.model_executor.models.utils import is_use_ringmla from vllm_mindspore.utils import (create_kv_cache, get_dtype_size, - get_valid_dtype, is_310p) + get_valid_dtype, is_310p, FORMAT_TYPE) from vllm_mindspore.v1.kv_cache_interface import MLAQuantFullAttentionSpec + logger = init_logger(__name__) @@ -443,6 +444,9 @@ def _reshape_kv_cache_tensors( kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) + if atlas_inference(): + *dims, second_last, last = kv_cache_shape + kv_cache_shape = (*dims, second_last * last) try: kv_cache_stride_order = self.attn_backends[ i].get_kv_cache_stride_order() @@ -483,6 +487,14 @@ def _reshape_kv_cache_tensors( cache_block_nz = ops.auto_generate.format_cast( cache_block, 29) kv_cache_layer.append(cache_block_nz) + elif atlas_inference(): + from mindspore.common.api import _pynative_executor + cache_block_nz = ops.auto_generate.format_cast(cache_block, FORMAT_TYPE['nz'])
+ _pynative_executor.sync() + import gc + del cache_block + gc.collect() + kv_cache_layer.append(cache_block_nz) else: kv_cache_layer.append(cache_block) kv_caches[layer_name] = mutable(tuple(kv_cache_layer)) diff --git a/vllm_mindspore/worker/cache_engine.py b/vllm_mindspore/worker/cache_engine.py index b57b8833..7675379c 100644 --- a/vllm_mindspore/worker/cache_engine.py +++ b/vllm_mindspore/worker/cache_engine.py @@ -22,17 +22,26 @@ # isort:skip_file import mindspore as ms -from mindspore import mutable, mint +from mindspore import mutable, mint, ops from typing import List from vllm.logger import init_logger -from vllm_mindspore.utils import MsKVCache, get_valid_dtype +from vllm_mindspore.utils import MsKVCache, get_valid_dtype, atlas_inference, FORMAT_TYPE logger = init_logger(__name__) def create_block(shape, dtype, name=None, device=None): - blocks = mint.empty(shape, dtype=dtype, device=device) + from mindspore.common.api import _pynative_executor + blocks = mint.empty(*shape, dtype=dtype, device=device) + if device == "Ascend" and atlas_inference(): + blocks_nz = ops.auto_generate.format_cast(blocks, FORMAT_TYPE['nz']) + _pynative_executor.sync() + import gc + del blocks + gc.collect() + ms.hal.empty_cache() + return blocks_nz return blocks @@ -44,6 +53,9 @@ def ms_allocate_kv_cache( """Allocates KV cache on the specified device.""" kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size) + if atlas_inference(): + *dims, second_last, last = kv_cache_shape + kv_cache_shape = (*dims, second_last * last) kv_cache: List[MsKVCache] = [] self.dtype = get_valid_dtype(self.dtype) diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index 7fd89fc5..6ab97c1b 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -28,7 +28,7 @@ from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.sequence import SequenceGroupMetadata -from vllm_mindspore.utils import STR_DTYPE_TO_TENSOR_DTYPE +from vllm_mindspore.utils import STR_DTYPE_TO_TENSOR_DTYPE, atlas_inference logger = init_logger(__name__) @@ -140,7 +140,8 @@ def _dummy_run(self, block_size = self.cache_config.block_size num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() - kv_shape = [0, block_size, num_kv_heads, head_size] + kv_shape = [0, block_size, num_kv_heads * head_size] if atlas_inference() else \ + [0, block_size, num_kv_heads, head_size] kv_caches = mutable([ mutable( ( -- Gitee From 11dfb2b2431d27781bfac97f25ee241171d7d65a Mon Sep 17 00:00:00 2001 From: one_east Date: Thu, 24 Jul 2025 20:31:06 +0800 Subject: [PATCH 02/14] CPU bind for 910B and 910C --- vllm_mindspore/worker/worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index cef21c02..24b1ed02 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -16,6 +16,7 @@ """Adapted functions for mindspore in Worker.""" import math +import subprocess import os import subprocess -- Gitee From 04053ec77184c93b3231d1d8d063d2a19f3e2f8a Mon Sep 17 00:00:00 2001 From: HighCloud Date: Wed, 30 Jul 2025 15:24:10 +0800 Subject: [PATCH 03/14] cpu bind support 310p --- vllm_mindspore/worker/worker.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index 24b1ed02..81db9bc4 100644 
--- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -159,6 +159,9 @@ def wrapper_worker_bind_cpu(fun): # Bind CPU with wrapper when workers are initializing. # Support 910B, 910C and 310P. local_rank = kwargs.get("local_rank") + parallel_config = kwargs.get("vllm_config").parallel_config + local_rank = (parallel_config.data_parallel_rank_local * + parallel_config.world_size + local_rank) bind_cpu(local_rank) fun(*arg, **kwargs) -- Gitee From 12d6427217264a940855c8ec729e2cf8906edcd6 Mon Sep 17 00:00:00 2001 From: superxf Date: Wed, 23 Jul 2025 15:28:07 +0800 Subject: [PATCH 04/14] support qwq --- vllm_mindspore/__init__.py | 40 ++++++ vllm_mindspore/config.py | 3 + vllm_mindspore/engine/arg_utils.py | 123 +++++++++++++++++- .../model_executor/layers/linear.py | 43 ++++-- .../layers/quantization/__init__.py | 49 +++++++ .../layers/quantization/base_config.py | 3 + .../quantization/smooth_quant_modelslim.py | 39 +++--- .../model_loader/default_loader.py | 99 ++++++++++++++ .../model_executor/model_loader/utils.py | 55 ++++++++ .../model_loader/weight_utils.py | 109 +++++++++++++++- vllm_mindspore/model_executor/models/qwen2.py | 5 + 11 files changed, 527 insertions(+), 41 deletions(-) create mode 100644 vllm_mindspore/model_executor/model_loader/default_loader.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 31e84d38..7073717d 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -561,6 +561,46 @@ sys.modules["vllm.entrypoints.openai.tool_parsers.deepseekv3_tool_parser"] = ( from vllm_mindspore.entrypoints.__main__ import ( patch_server_run_api_server_worker_proc, ) +from vllm_mindspore.model_executor.model_loader.utils import ( + process_weights_after_loading) + +vllm.model_executor.model_loader.utils.process_weights_after_loading = ( + process_weights_after_loading) +vllm.model_executor.model_loader.base_loader.process_weights_after_loading = ( + process_weights_after_loading) + +from vllm_mindspore.model_executor.layers.quantization import ( + get_quantization_config) + +vllm.model_executor.layers.quantization.get_quantization_config = ( + get_quantization_config) +vllm.config.get_quantization_config = get_quantization_config +vllm.model_executor.model_loader.weight_utils.get_quantization_config = ( + get_quantization_config) + +from vllm_mindspore.model_executor.model_loader.weight_utils import ( + get_quant_config) + +vllm.model_executor.model_loader.weight_utils.get_quant_config = ( + get_quant_config) +vllm.config.get_quant_config = get_quant_config + +from vllm_mindspore.model_executor.layers.quantization import ( + QuantizationMethods) + +vllm.model_executor.layers.quantization.QuantizationMethods = ( + QuantizationMethods) + +from vllm_mindspore.engine.arg_utils import get_kwargs + +vllm.engine.arg_utils.get_kwargs = get_kwargs + +from vllm_mindspore.model_executor.model_loader.default_loader import ( + _prepare_weights) +from vllm.model_executor.model_loader.default_loader import DefaultModelLoader + +DefaultModelLoader._prepare_weights = _prepare_weights + patch_server_run_api_server_worker_proc() from vllm_mindspore.model_executor.models.registry import _normalize_archs diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 7f93164b..5f7da8c5 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -264,6 +264,9 @@ def _get_and_verify_dtype( if torch_dtype in _STR_DTYPE_TO_TORCH_DTYPE: torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[torch_dtype] + if is_310p() and torch_dtype == 
torch.bfloat16: + return torch.float16 + return torch_dtype diff --git a/vllm_mindspore/engine/arg_utils.py b/vllm_mindspore/engine/arg_utils.py index f7a3d6aa..9803e158 100644 --- a/vllm_mindspore/engine/arg_utils.py +++ b/vllm_mindspore/engine/arg_utils.py @@ -19,19 +19,132 @@ # limitations under the License. """Adaption for arguments utils.""" +import argparse +import json import threading -from typing import get_args +from dataclasses import MISSING, fields, is_dataclass +from typing import Any, Literal, get_origin import torch import vllm.envs as envs -from vllm.config import (GuidedDecodingBackendV1, LoadFormat, ModelConfig, - ParallelConfig, SchedulerConfig) -from vllm.engine.arg_utils import (EngineArgs, _raise_or_fallback, - _warn_or_fallback) from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext logger = init_logger(__name__) +from pydantic import TypeAdapter, ValidationError +from vllm.config import (ConfigType, GuidedDecodingBackendV1, LoadFormat, + ModelConfig, ParallelConfig, SchedulerConfig) +from vllm.engine.arg_utils import (EngineArgs, TypeHint, _raise_or_fallback, + _warn_or_fallback, contains_type, get_args, + get_attr_docs, get_type, get_type_hints, + human_readable_int, is_not_builtin, + literal_to_kwargs, optional_type, + parse_type, union_dict_and_str) + +from vllm_mindspore.model_executor.layers.quantization import ( + QUANTIZATION_METHODS) + + +def get_kwargs(cls: ConfigType) -> dict[str, Any]: + cls_docs = get_attr_docs(cls) + kwargs = {} + for field in fields(cls): + type_hints: set[TypeHint] = get_type_hints(field.type) + + # If the field is a dataclass, we can use the model_validate_json + generator = (th for th in type_hints if is_dataclass(th)) + dataclass_cls = next(generator, None) + + # Get the default value of the field + if field.default is not MISSING: + default = field.default + elif field.default_factory is not MISSING: + default = field.default_factory() + + # Get the help text for the field + name = field.name + help = cls_docs[name].strip() + # Escape % for argparse + help = help.replace("%", "%%") + + # Initialise the kwargs dictionary for the field + kwargs[name] = {"default": default, "help": help} + + # Set other kwargs based on the type hints + json_tip = """\n\nShould either be a valid JSON string or JSON keys + passed individually. For example, the following sets of arguments are + equivalent:\n\n + - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n + - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n\n""" + if dataclass_cls is not None: + + def parse_dataclass(val: str, cls=dataclass_cls) -> Any: + try: + if hasattr(cls, "from_cli"): + return cls.from_cli(val) + return TypeAdapter(cls).validate_json(val) + except ValidationError as e: + raise argparse.ArgumentTypeError(repr(e)) from e + + kwargs[name]["type"] = parse_dataclass + kwargs[name]["help"] += json_tip + elif contains_type(type_hints, bool): + # Creates --no- and -- flags + kwargs[name]["action"] = argparse.BooleanOptionalAction + elif contains_type(type_hints, Literal): + kwargs[name].update(literal_to_kwargs(type_hints)) + elif contains_type(type_hints, tuple): + type_hint = get_type(type_hints, tuple) + types = get_args(type_hint) + tuple_type = types[0] + assert all(t is tuple_type for t in types if t is not Ellipsis), ( + "All non-Ellipsis tuple elements must be of the same " + f"type. 
Got {types}.") + kwargs[name]["type"] = tuple_type + kwargs[name]["nargs"] = "+" if Ellipsis in types else len(types) + elif contains_type(type_hints, list): + type_hint = get_type(type_hints, list) + types = get_args(type_hint) + assert len(types) == 1, ( + "List type must have exactly one type. Got " + f"{type_hint} with types {types}") + kwargs[name]["type"] = types[0] + kwargs[name]["nargs"] = "+" + elif contains_type(type_hints, int): + kwargs[name]["type"] = int + # Special case for large integers + if name in {"max_model_len", "max_num_batched_tokens"}: + kwargs[name]["type"] = human_readable_int + elif contains_type(type_hints, float): + kwargs[name]["type"] = float + elif (contains_type(type_hints, dict) + and (contains_type(type_hints, str) + or any(is_not_builtin(th) for th in type_hints))): + kwargs[name]["type"] = union_dict_and_str + elif contains_type(type_hints, dict): + kwargs[name]["type"] = parse_type(json.loads) + kwargs[name]["help"] += json_tip + elif (contains_type(type_hints, str) + or any(is_not_builtin(th) for th in type_hints)): + kwargs[name]["type"] = str + else: + raise ValueError( + f"Unsupported type {type_hints} for argument {name}.") + + # If the type hint was a sequence of literals, use the helper function + # to update the type and choices + if get_origin(kwargs[name].get("type")) is Literal: + kwargs[name].update(literal_to_kwargs({kwargs[name]["type"]})) + + # If None is in type_hints, make the argument optional. + # But not if it's a bool, argparse will handle this better. + if type(None) in type_hints and not contains_type(type_hints, bool): + kwargs[name]["type"] = optional_type(kwargs[name]["type"]) + if kwargs[name].get("choices"): + kwargs[name]["choices"].append("None") + if field.name == "quantization": + kwargs[name]["choices"] = QUANTIZATION_METHODS + return kwargs def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: diff --git a/vllm_mindspore/model_executor/layers/linear.py b/vllm_mindspore/model_executor/layers/linear.py index fc56df74..ecf6298c 100644 --- a/vllm_mindspore/model_executor/layers/linear.py +++ b/vllm_mindspore/model_executor/layers/linear.py @@ -18,7 +18,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """ Linear methods for quantized linear layers. 
""" - from abc import abstractmethod from typing import Optional, Union @@ -343,14 +342,23 @@ class MergedColumnParallelLinear(ColumnParallelLinear): assert loaded_shard_id < len(self.output_sizes) shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size shard_size = self.output_sizes[loaded_shard_id] // tp_size - - start_idx = tp_rank * shard_size - loaded_weight = split_loaded_weight(loaded_weight, output_dim, - start_idx, shard_size) - - assert loaded_weight.shape == (shard_size, param.shape[1]) - param[shard_offset:shard_offset + - shard_size, :] = ms.from_numpy(loaded_weight) + param_data = param.data + param_data = param_data.narrow(output_dim, shard_offset, + shard_size) + start_idx = tp_rank * shard_size + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size).contiguous() + assert param_data.shape == loaded_weight.shape + if len(loaded_weight.shape) == 2: + param[shard_offset:shard_offset + + shard_size, :] = loaded_weight + else: + param[shard_offset:shard_offset + shard_size] = loaded_weight + else: + assert param.shape == loaded_weight.shape + if loaded_weight.dtype == ms.float32 and param.dtype == ms.float16: + loaded_weight = loaded_weight.astype(ms.float16) + param.set_data(loaded_weight.contiguous()) class QKVParallelLinear(ColumnParallelLinear): @@ -433,6 +441,11 @@ class QKVParallelLinear(ColumnParallelLinear): loaded_weight, loaded_shard_id: Optional[str] = None): output_dim = getattr(param, "output_dim", None) + if output_dim is None: + if loaded_weight.dtype == ms.float32 and param.dtype == ms.float16: + loaded_weight = loaded_weight.astype(ms.float16) + param.set_data(loaded_weight.contiguous()) + return tp_rank = get_tensor_model_parallel_rank() # QKV loaded weight is already fused on disk (qkv safetensors). @@ -482,11 +495,13 @@ class QKVParallelLinear(ColumnParallelLinear): start_idx, shard_size) loaded_weight = ms.from_numpy(loaded_weight) - if param.name.endswith("weight"): - assert loaded_weight.shape == (shard_size, param.shape[1]) - if param.name.endswith("bias"): - assert loaded_weight.shape == (shard_size, ) - param[shard_offset:shard_offset + shard_size] = loaded_weight + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size).contiguous() + assert param_data.shape == loaded_weight.shape + if len(loaded_weight.shape) == 2: + param[shard_offset:shard_offset + shard_size, :] = loaded_weight + else: + param[shard_offset:shard_offset + shard_size] = loaded_weight class RowParallelLinear(LinearBase): diff --git a/vllm_mindspore/model_executor/layers/quantization/__init__.py b/vllm_mindspore/model_executor/layers/quantization/__init__.py index e69de29b..6c9e2e41 100644 --- a/vllm_mindspore/model_executor/layers/quantization/__init__.py +++ b/vllm_mindspore/model_executor/layers/quantization/__init__.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +from typing import Literal, get_args + +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + +QuantizationMethods = Literal["smoothquant"] +QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods)) + +# The customized quantization methods which will be added to this dict. +_CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {} + + +def get_quantization_config(quantization: str) -> type[QuantizationConfig]: + if quantization not in QUANTIZATION_METHODS: + raise ValueError(f"Invalid quantization method: {quantization}") + + # lazy import to avoid triggering `torch.compile` too early + from .smooth_quant_modelslim import SmoothQuantModelSlimConfig + method_to_config: dict[str, type[QuantizationConfig]] = { + "smoothquant": SmoothQuantModelSlimConfig + } + # Update the `method_to_config` with customized quantization methods. + method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) + + return method_to_config[quantization] + + +__all__ = [ + "QuantizationConfig", "get_quantization_config", "QUANTIZATION_METHODS", + "QuantizationMethods" +] diff --git a/vllm_mindspore/model_executor/layers/quantization/base_config.py b/vllm_mindspore/model_executor/layers/quantization/base_config.py index 37144a43..5728702d 100644 --- a/vllm_mindspore/model_executor/layers/quantization/base_config.py +++ b/vllm_mindspore/model_executor/layers/quantization/base_config.py @@ -142,6 +142,9 @@ class QuantizationConfig(ABC): """ raise NotImplementedError + def get_cache_scale(self, name: str) -> Optional[str]: + return None + def method_has_implemented_embedding( method_class: type[QuantizeMethodBase]) -> bool: diff --git a/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py b/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py index 96697046..cad2f322 100644 --- a/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py +++ b/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py @@ -14,12 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import re from typing import Any, Optional import mindspore import numpy as np -from mindspore import Parameter, Tensor, mint +from mindspore import Parameter, Tensor, ops from mindspore.common.initializer import initializer from mindspore.ops.auto_generate import (DynamicQuantExt, GroupedMatmul, GroupedMatmulV4, QuantBatchMatmul) @@ -107,7 +106,7 @@ class SmoothQuantModelSlimConfig(QuantizationConfig): return BaseKVCacheMethod(self) if isinstance(layer, LinearBase): - if quant_config and quant_config.lower() == 'w8a8': + if quant_config and quant_config.lower() == 'w8a8s': return A8W8LinearMethod(self) if quant_config and quant_config.lower() == 'w8a8_dyn': self.dynamic_quant = True @@ -224,12 +223,12 @@ class A8W8LinearMethod(LinearMethodBase): self.params_dtype), name="input_offset") if self.is_310p: - quant_bias_ = Parameter(initializer( + quant_bias = Parameter(initializer( 'zeros', (self.output_size_per_partition // self.quant_config.pack_factor, ), mindspore.int32), - name="quant_bias_") + name="quant_bias") else: - quant_bias_ = None + quant_bias = None set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) set_weight_attrs(weight_scale, {"output_dim": 0}) @@ -240,11 +239,12 @@ class A8W8LinearMethod(LinearMethodBase): set_weight_attrs(deq_scale, extra_weight_attrs) set_weight_attrs(input_scale, extra_weight_attrs) set_weight_attrs(input_offset, extra_weight_attrs) - if quant_bias_ is not None: - set_weight_attrs(quant_bias_, extra_weight_attrs) - layer.insert_param_to_cell("quant_bias_", quant_bias_) + if quant_bias is not None: + set_weight_attrs(quant_bias, extra_weight_attrs) + set_weight_attrs(quant_bias, {"output_dim": 0}) + layer.insert_param_to_cell("quant_bias", quant_bias) else: - layer.quant_bias_ = None + layer.quant_bias = None layer.insert_param_to_cell("weight", weight) layer.insert_param_to_cell("weight_scale", weight_scale) @@ -285,7 +285,7 @@ class A8W8LinearMethod(LinearMethodBase): input_offset = Parameter(initializer('zeros', input_scale_shape, self.params_dtype), name="input_offset") - quant_bias_ = None + quant_bias = None set_weight_attrs(weight, { "ep_dim": 0, "input_dim": 1, @@ -300,11 +300,11 @@ class A8W8LinearMethod(LinearMethodBase): set_weight_attrs(deq_scale, extra_weight_attrs) set_weight_attrs(input_scale, extra_weight_attrs) set_weight_attrs(input_offset, extra_weight_attrs) - if quant_bias_ is not None: - set_weight_attrs(quant_bias_, extra_weight_attrs) - layer.insert_param_to_cell("quant_bias_", quant_bias_) + if quant_bias is not None: + set_weight_attrs(quant_bias, extra_weight_attrs) + layer.insert_param_to_cell("quant_bias", quant_bias) else: - layer.quant_bias_ = None + layer.quant_bias = None layer.insert_param_to_cell("weight", weight) layer.insert_param_to_cell("weight_scale", weight_scale) @@ -313,8 +313,7 @@ class A8W8LinearMethod(LinearMethodBase): layer.insert_param_to_cell("input_offset", input_offset) def process_weights_after_loading(self, layer: mindspore.nn.Cell) -> None: - input_offset = np.array([0]) - params_dtype = layer.params_dtype + input_offset = layer.input_offset.asnumpy() layer.input_offset = Parameter(Tensor(input_offset, dtype=mindspore.int8), name=layer.input_offset.name) @@ -335,7 +334,7 @@ class A8W8LinearMethod(LinearMethodBase): layer.weight_scale = Parameter(Tensor( weight_scale, dtype=layer.weight_scale.dtype), name=layer.weight_scale.name) - if not self.is_310p and params_dtype is mindspore.bfloat16: + if not self.is_310p and self.params_dtype is mindspore.bfloat16: deq_scale = 
layer.deq_scale.asnumpy().astype(np.int32).view( np.float32) layer.deq_scale = Parameter(Tensor(deq_scale, @@ -373,10 +372,10 @@ class A8W8LinearMethod(LinearMethodBase): group_type=0, group_list_type=0 if cumsum_flag else 1)[0] else: - qx = self.matmul(qx, weight, deq_scale, None, layer.quant_bias_, + qx = self.matmul(qx, weight, deq_scale, None, layer.quant_bias, None) if bias is not None: - qx = mint.add(qx, bias) + qx = self.bias_add(qx, bias) qx = qx.reshape(output_shape) return qx diff --git a/vllm_mindspore/model_executor/model_loader/default_loader.py b/vllm_mindspore/model_executor/model_loader/default_loader.py new file mode 100644 index 00000000..dbd6ea8b --- /dev/null +++ b/vllm_mindspore/model_executor/model_loader/default_loader.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: Apache-2.0 +import glob +import os +from typing import Optional + +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME +from vllm.config import LoadFormat +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.model_executor.model_loader.weight_utils import ( + download_safetensors_index_file_from_hf, download_weights_from_hf, + filter_duplicate_safetensors_files, filter_files_not_needed_for_inference) + + +def _prepare_weights( + self, + model_name_or_path: str, + revision: Optional[str], + fall_back_to_pt: bool, + allow_patterns_overrides: Optional[list[str]], +) -> tuple[str, list[str], bool]: + """Prepare weights for the model. + + If the model is not local, it will be downloaded.""" + model_name_or_path = (self._maybe_download_from_modelscope( + model_name_or_path, revision) or model_name_or_path) + + is_local = os.path.isdir(model_name_or_path) + load_format = self.load_config.load_format + use_safetensors = False + index_file = SAFE_WEIGHTS_INDEX_NAME + # Some quantized models use .pt files for storing the weights. + if load_format == LoadFormat.AUTO: + allow_patterns = ["*.safetensors", "*.bin"] + elif (load_format == LoadFormat.SAFETENSORS + or load_format == LoadFormat.FASTSAFETENSORS): + use_safetensors = True + allow_patterns = ["*.safetensors"] + elif load_format == LoadFormat.MISTRAL: + use_safetensors = True + allow_patterns = ["consolidated*.safetensors"] + index_file = "consolidated.safetensors.index.json" + elif load_format == LoadFormat.PT: + allow_patterns = ["*.pt"] + elif load_format == LoadFormat.NPCACHE: + allow_patterns = ["*.bin"] + else: + raise ValueError(f"Unknown load_format: {load_format}") + + if fall_back_to_pt: + allow_patterns += ["*.pt"] + + if allow_patterns_overrides is not None: + allow_patterns = allow_patterns_overrides + + if not is_local: + hf_folder = download_weights_from_hf( + model_name_or_path, + self.load_config.download_dir, + allow_patterns, + revision, + ignore_patterns=self.load_config.ignore_patterns, + ) + else: + hf_folder = model_name_or_path + hf_weights_files: list[str] = [] + for pattern in allow_patterns: + hf_weights_files += glob.glob(os.path.join(hf_folder, pattern)) + if len(hf_weights_files) == 0: + tp_rank = get_tensor_model_parallel_rank() + hf_weights_files += glob.glob( + os.path.join(hf_folder, f"rank_{tp_rank}", pattern)) + if len(hf_weights_files) > 0: + if pattern == "*.safetensors": + use_safetensors = True + break + if use_safetensors: + # For models like Mistral-7B-Instruct-v0.3 + # there are both sharded safetensors files and a consolidated + # safetensors file. Using both breaks. + # Here, we download the `model.safetensors.index.json` and filter + # any files not found in the index. 
+ if not is_local: + download_safetensors_index_file_from_hf( + model_name_or_path, + index_file, + self.load_config.download_dir, + revision, + ) + hf_weights_files = filter_duplicate_safetensors_files( + hf_weights_files, hf_folder, index_file) + else: + hf_weights_files = filter_files_not_needed_for_inference( + hf_weights_files) + + if len(hf_weights_files) == 0: + raise RuntimeError( + f"Cannot find any model weights with `{model_name_or_path}`") + + return hf_folder, hf_weights_files, use_safetensors diff --git a/vllm_mindspore/model_executor/model_loader/utils.py b/vllm_mindspore/model_executor/model_loader/utils.py index fb155027..c8b6bc16 100644 --- a/vllm_mindspore/model_executor/model_loader/utils.py +++ b/vllm_mindspore/model_executor/model_loader/utils.py @@ -22,8 +22,12 @@ from mindspore import nn from vllm.config import ModelConfig, ModelImpl from vllm.model_executor.model_loader.utils import logger +from vllm.attention import Attention + from vllm.model_executor.models import ModelRegistry +from vllm_mindspore.model_executor.layers.quantization.base_config import ( + QuantizeMethodBase) from vllm_mindspore.model_executor.models.registry import ( AUTO_SELECT_FIXED_MODEL, MindSporeModelRegistry, mcore_support_list, mf_supported, mindone_supported) @@ -188,3 +192,54 @@ def get_ms_model_architecture( raise RecursionError("MindSpore unsupported reward model task now!") return model_cls, arch + + +def convert_uint64_to_fp32(arr: np.ndarray): + arr_fp32 = arr.view(np.float32) + output = arr_fp32[:, :, 0::2] + return output + + +def np_int4data_pack_to_int8_3d(np_data): + np_data = np_data.astype(np.int8) + np_data &= 0x000F + np_data[::, ::, 0::2] <<= 0 + np_data[::, ::, 1::2] <<= 4 + np_int4_data = np_data[::, ::, 0::2] | np_data[::, ::, 1::2] + return np_int4_data + + +def unpack_int8_to_int4_3d(packed_data): + low_nibbles = (packed_data & 0x0F).astype(np.uint8) + high_nibbles = ((packed_data >> 4) & 0x0F).astype(np.uint8) + + unpacked = np.empty((*packed_data.shape[:2], packed_data.shape[2] * 2), + dtype=np.uint8) + unpacked[..., 0::2] = low_nibbles + unpacked[..., 1::2] = high_nibbles + + return unpacked + + +def process_weights_after_loading(model: nn.Module, model_config: ModelConfig, + target_device: torch.device) -> None: + for _, module in model.named_modules(): + quant_method = getattr(module, "quant_method", None) + if isinstance(quant_method, QuantizeMethodBase): + # # When quant methods need to process weights after loading + # # (for repacking, quantizing, etc), they expect parameters + # # to be on the global target device. This scope is for the + # # case where cpu offloading is used, where we will move the + # # parameters onto device for processing and back off after. + # with device_loading_context(module, target_device): + quant_method.process_weights_after_loading(module) + + # Currently only used by MLA. + # NOTE: This intentionally happens after other modules so we can easily + # decompress the weights for MLA. 
+ for _, module in model.named_modules(): + if isinstance(module, Attention) and \ + hasattr(module, "process_weights_after_loading"): + # TODO(lucas): see if there is a way to unify the signatures + # of process_weights_after_loading + module.process_weights_after_loading(model_config.dtype) diff --git a/vllm_mindspore/model_executor/model_loader/weight_utils.py b/vllm_mindspore/model_executor/model_loader/weight_utils.py index e02de0ab..4e535c01 100644 --- a/vllm_mindspore/model_executor/model_loader/weight_utils.py +++ b/vllm_mindspore/model_executor/model_loader/weight_utils.py @@ -17,16 +17,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import glob +import json +import os from collections.abc import Generator from typing import Any +import huggingface_hub import mindspore as ms +import numpy as np +from huggingface_hub import snapshot_download from mindspore import Parameter from safetensors import safe_open from tqdm.auto import tqdm +from vllm.config import LoadConfig +from vllm.model_executor.model_loader.weight_utils import (DisabledTqdm, + get_lock) + +from vllm_mindspore.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm_mindspore.platforms.ascend import ModelConfig from vllm_mindspore.utils import atlas_inference -import numpy as np from vllm.model_executor.model_loader.weight_utils import (_BAR_FORMAT, enable_tqdm) @@ -79,3 +90,97 @@ def default_weight_loader(param: Parameter, loaded_weight: Any) -> None: """Default weight loader.""" loaded_weight = loaded_weight[:] param.set_data(ms.Tensor(loaded_weight, dtype=param.dtype)) + + +def get_quant_config(model_config: ModelConfig, + load_config: LoadConfig) -> QuantizationConfig: + + from vllm_mindspore.model_executor.layers.quantization import ( + get_quantization_config) + quant_cls = get_quantization_config(model_config.quantization) + + # GGUF doesn't have config file + if model_config.quantization == "gguf": + return quant_cls.from_config({}) + + # Read the quantization config from the HF model config, if available. + hf_quant_config = getattr(model_config.hf_config, "quantization_config", + None) + # some vision model may keep quantization_config in their text_config + hf_text_config = getattr(model_config.hf_config, "text_config", None) + if hf_quant_config is None and hf_text_config is not None: + hf_quant_config = getattr(hf_text_config, "quantization_config", None) + if hf_quant_config is None: + # compressed-tensors uses a compressions_config + hf_quant_config = getattr(model_config.hf_config, "compression_config", + None) + if hf_quant_config is not None: + if os.path.isdir(model_config.model): + quant_config_file = os.path.join( + model_config.model, + quant_cls.get_config_filenames()[0]) + with open(quant_config_file) as f: + quant_config = json.load(f) + return quant_cls.from_config(hf_quant_config | quant_config) + + # In case of bitsandbytes/QLoRA, get quant config from the adapter model. 
+ if model_config.quantization == "bitsandbytes": + if (not load_config.model_loader_extra_config + or "qlora_adapter_name_or_path" + not in load_config.model_loader_extra_config): + return quant_cls.from_config({"adapter_name_or_path": ""}) + model_name_or_path = load_config.model_loader_extra_config[ + "qlora_adapter_name_or_path"] + + else: + model_name_or_path = model_config.model + is_local = os.path.isdir(model_name_or_path) + if not is_local: + # Download the config files. + with get_lock(model_name_or_path, load_config.download_dir): + hf_folder = snapshot_download( + model_name_or_path, + revision=model_config.revision, + allow_patterns="*.json", + cache_dir=load_config.download_dir, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + tqdm_class=DisabledTqdm, + ) + else: + hf_folder = model_name_or_path + + possible_config_filenames = quant_cls.get_config_filenames() + + # If the quantization config is not found, use the default config. + if not possible_config_filenames: + return quant_cls() + + config_files = glob.glob(os.path.join(hf_folder, "*.json")) + + quant_config_files = [ + f for f in config_files if any( + f.endswith(x) for x in possible_config_filenames) + ] + if len(quant_config_files) == 0: + raise ValueError( + f"Cannot find the config file for {model_config.quantization}") + if len(quant_config_files) > 1: + raise ValueError( + f"Found multiple config files for {model_config.quantization}: " + f"{quant_config_files}") + + quant_config_file = quant_config_files[0] + with open(quant_config_file) as f: + config = json.load(f) + + if model_config.quantization == "bitsandbytes": + config["adapter_name_or_path"] = model_name_or_path + elif model_config.quantization == "modelopt": + if config["producer"]["name"] == "modelopt": + return quant_cls.from_config(config) + else: + raise ValueError( + f"Unsupported quantization config" + f" found for {model_config.quantization} in {f}.") + + return quant_cls.from_config(config) diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 04115596..b9ddd507 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -45,6 +45,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.sequence import IntermediateTensors from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.distributed import get_tensor_model_parallel_rank from vllm_mindspore.utils import atlas_inference, FORMAT_TYPE from vllm_mindspore.attention import Attention @@ -377,6 +378,10 @@ class Qwen2Model(nn.Cell): ] for name, loaded_weight in weights: + if get_tensor_model_parallel_rank( + ) > 0 and "o_proj.quant_bias" in name: + continue + if "rotary_emb.inv_freq" in name: continue if ("rotary_emb.cos_cached" in name -- Gitee From 2449b6755a07fe78724fe0b918bd9d13a8914af1 Mon Sep 17 00:00:00 2001 From: superxf Date: Wed, 30 Jul 2025 16:47:52 +0800 Subject: [PATCH 05/14] fix new branch --- .../model_executor/layers/linear.py | 43 ++++++------------- .../model_loader/weight_utils.py | 23 +++++++++- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/vllm_mindspore/model_executor/layers/linear.py b/vllm_mindspore/model_executor/layers/linear.py index ecf6298c..109118cf 100644 --- a/vllm_mindspore/model_executor/layers/linear.py +++ b/vllm_mindspore/model_executor/layers/linear.py @@ -342,23 +342,14 @@ class 
MergedColumnParallelLinear(ColumnParallelLinear): assert loaded_shard_id < len(self.output_sizes) shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size shard_size = self.output_sizes[loaded_shard_id] // tp_size - param_data = param.data - param_data = param_data.narrow(output_dim, shard_offset, - shard_size) - start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size).contiguous() - assert param_data.shape == loaded_weight.shape - if len(loaded_weight.shape) == 2: - param[shard_offset:shard_offset + - shard_size, :] = loaded_weight - else: - param[shard_offset:shard_offset + shard_size] = loaded_weight - else: - assert param.shape == loaded_weight.shape - if loaded_weight.dtype == ms.float32 and param.dtype == ms.float16: - loaded_weight = loaded_weight.astype(ms.float16) - param.set_data(loaded_weight.contiguous()) + + start_idx = tp_rank * shard_size + loaded_weight = split_loaded_weight(loaded_weight, output_dim, + start_idx, shard_size) + if param.name.endswith("weight"): + assert loaded_weight.shape == (shard_size, param.shape[1]) + param[shard_offset:shard_offset + + shard_size] = ms.from_numpy(loaded_weight) class QKVParallelLinear(ColumnParallelLinear): @@ -441,11 +432,6 @@ class QKVParallelLinear(ColumnParallelLinear): loaded_weight, loaded_shard_id: Optional[str] = None): output_dim = getattr(param, "output_dim", None) - if output_dim is None: - if loaded_weight.dtype == ms.float32 and param.dtype == ms.float16: - loaded_weight = loaded_weight.astype(ms.float16) - param.set_data(loaded_weight.contiguous()) - return tp_rank = get_tensor_model_parallel_rank() # QKV loaded weight is already fused on disk (qkv safetensors). @@ -495,13 +481,11 @@ class QKVParallelLinear(ColumnParallelLinear): start_idx, shard_size) loaded_weight = ms.from_numpy(loaded_weight) - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size).contiguous() - assert param_data.shape == loaded_weight.shape - if len(loaded_weight.shape) == 2: - param[shard_offset:shard_offset + shard_size, :] = loaded_weight - else: - param[shard_offset:shard_offset + shard_size] = loaded_weight + if param.name.endswith("weight"): + assert loaded_weight.shape == (shard_size, param.shape[1]) + if param.name.endswith("bias"): + assert loaded_weight.shape == (shard_size, ) + param[shard_offset:shard_offset + shard_size] = loaded_weight class RowParallelLinear(LinearBase): @@ -622,7 +606,6 @@ class RowParallelLinear(LinearBase): def weight_loader(self, param, loaded_weight): tp_rank = get_tensor_model_parallel_rank() - param_data = param.data input_dim = getattr(param, "input_dim", None) shard_size = self.input_size_per_partition start_idx = tp_rank * shard_size diff --git a/vllm_mindspore/model_executor/model_loader/weight_utils.py b/vllm_mindspore/model_executor/model_loader/weight_utils.py index 4e535c01..4a0fdcf8 100644 --- a/vllm_mindspore/model_executor/model_loader/weight_utils.py +++ b/vllm_mindspore/model_executor/model_loader/weight_utils.py @@ -31,7 +31,9 @@ from mindspore import Parameter from safetensors import safe_open from tqdm.auto import tqdm from vllm.config import LoadConfig -from vllm.model_executor.model_loader.weight_utils import (DisabledTqdm, +from vllm.model_executor.model_loader.weight_utils import (_BAR_FORMAT, + DisabledTqdm, + enable_tqdm, get_lock) from vllm_mindspore.model_executor.layers.quantization.base_config import ( @@ -63,6 +65,11 @@ def split_loaded_weight(loaded_weight, shard_dim, start_idx, shard_size): 
loaded_weight = loaded_weight[:, :, start_idx:end_idx] else: raise ValueError("shard_dim:{} is not supported.".format(shard_dim)) + loaded_weight = ( + loaded_weight.astype(np.float16) + if (str(loaded_weight.dtype) == 'bfloat16' and is_310p()) + else loaded_weight + ) return loaded_weight @@ -79,16 +86,30 @@ def safetensors_weights_iterator( ): with safe_open(st_file, framework="np") as f: for name in f.keys(): # noqa: SIM118 +<<<<<<< HEAD # TODO: use slice x = f.get_tensor(name) x = x.astype(np.float16) \ if (str(x.dtype) == 'bfloat16' and atlas_inference()) else x yield name, ms.tensor(x) +======= + # Return a lightweight PySafeSlice object that uses file + # pointer offset internally to read Safetensor on demand, + # avoiding memory explosion. Actual data can be obtained + # through slicing operation like param[start:end] + param = f.get_slice(name) + yield name, param +>>>>>>> 8858529 (fix new branch) def default_weight_loader(param: Parameter, loaded_weight: Any) -> None: """Default weight loader.""" loaded_weight = loaded_weight[:] + loaded_weight = ( + loaded_weight.astype(np.float16) + if (str(loaded_weight.dtype) == 'bfloat16' and is_310p()) + else loaded_weight + ) param.set_data(ms.Tensor(loaded_weight, dtype=param.dtype)) -- Gitee From 3f4b5d6255f08cec22eefd50909a02beb63ecb6d Mon Sep 17 00:00:00 2001 From: HighCloud Date: Tue, 22 Jul 2025 14:36:53 +0800 Subject: [PATCH 06/14] change atlas_inference to is_310p --- vllm_mindspore/distributed/parallel_state.py | 5 +- .../model_loader/weight_utils.py | 10 +- .../models/mf_models/mf_model_base.py | 217 ----------- .../models/mf_models/weight_processor.py | 342 ++++++++++++++++++ .../model_executor/models/model_base.py | 13 +- vllm_mindspore/model_executor/models/qwen2.py | 6 +- vllm_mindspore/utils.py | 2 + vllm_mindspore/v1/worker/gpu_model_runner.py | 4 +- vllm_mindspore/worker/cache_engine.py | 7 +- vllm_mindspore/worker/model_runner.py | 6 +- 10 files changed, 367 insertions(+), 245 deletions(-) delete mode 100644 vllm_mindspore/model_executor/models/mf_models/mf_model_base.py create mode 100644 vllm_mindspore/model_executor/models/mf_models/weight_processor.py diff --git a/vllm_mindspore/distributed/parallel_state.py b/vllm_mindspore/distributed/parallel_state.py index 697196fa..a3ef9fd8 100644 --- a/vllm_mindspore/distributed/parallel_state.py +++ b/vllm_mindspore/distributed/parallel_state.py @@ -4,7 +4,8 @@ from torch.distributed import ProcessGroup from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union) from vllm.distributed.parallel_state import _split_tensor_dict, TensorMetadata -from vllm_mindspore.utils import atlas_inference +from vllm_mindspore.utils import is_310p + def gc_broadcast_tensor_dict( self, @@ -20,7 +21,7 @@ def gc_broadcast_tensor_dict( if (not torch.distributed.is_initialized() or self.world_size == 1): return tensor_dict - if not atlas_inference(): + if not is_310p(): group = self.device_group metadata_group = self.cpu_group assert src < self.world_size, f"Invalid src rank ({src})" diff --git a/vllm_mindspore/model_executor/model_loader/weight_utils.py b/vllm_mindspore/model_executor/model_loader/weight_utils.py index 4a0fdcf8..3642f234 100644 --- a/vllm_mindspore/model_executor/model_loader/weight_utils.py +++ b/vllm_mindspore/model_executor/model_loader/weight_utils.py @@ -39,7 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import (_BAR_FORMAT, from vllm_mindspore.model_executor.layers.quantization.base_config import ( QuantizationConfig) from 
vllm_mindspore.platforms.ascend import ModelConfig -from vllm_mindspore.utils import atlas_inference +from vllm_mindspore.utils import is_310p from vllm.model_executor.model_loader.weight_utils import (_BAR_FORMAT, enable_tqdm) @@ -86,20 +86,12 @@ def safetensors_weights_iterator( ): with safe_open(st_file, framework="np") as f: for name in f.keys(): # noqa: SIM118 -<<<<<<< HEAD - # TODO: use slice - x = f.get_tensor(name) - x = x.astype(np.float16) \ - if (str(x.dtype) == 'bfloat16' and atlas_inference()) else x - yield name, ms.tensor(x) -======= # Return a lightweight PySafeSlice object that uses file # pointer offset internally to read Safetensor on demand, # avoiding memory explosion. Actual data can be obtained # through slicing operation like param[start:end] param = f.get_slice(name) yield name, param ->>>>>>> 8858529 (fix new branch) def default_weight_loader(param: Parameter, loaded_weight: Any) -> None: diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py deleted file mode 100644 index 20969730..00000000 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ /dev/null @@ -1,217 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Copyright 2025 Huawei Technologies Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from abc import abstractmethod -from collections.abc import Iterable -from typing import Optional, Union - -import mindspore as ms -from mindformers.core.context import build_mf_context -from mindformers.core.parallel_config import build_parallel_config -from mindformers.tools.register.config import MindFormerConfig -from mindformers.tools.utils import is_pynative -from mindspore import Tensor, nn -from mindspore.common.api import _pynative_executor -from mindspore.communication import get_rank -from vllm.config import VllmConfig -from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.distributed.parallel_state import get_dp_group -from vllm.forward_context import get_forward_context -from vllm.logger import init_logger -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors - -from vllm_mindspore.model_executor.models.attention_mask import ( - LowerTriangularMask) -from vllm_mindspore.model_executor.models.model_base import MsModelBase -from vllm_mindspore.model_executor.models.utils import is_use_ringmla - -try: - # Need to apply dllm pd patch on vllm to use pd disagg related functions - from vllm.attention.layer import (maybe_save_kv_layer_to_connector, - wait_for_kv_layer_from_connector) - from vllm.distributed.kv_transfer import is_v1_kv_transfer_group - kv_transfer_supported = True -except: # noqa: E722 - kv_transfer_supported = False - -logger = init_logger(__name__) - - -class MfModelBase(MsModelBase): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: - super().__init__(vllm_config=vllm_config, prefix=prefix) - - model_config_path = os.getenv("MINDFORMERS_MODEL_CONFIG") - if model_config_path is None: - raise RuntimeError('For "MindFormers" model backend, environments ' - 'MINDFORMERS_MODEL_CONFIG should be set!') - - self.mf_config = MindFormerConfig(model_config_path) - self.rank_id = get_rank() - self.dp_size = get_dp_group() - - self.kv_transfer_config = vllm_config.kv_transfer_config - build_mf_context(self.mf_config) - build_parallel_config(self.mf_config) - self.mf_config.model.model_config.parallel_config = ( - self.mf_config.parallel_config) - self.mf_config.model.model_config.parallel_config.model_parallel = ( - get_tensor_model_parallel_world_size()) - self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 - self.use_ringmla = is_use_ringmla(vllm_config, self.mf_config) - self.is_chunked = False - self._generate_model_config() - if not hasattr(self, 'mf_model_config'): - raise RuntimeError('mf_model_config not initialized') - self.casual_mask = LowerTriangularMask( - dtype=self.mf_model_config.compute_dtype, - max_model_len=self.model_config.max_model_len) - self.network, self.lm_head = self._create_network() - - affinity_config = self.mf_config.get('context', - {}).get('affinity_cpu_list', {}) - if isinstance(affinity_config, dict): - ms.runtime.set_cpu_affinity(True, affinity_config) - - self._set_dynamic_inputs() - - @property - def ready_lm_head(self) -> nn.Cell: - if self.lm_head is None: - raise RuntimeError("lm_head not initialized") - return self.lm_head - - @abstractmethod - def _generate_model_config(self): - raise NotImplementedError( - "Function _generate_model_config should be Implemented!") - - @abstractmethod - def _create_network(self): - raise NotImplementedError( - "Function _create_network should be Implemented!") - - # DLLM - def is_decoder_task(self) -> bool: - if self.kv_transfer_config is None: - return False - - 
return self.kv_transfer_config.is_kv_consumer - - # DLLM - def is_prefill_task(self) -> bool: - if self.kv_transfer_config is None: - return False - - return self.kv_transfer_config.is_kv_producer - - def _set_dynamic_inputs(self): - self.network.set_dynamic_inputs() - if not hasattr(self, 'mf_model_config'): - raise RuntimeError('mf_model_config not initialized') - dynamic_hidden_states = Tensor( - shape=[None, None], dtype=self.mf_model_config.compute_dtype) - self.ready_lm_head.set_inputs(dynamic_hidden_states) - - def prepare_inputs(self, input_ids, positions): - return self.prepare_base_inputs(input_ids, positions) - - def update_model_inputs(self, model_inputs, **kwargs): - return model_inputs - - # DLLM - def connector_send_kvcache(self): - logger.debug("reached connector_send_kvcache") - _pynative_executor.sync() - forward_context = get_forward_context() - if not hasattr(self, 'mf_model_config'): - raise RuntimeError('mf_model_config not initialized') - for i in range(self.mf_model_config.num_layers): - kv_cache = self.kv_caches[i] - k_cache = kv_cache.kv_cache[forward_context.virtual_engine][0] - v_cache = kv_cache.kv_cache[forward_context.virtual_engine][1] - maybe_save_kv_layer_to_connector(str(i), (k_cache, v_cache)) - - # DLLM - def connector_wait_for_kv_layer(self): - logger.debug("reached connector_wait_for_kv_layer") - if not hasattr(self, 'mf_model_config'): - raise RuntimeError('mf_model_config not initialized') - for i in range(self.mf_model_config.num_layers): - wait_for_kv_layer_from_connector("key." + str(i)) - - def forward(self, - input_ids: Tensor, - positions: Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[Tensor] = None, - **kwargs) -> Union[Tensor, IntermediateTensors]: - model_inputs, is_prefill = self.prepare_inputs(input_ids, positions) - model_inputs = self.update_model_inputs(model_inputs, **kwargs) - - if is_prefill: - self.network.phase = "prefill" - if not self.set_flags or is_pynative(): - self.network.add_flags_custom(is_first_iteration=True) - hidden_states = self.network(**model_inputs) - self.network.phase = "increment" - if not self.set_flags or is_pynative(): - self.network.add_flags_custom(is_first_iteration=False) - self.set_flags = True - if kv_transfer_supported and is_v1_kv_transfer_group(): - self.connector_send_kvcache() - # DLLM - else: - if kv_transfer_supported: - if is_v1_kv_transfer_group() and self.is_prefill_task(): - self.connector_send_kvcache() - - if is_v1_kv_transfer_group() and self.is_decoder_task(): - self.connector_wait_for_kv_layer() - logger.debug("connector_wait_for_kv_layer success") - hidden_states = self.network(**model_inputs) - - return hidden_states - - def compute_logits( - self, - hidden_states: Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[Tensor]: - if sampling_metadata is not None: - selected_token_indices = sampling_metadata.selected_token_indices - if (selected_token_indices is not None - and selected_token_indices.numel() <= 0): - if not hasattr(self, 'mf_model_config'): - raise RuntimeError('mf_model_config not initialized') - logits = ms.mint.zeros( - (0, self.mf_model_config.vocab_size), - dtype=self.mf_model_config.compute_dtype) - else: - hidden_states = hidden_states.index_select( - 0, selected_token_indices) - logits = self.ready_lm_head(hidden_states) - logits = logits.view(-1, logits.shape[-1]) - else: - logits = self.ready_lm_head(hidden_states) - logits = logits.view(-1, logits.shape[-1]) - return logits - - def load_weights(self, 
weights: Iterable[tuple[str, Tensor]]) -> set[str]: - raise NotImplementedError("load_weight not implemented.") diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py new file mode 100644 index 00000000..5036323c --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -0,0 +1,342 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Copyright 2025 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +transform huggingface safetensor. +""" + +import os +from enum import Enum + +from mindformers.parallel_core.inference import parallel_state as ps +from mindspore.communication.management import get_group_size, get_rank +from safetensors import safe_open +from vllm_mindspore.utils import is_310p + +class EPMethod(Enum): + """ + EP method enums + """ + DEFAULT = 'default' + ALLTOALL = 'alltoall' + ALLGATHER = 'allgather' + + +class BaseWeightProcessor: + r""" + Provide model weight load and shards. + Args: + config (MF Config): The config of Infer model. + network (InferenceModelForCausalLM): The network of infer model. + + """ + + def __init__(self, config, network, is_quant, vllm_config): + self.vllm_config = vllm_config + self.is_310p = is_310p() + self.config = config + self.network = network + self.is_quant = is_quant + self.global_rank_id = get_rank() + self.global_group_size = get_group_size() + self.tp_group_size = ps.get_tensor_model_parallel_world_size() + self.dp_group_size = ps.get_data_parallel_world_size() + self.num_router_experts = self.config.moe_config.expert_num if \ + self.config.moe_config.expert_num else 1 + self.moe_ep_size = ps.get_moe_expert_parallel_world_size() + self.moe_tp_size = ps.get_moe_tensor_parallel_world_size() + self.tp_dp_size = ps.get_tensor_and_data_parallel_world_size() + self.ep_method = EPMethod.DEFAULT + if self.dp_group_size > 1\ + and self.moe_ep_size == self.global_group_size: + self.ep_method = EPMethod.ALLTOALL + elif self.dp_group_size > 1: + self.ep_method = EPMethod.ALLGATHER + self.tp_rank_id = ps.get_tensor_model_parallel_rank() + self.tp_dp_rank_id = ps.get_tensor_and_data_parallel_rank() + + self.ep_group_nums = self.num_router_experts // self.moe_ep_size + self.moe_ep_rank_id = ps.get_moe_expert_parallel_rank() + self.moe_tp_rank_id = ps.get_moe_tensor_parallel_rank() + self.ep_start = self.moe_ep_rank_id * self.ep_group_nums + self.ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums + + self.parameter_dict = {} + self.file_handles = {} + + def get_file_handles(self, filename): + if filename not in self.file_handles: + fp = safe_open(filename, framework="np") + self.file_handles[filename] = fp + return self.file_handles[filename] + + def release_file_handles(self): + del self.file_handles + + def get_safetensor_from_file(self, hf_param_name, src_hf_dir, + hf_weight_map): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = 
self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 + + def get_safetensor_from_file_split_tp_group(self, + hf_param_name, + src_hf_dir, + hf_weight_map, + split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + elif split_axis == 2: + split_size = shape[2] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = np_data[:, :, start:stop] + else: + raise ValueError( + "split_axis:{} is not supported.".format(split_axis)) + return split_data, qint4 + + def get_safetensor_from_file_split_tpdp_group(self, + hf_param_name, + src_hf_dir, + hf_weight_map, + split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.tp_dp_size + start = self.tp_dp_rank_id * split_size + stop = (self.tp_dp_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.tp_dp_size + start = self.tp_dp_rank_id * split_size + stop = (self.tp_dp_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + elif split_axis == 2: + split_size = shape[2] // self.tp_dp_size + start = self.tp_dp_rank_id * split_size + stop = (self.tp_dp_rank_id + 1) * split_size + split_data = np_data[:, :, start:stop] + else: + raise ValueError( + "split_axis:{} is not supported.".format(split_axis)) + return split_data, qint4 + + def get_safetensor_from_file_split_global_group(self, + hf_param_name, + src_hf_dir, + hf_weight_map, + split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.global_group_size + start = self.global_rank_id * split_size + stop = (self.global_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.global_group_size + start = self.global_rank_id * split_size + stop = (self.global_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + elif split_axis == 2: + split_size = shape[2] // self.global_group_size + start = self.global_rank_id * split_size + stop = (self.global_rank_id + 1) * split_size + split_data = np_data[:, :, 
start:stop] + else: + raise ValueError( + "split_axis:{} is not supported.".format(split_axis)) + + return split_data, qint4 + + def get_safetensor_from_file_split_moe_tp_group(self, + hf_param_name, + src_hf_dir, + hf_weight_map, + split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + else: + raise ValueError( + "split_axis:{} is not supported.".format(split_axis)) + + return split_data, qint4 + + def get_routed_safetensor_3_dim(self, + hf_param_name, + src_hf_dir, + hf_weight_map, + split_ep=False, + split_tp=False, + tp_axis=-1): + '''get_routed_safetensor_3_dim''' + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + if not split_tp and not split_ep: + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 + + np_data = sf_file.get_slice(hf_param_name) + if not split_tp and split_ep: + split_data = np_data[self.ep_start:self.ep_stop, :, :] + return split_data, qint4 + + shape = np_data.get_shape() + if tp_axis == 1: + split_size = shape[1] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[ + self.ep_start:self.ep_stop, + start:stop, :] if split_ep else np_data[:, start:stop, :] + elif tp_axis == 2: + split_size = shape[2] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[ + self.ep_start:self.ep_stop, :, + start:stop] if split_ep else np_data[:, :, start:stop] + else: + raise ValueError("tp_axis:{} is not supported.".format(tp_axis)) + return split_data, qint4 + + def get_routed_safetensor_2_dim(self, + hf_param_name, + src_hf_dir, + hf_weight_map, + split_ep=False, + split_tp=False, + tp_axis=-1): + '''get_moe_routed_safetensor_2_dim''' + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + if not split_tp and not split_ep: + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 + + np_data = sf_file.get_slice(hf_param_name) + if not split_tp and split_ep: + split_data = np_data[self.ep_start:self.ep_stop, :] + return split_data, qint4 + + shape = np_data.get_shape() + if tp_axis == 1: + split_size = shape[1] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[ + self.ep_start:self.ep_stop, + start:stop] if split_ep else np_data[:, start:stop] + else: + raise ValueError( + "split_tp is True but tp_axis:{} is not supported.".format( + 
tp_axis)) + return split_data, qint4 + + def split_weight_by_rank(self, weight, split_axis=0): + if self.tp_group_size == 1: + return weight + + shape = weight.shape + if split_axis == 0: + split_size = shape[0] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = weight[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = weight[:, start:stop] + else: + raise ValueError( + "split_axis:{} is not supported.".format(split_axis)) + return split_data + + def load_safetensors_shard(self, src_hf_dir): + """ load safetensors and shards """ + raise NotImplementedError( + "load_safetensors_shard method is not implemented.") diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 5fc07fd2..dd28a8c6 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -34,9 +34,9 @@ from vllm_mindspore.model_executor.models.attention_mask import ( LowerTriangularMask) from vllm_mindspore.model_executor.models.utils import is_use_ringmla from vllm_mindspore.model_executor.utils import set_model_context -from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE, create_kv_cache +from vllm_mindspore.utils import FORMAT_TYPE, STR_DTYPE_TO_MS_DTYPE, is_310p, create_kv_cache + from vllm_mindspore.v1.attention.backends.ms_attn import MsAttentionMetadata -from vllm_mindspore.utils import atlas_inference, FORMAT_TYPE class AttentionWrapper: @@ -48,7 +48,7 @@ class AttentionWrapper: head_size = vllm_config.model_config.get_head_size() num_block = 0 - if atlas_inference(): + if is_310p(): self.kv_shape = [num_block, block_size, num_kv_heads * head_size] self.kv_cache = [ ( @@ -98,7 +98,7 @@ class MLAAttentionWrapper(AttentionWrapper): self.use_mla_op = bool( vllm_config.additional_config and vllm_config.additional_config.get('use_mla_op') == 1) - if atlas_inference(): + if is_310p(): self.kv_cache = [ ( ops.auto_generate.format_cast( @@ -481,8 +481,9 @@ class NativeModel(MsModelBase): block_size = self.cache_config.block_size num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() - kv_cache_shape = (None, block_size, num_kv_heads * head_size) if atlas_inference() \ - else (None, block_size, num_kv_heads, head_size) + kv_cache_shape = (None, block_size, num_kv_heads * head_size) \ + if is_310p() else (None, block_size, num_kv_heads, + head_size) kv_cache_dtype = (self.model_config.dtype if self.cache_config.cache_dtype == "auto" else diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index b9ddd507..fa4dd43e 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -47,7 +47,7 @@ from vllm.sequence import IntermediateTensors from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.distributed import get_tensor_model_parallel_rank -from vllm_mindspore.utils import atlas_inference, FORMAT_TYPE +from vllm_mindspore.utils import is_310p, FORMAT_TYPE from vllm_mindspore.attention import Attention from vllm_mindspore.model_executor.layers.activation import SiluAndMul from vllm_mindspore.model_executor.layers.layernorm import RMSNorm @@ -422,7 +422,7 @@ class Qwen2Model(nn.Cell): loaded_params.add(name) def 
adjust_weight(params_dict): - if not atlas_inference(): + if not is_310p(): return target_keywords = [ @@ -439,7 +439,7 @@ class Qwen2Model(nn.Cell): ms.runtime.synchronize() param.set_data(cast_weight) - if atlas_inference(): + if is_310p(): ms.runtime.synchronize() adjust_weight(params_dict) ms.runtime.synchronize() diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index 4da8b795..20aa3878 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -312,6 +312,8 @@ def check_ready(): "MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST": "FlashAttentionScore,PagedAttention", } + if atlas_inference(): + default_env["MS_ENABLE_INTERNAL_BOOST"] = "off" env_setup(default_env) if os.getenv("MS_MEMPOOL_BLOCK_SIZE"): diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index 2ee11fde..56a8a120 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -444,7 +444,7 @@ def _reshape_kv_cache_tensors( kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) - if atlas_inference(): + if is_310p(): *dims, second_last, last = kv_cache_shape kv_cache_shape = (*dims, second_last * last) try: @@ -487,7 +487,7 @@ def _reshape_kv_cache_tensors( cache_block_nz = ops.auto_generate.format_cast( cache_block, 29) kv_cache_layer.append(cache_block_nz) - elif atlas_inference(): + elif is_310p(): from mindspore.common.api import _pynative_executor cache_block_nz = ops.auto_generate.format_cast(cache_block, FORMAT_TYPE['nz']) _pynative_executor.sync() diff --git a/vllm_mindspore/worker/cache_engine.py b/vllm_mindspore/worker/cache_engine.py index 7675379c..e8c20397 100644 --- a/vllm_mindspore/worker/cache_engine.py +++ b/vllm_mindspore/worker/cache_engine.py @@ -26,7 +26,8 @@ from mindspore import mutable, mint, ops from typing import List from vllm.logger import init_logger -from vllm_mindspore.utils import MsKVCache, get_valid_dtype, atlas_inference, FORMAT_TYPE +from vllm_mindspore.utils import (MsKVCache, get_valid_dtype, is_310p, + FORMAT_TYPE) logger = init_logger(__name__) @@ -34,7 +35,7 @@ logger = init_logger(__name__) def create_block(shape, dtype, name=None, device=None): from mindspore.common.api import _pynative_executor blocks = mint.empty(*shape, dtype=dtype, device=device) - if device == "Ascend" and atlas_inference(): + if device == "Ascend" and is_310p(): blocks_nz = ops.auto_generate.format_cast(blocks, FORMAT_TYPE['nz']) _pynative_executor.sync() import gc @@ -53,7 +54,7 @@ def ms_allocate_kv_cache( """Allocates KV cache on the specified device.""" kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size) - if atlas_inference(): + if is_310p(): *dims, second_last, last = kv_cache_shape kv_cache_shape = (*dims, second_last * last) kv_cache: List[MsKVCache] = [] diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index 6ab97c1b..1c37be98 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -28,7 +28,7 @@ from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.sequence import SequenceGroupMetadata -from vllm_mindspore.utils import STR_DTYPE_TO_TENSOR_DTYPE, atlas_inference +from vllm_mindspore.utils import STR_DTYPE_TO_TENSOR_DTYPE, is_310p logger = init_logger(__name__) @@ -140,8 +140,8 @@ def 
_dummy_run(self, block_size = self.cache_config.block_size num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() - kv_shape = [0, block_size, num_kv_heads * head_size] if atlas_inference() else \ - [0, block_size, num_kv_heads, head_size] + kv_shape = [0, block_size, num_kv_heads * head_size] \ + if is_310p() else [0, block_size, num_kv_heads, head_size] kv_caches = mutable([ mutable( ( -- Gitee From 51a9dd20459b45830f4a408fa6caa9b12340ebc1 Mon Sep 17 00:00:00 2001 From: luolihao Date: Thu, 24 Jul 2025 19:08:26 +0800 Subject: [PATCH 07/14] support qwq w8a8sc --- .../layers/quantization/__init__.py | 9 +- .../quantization/sparse_quant_modelslim.py | 182 ++++++++++++++++++ vllm_mindspore/model_executor/models/qwen2.py | 50 ++++- 3 files changed, 238 insertions(+), 3 deletions(-) create mode 100644 vllm_mindspore/model_executor/layers/quantization/sparse_quant_modelslim.py diff --git a/vllm_mindspore/model_executor/layers/quantization/__init__.py b/vllm_mindspore/model_executor/layers/quantization/__init__.py index 6c9e2e41..3c6c2da9 100644 --- a/vllm_mindspore/model_executor/layers/quantization/__init__.py +++ b/vllm_mindspore/model_executor/layers/quantization/__init__.py @@ -21,7 +21,10 @@ from typing import Literal, get_args from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -QuantizationMethods = Literal["smoothquant"] +QuantizationMethods = Literal[ + "smoothquant", + "sparsequant" +] QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods)) # The customized quantization methods which will be added to this dict. @@ -34,8 +37,10 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: # lazy import to avoid triggering `torch.compile` too early from .smooth_quant_modelslim import SmoothQuantModelSlimConfig + from .sparse_quant_modelslim import SparseQuantModelSlimConfig method_to_config: dict[str, type[QuantizationConfig]] = { - "smoothquant": SmoothQuantModelSlimConfig + "smoothquant": SmoothQuantModelSlimConfig, + "sparsequant": SparseQuantModelSlimConfig } # Update the `method_to_config` with customized quantization methods. method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) diff --git a/vllm_mindspore/model_executor/layers/quantization/sparse_quant_modelslim.py b/vllm_mindspore/model_executor/layers/quantization/sparse_quant_modelslim.py new file mode 100644 index 00000000..f6ede5ed --- /dev/null +++ b/vllm_mindspore/model_executor/layers/quantization/sparse_quant_modelslim.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +from typing import Any, Optional, Dict + +import torch +import numpy as np +import mindspore + +from mindspore.common.initializer import initializer +from mindspore import Parameter, ops, Tensor +from mindspore.ops.operations._infer_ops import QuantV2 +from mindspore.communication import get_rank +from vllm_mindspore.model_executor.layers.linear import LinearMethodBase, UnquantizedLinearMethod, LinearBase + +from .base_config import QuantizationConfig + + + +class SparseQuantModelSlimConfig(QuantizationConfig): + '''Config class for SparseQuant.''' + + def __init__( + self, + full_config: Dict[str, Any], + weight_bits: Optional[int] = 8, + group_size: Optional[int] = 1, + zero_point: Optional[bool] = True, + dynamic_quant: Optional[bool] = False, + kv_cache_bits: Optional[int] = 16, + modules_to_not_convert: Optional[list[str]] = None, + ) -> None: + super().__init__() + self.full_config = full_config + self.weight_bits = weight_bits + self.group_size = group_size + self.zero_point = zero_point + self.dynamic_quant = dynamic_quant + self.kv_cache_bits = kv_cache_bits + self.modules_to_not_convert = modules_to_not_convert or [] + + if self.weight_bits != 8: + raise ValueError( + "Currently, only 8-bit weight quantization is supported for " + f"A8W8SC, but got {self.weight_bits} bits.") + self.pack_factor = 8 // self.weight_bits + + def __repr__(self) -> str: + return (f"SparseConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, " + f"zero_point={self.zero_point}, " + f"modules_to_not_convert={self.modules_to_not_convert})") + + @staticmethod + def get_config_filenames() -> list[str]: + return [ + "quant_model_description.json" + ] + + @classmethod + def get_min_capability(cls) -> int: + """Minimum GPU capability to support the quantization method. + + E.g., 70 for Volta, 75 for Turing, 80 for Ampere. + This requirement is due to the custom CUDA kernels used by the + quantization method. 
+ """ + return -1 + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "SparseQuantModelSlimConfig": + return cls(config) + + def get_name(self) -> str: + return "SparseQuant" + + def get_supported_act_dtypes(self) -> list[torch.dtype]: + return [torch.int8, torch.float16, torch.bfloat16] + + def get_quant_method(self, layer: mindspore.nn.Cell, + prefix: str) -> "QuantizeMethodBase": + + rank_id = get_rank() + sparse_quant_description = self.full_config[f'rank_{rank_id}'] + if isinstance(layer, LinearBase) and sparse_quant_description[f"{prefix}.weight"].lower() == "w8a8s": + compress_weight_size = sparse_quant_description[f"{prefix}.weight.shape"] + compress_index_size = sparse_quant_description[f"{prefix}.index.shape"] + + return A8W8SCLinearMethod(self, compress_weight_size[0], compress_index_size[0]) + + return UnquantizedLinearMethod() + + +class A8W8SCLinearMethod(LinearMethodBase): + '''Linear method for A8W8SCLinearMethod.''' + + def __init__(self, quant_config: SparseQuantModelSlimConfig, compress_weight_size=None, compress_index_size=None): + self.quant_config = quant_config + self.compress_weight_size = compress_weight_size + self.compress_index_size = compress_index_size + + self.quant = QuantV2() + self.linear_sparse = ops.auto_generate.QuantLinearSparse() + + def create_weights(self, + layer: mindspore.nn.Cell, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype, + is_group_mm=False, + expert_num_per_partition=1, + **extra_weight_attrs): + if input_size_per_partition % self.quant_config.group_size != 0: + raise ValueError( + "The input size is not aligned with the quantized " + "weight shape. This can be caused by too large " + "tensor parallel size.") + + output_size_per_partition = sum(output_partition_sizes) + self.output_size_per_partition = output_size_per_partition + self.input_size_per_partition = input_size_per_partition + if output_size_per_partition % self.quant_config.pack_factor != 0: + raise ValueError( + "The output size is not aligned with the quantized " + "weight shape. 
This can be caused by too large " + "tensor parallel size.") + + weight = Parameter(initializer('normal', (self.compress_weight_size), mindspore.int8), name="weight") + index = Parameter(initializer('normal', (self.compress_index_size), mindspore.int8), name="index") + deq_scale = Parameter(initializer('normal', (self.output_size_per_partition), mindspore.int64), + name="deq_scale") + quant_bias = Parameter(initializer('zeros', (self.output_size_per_partition), mindspore.int32), + name="quant_bias") + input_scale = Parameter(Tensor(np.ones(self.input_size_per_partition), mindspore.float16), + name="input_scale") + input_offset = Parameter(Tensor(np.zeros(self.input_size_per_partition), mindspore.int8), + name="input_offset") + + layer.insert_param_to_cell("weight", weight) + layer.insert_param_to_cell("index", index) + layer.insert_param_to_cell("deq_scale", deq_scale) + layer.insert_param_to_cell("quant_bias", quant_bias) + layer.insert_param_to_cell("input_scale", input_scale) + layer.insert_param_to_cell("input_offset", input_offset) + + def apply(self, + layer: mindspore.nn.Cell, + x: mindspore.Tensor, + bias: mindspore.Parameter = None, group_list=None, cumsum_flag=False) -> mindspore.Tensor: + weight = layer.weight + index = layer.index + deq_scale = layer.deq_scale + quant_bias = layer.quant_bias + input_scale = layer.input_scale + input_offset = layer.input_offset + + output_shape = x.shape[:-1] + (self.output_size_per_partition,) + x = x.reshape(-1, self.input_size_per_partition) + + x = self.quant(x, input_scale, input_offset, False, "ROUND", mindspore.int8) + x = self.linear_sparse(x, weight, deq_scale, index, quant_bias) + + x = x.reshape(output_shape) + + return x \ No newline at end of file diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index fa4dd43e..eff9be31 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -64,6 +64,7 @@ from vllm_mindspore.model_executor.models.model_base import (NativeModel) from vllm_mindspore.model_executor.models.utils import ( PPMissingLayer, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +from mindspore.communication.management import get_rank class Qwen2MLP(nn.Cell): @@ -365,6 +366,50 @@ class Qwen2Model(nn.Cell): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states + def load_split_weights(self, weights: Iterable[tuple[str, Tensor]], + params_dict: dict[str, Parameter]): + weights_dict = dict(weights) + + for name, loaded_weight in weights_dict.items(): + if get_tensor_model_parallel_rank( + ) > 0 and "o_proj.quant_bias" in name: + continue + + if name not in params_dict: + continue + + param = params_dict[name] + param.set_data(loaded_weight.contiguous()) + + def adjust_weight(params_dict): + if not is_310p(): + return + + target_keywords = [ + "qkv_proj.weight", + "o_proj.weight", + "gate_up_proj.weight", + "down_proj.weight", + # "lm_head.weight", + ] + + rank_id = get_rank() + for name, param in params_dict.items(): + if any(name.endswith(keyword) for keyword in target_keywords): + weight_type = self.quant_config.full_config[f"rank_{rank_id}"][name] + if weight_type.lower() == "w8a8s": + # 压缩后权重不需要转Nz + continue + + cast_weight = ops.auto_generate.format_cast(param, FORMAT_TYPE['nz']) + ms.runtime.synchronize() + param.set_data(cast_weight) + + if is_310p(): + ms.runtime.synchronize() + adjust_weight(params_dict) + ms.runtime.synchronize() + def load_weights(self, weights: 
Iterable[tuple[str, Tensor]], params_dict: dict[str, Parameter]): loaded_params: set[str] = set() @@ -514,7 +559,10 @@ class Qwen2ForCausalLM(NativeModel, SupportsLoRA): def load_weights(self, weights: Iterable[tuple[str, Tensor]]) -> set[str]: params_dict = self.get_params_dict() - self.model.load_weights(weights, params_dict) + if self.vllm_config.model_config.quantization == "sparsequant": + self.model.load_split_weights(weights, params_dict) + else: + self.model.load_weights(weights, params_dict) def compute_logits( self, -- Gitee From 93fb64554ecd426c8b2b8a81d4245f4140e661a6 Mon Sep 17 00:00:00 2001 From: huangzhuo Date: Sat, 23 Aug 2025 09:34:11 +0800 Subject: [PATCH 08/14] graph mode support mutilora --- vllm_mindspore/lora/layers.py | 231 +++++------ vllm_mindspore/lora/models.py | 7 +- .../lora/punica_wrapper/punica_npu.py | 371 ++++++------------ vllm_mindspore/lora/utils.py | 24 +- .../model_executor/model_loader/utils.py | 31 +- 5 files changed, 239 insertions(+), 425 deletions(-) diff --git a/vllm_mindspore/lora/layers.py b/vllm_mindspore/lora/layers.py index d3f4b367..7d565138 100644 --- a/vllm_mindspore/lora/layers.py +++ b/vllm_mindspore/lora/layers.py @@ -24,16 +24,18 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Optional, Union, cast import mindspore as ms -from mindspore import mint +from mindspore import Parameter, ops, mint +from mindspore.common.initializer import initializer +import torch.nn.functional as F from transformers import PretrainedConfig from vllm.adapter_commons.layers import AdapterMapping from vllm.config import LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, - tensor_model_parallel_all_gather, - tensor_model_parallel_all_reduce) + tensor_model_parallel_all_gather) from vllm.distributed.utils import divide +# yapf: enable from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import ( LinearScalingRotaryEmbedding, RotaryEmbedding) @@ -320,49 +322,26 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): self.output_size, self.tp_size)) else: raise NotImplementedError - - self.lora_a_stacked = tuple( - mint.zeros( - ( - max_loras, - 1, - lora_a_out_size, - self.input_size, - ), - dtype=lora_config.lora_dtype, - ) for _ in range(self.n_slices)) - self.lora_b_stacked = tuple( - mint.zeros( - ( - max_loras, - 1, - lora_b_out_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - ) for _ in range(self.n_slices)) + self.lora_a_stacked = Parameter( + initializer('zeros', (max_loras, self.input_size, lora_a_out_size), + lora_config.lora_dtype)) + self.lora_b_stacked = Parameter( + initializer('zeros', (max_loras, lora_a_out_size, lora_b_out_size), + lora_config.lora_dtype)) if lora_config.bias_enabled: lora_bias_out_size = lora_b_out_size - self.lora_bias_stacked = tuple( - mint.zeros( - ( - max_loras, - 1, - lora_bias_out_size, - ), - dtype=lora_config.lora_dtype, - ) for _ in range(self.n_slices)) - self.output_slices = (self.lora_b_stacked[0].shape[2], ) + self.lora_bias_stacked = Parameter( + initializer('zeros', (max_loras, lora_bias_out_size), + lora_config.lora_dtype)) + else: + self.lora_bias_stacked = None def reset_lora(self, index: int): - for s_index in range(self.n_slices): - self.lora_a_stacked[s_index][index] = 0 - self.lora_b_stacked[s_index][index] = 0 - if self.lora_config.bias_enabled: - # Make mypy happy - self.lora_bias_stacked = 
cast(tuple[ms.Tensor, ...], - self.lora_bias_stacked) - self.lora_bias_stacked[s_index][index] = 0 + self.lora_a_stacked[index] = 0 + self.lora_b_stacked[index] = 0 + if self.lora_bias_stacked: + # Make mypy happy + self.lora_bias_stacked[index] = 0 def set_lora( self, @@ -376,8 +355,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers # store weights in a tuple of size 1. These two layers will # override this function. - assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) == - self.n_slices == 1) self.reset_lora(index) if self.tp_size > 1: @@ -385,29 +362,44 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): lora_b = self.slice_lora_b(lora_b) if lora_bias is not None: lora_bias = self.slice_bias(lora_bias) - - self.lora_a_stacked[0][index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[0][index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if lora_bias is not None: - - self.lora_bias_stacked = cast(tuple[ms.Tensor, ...], - self.lora_bias_stacked) - assert len(self.lora_bias_stacked) - self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_( - lora_bias.T, non_blocking=True) + if self.n_slices == 3: + self.lora_a_stacked[index, :, : lora_a[0].shape[1]] = lora_a[0] + self.lora_a_stacked[index, :, lora_a[0].shape[1] : lora_a[0].shape[1] + lora_a[1].shape[1]] = lora_a[1] + self.lora_a_stacked[index, :, lora_a[0].shape[1] + lora_a[1].shape[1] :] = lora_a[2] + self.lora_b_stacked[index, :lora_b[0].shape[0], :lora_b[0].shape[1]] = lora_b[0] + self.lora_b_stacked[index, lora_b[0].shape[0] : lora_b[0].shape[0] + lora_b[1].shape[0], lora_b[0].shape[1] : lora_b[0].shape[1] + lora_b[1].shape[1]] = lora_b[1] + self.lora_b_stacked[index, lora_b[0].shape[0] + lora_b[1].shape[0] :, lora_b[0].shape[1] + lora_b[1].shape[1] :] = lora_b[2] + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + lora_bias = ops.concat(lora_bias, axis=0) + self.lora_bias_stacked[index] = lora_bias + elif self.n_slices == 2: + self.lora_a_stacked[index, :, : lora_a[0].shape[1]] = lora_a[0] + self.lora_a_stacked[index, :, lora_a[0].shape[1]: lora_a[0].shape[1] + lora_a[1].shape[1]] = lora_a[1] + self.lora_b_stacked[index, :lora_b[0].shape[0], :lora_b[0].shape[1]] = lora_b[0] + self.lora_b_stacked[index, lora_b[0].shape[0]: lora_b[0].shape[0] + lora_b[1].shape[0], + lora_b[0].shape[1]: lora_b[0].shape[1] + lora_b[1].shape[1]] = lora_b[1] + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + lora_bias = ops.concat(lora_bias, axis=0) + self.lora_bias_stacked[index] = lora_bias + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + lora_bias = ops.concat(lora_bias, axis=0) + self.lora_bias_stacked[index] = lora_bias + else: + self.lora_a_stacked[index] = lora_a + self.lora_b_stacked[index] = lora_b + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + self.lora_bias_stacked[index] = lora_bias def apply(self, x: ms.Tensor, bias: Optional[ms.Tensor] = None) -> ms.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked, - self.lora_b_stacked, - self.lora_bias_stacked, 1.0, - self.output_slices) + output = self.punica_wrapper(output, x, self.lora_a_stacked, self.lora_b_stacked, + self.lora_bias_stacked, 1.0) return output @@ -540,7 +532,7 @@ class 
MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): model_config: Optional[PretrainedConfig] = None, ) -> None: """ - The main reason for overriding this function is to enhance code + The main reason for overriding this function is to enhance code maintainability. """ self.lora_config = lora_config @@ -548,36 +540,19 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): lora_a_output_size_per_partition = ( lora_config.max_lora_rank if not lora_config.fully_sharded_loras else divide(lora_config.max_lora_rank, self.tp_size)) - self.lora_a_stacked = tuple( - mint.zeros( - ( - max_loras, - 1, - lora_a_output_size_per_partition, - self.input_size, - ), - dtype=lora_config.lora_dtype, - ) for _ in range(self.n_slices)) - self.lora_b_stacked = tuple( - mint.zeros( - ( - max_loras, - 1, - output_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - ) for output_size in self.output_slices) + output_size = sum(self.output_slices) + self.lora_a_stacked = Parameter( + initializer('zeros', (max_loras, self.input_size, lora_a_output_size_per_partition * self.n_slices), + lora_config.lora_dtype)) + self.lora_b_stacked = Parameter( + initializer('zeros', (max_loras, lora_a_output_size_per_partition * self.n_slices, output_size), + lora_config.lora_dtype)) if lora_config.bias_enabled: - self.lora_bias_stacked = tuple( - mint.zeros( - ( - max_loras, - 1, - output_size, - ), - dtype=lora_config.lora_dtype, - ) for output_size in self.output_slices) + self.lora_bias_stacked = Parameter( + initializer('zeros', (max_loras, output_size), + lora_config.lora_dtype)) + else: + self.lora_bias_stacked = None def slice_lora_a( self, lora_a: list[Union[ms.Tensor, @@ -619,26 +594,20 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): lora_b = self.slice_lora_b(lora_b) if lora_bias is not None: lora_bias = self.slice_bias(lora_bias) - + lora_weight_list = [] for i in range(self.n_slices): - if (lora_a_i := lora_a[i]) is not None: - self.lora_a_stacked[i][ - index, 0, :lora_a_i.shape[1], :lora_a_i.shape[0]].copy_( - lora_a_i.T, non_blocking=True) - if (lora_b_i := lora_b[i]) is not None: - self.lora_b_stacked[i][ - index, 0, :lora_b_i.shape[1], :lora_b_i.shape[0]].copy_( - lora_b_i.T, non_blocking=True) + if (lora_a_i := lora_a[i]) is not None and (lora_b_i := lora_b[i]) is not None: + lora_weight_list.append(ops.matmul(lora_a_i.transpose(1, 0), lora_b_i.transpose(1, 0))) + lora_weight = ops.cat(tuple(lora_weight_list), axis=-1) + self.lora_weight[index] = lora_weight + lora_bias_list = [] if lora_bias is not None: - self.lora_bias_stacked = cast(tuple[ms.Tensor, ...], - self.lora_bias_stacked) for i in range(self.n_slices): if (lora_bias_i := lora_bias[i]) is not None: - self.lora_bias_stacked[i][index, - 0, :lora_bias_i.shape[0]].copy_( - lora_bias_i.T, - non_blocking=True) + lora_bias_list.appendd(lora_bias_i) + lora_bias = ops.cat(tuple(lora_weight_list), axis=0) + self.lora_bias_stacked[index] = lora_bias @classmethod @_not_fully_sharded_can_replace @@ -757,18 +726,6 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA): self.kv_shard_id, ) - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - """ - The main reason for overloading this function is to handle inconsistent - weight dimensions in qkv lora. 
- """ - super().create_lora_weights(max_loras, lora_config, model_config) - @classmethod @_not_fully_sharded_can_replace def can_replace_layer( @@ -836,7 +793,7 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): # Matrix multiply. output_parallel = self.apply(input_parallel) if self.base_layer.reduce_results and self.base_layer.tp_size > 1: - output_ = tensor_model_parallel_all_reduce(output_parallel) + output_ = self.base_layer.tensor_model_parallel_all_reduce(output_parallel) else: output_ = output_parallel @@ -1012,24 +969,22 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): return None if self.sharded_to_full_mapping_gpu is not None: - """ - Reindex full logits tensor to ensure 1:1 mapping between - index and token_id - Example for: - org_vocab_size = 4 - added_vocab_size = 2 - pad_to_size = 8 - tp_size = 2 - - indices: [0, 1, 2, 3, 4, 5, 6, 7] - token_id: [0, 1, 4, -1, 2, 3, 5, -1] - - Therefore, the mapping is expected to be: - [0, 1, 4, 6, 2, 3, 5, 7] so that when we reindex, - we get: - indices: [0, 1, 2, 3, 4, 5, 6, 7] - token_id: [0, 1, 2, 3, 4, 5, -1, -1] - """ + # Reindex full logits tensor to ensure 1:1 mapping between + # index and token_id + # Example for: + # org_vocab_size = 4 + # added_vocab_size = 2 + # pad_to_size = 8 + # tp_size = 2 + + # indices: [0, 1, 2, 3, 4, 5, 6, 7] + # token_id: [0, 1, 4, -1, 2, 3, 5, -1] + + # Therefore, the mapping is expected to be: + # [0, 1, 4, 6, 2, 3, 5, 7] so that when we reindex, + # we get: + # indices: [0, 1, 2, 3, 4, 5, 6, 7] + # token_id: [0, 1, 2, 3, 4, 5, -1, -1] logits = logits[:, self.sharded_to_full_mapping_gpu] lora_logits = mint.empty( diff --git a/vllm_mindspore/lora/models.py b/vllm_mindspore/lora/models.py index 621f609a..253242b0 100644 --- a/vllm_mindspore/lora/models.py +++ b/vllm_mindspore/lora/models.py @@ -20,6 +20,7 @@ """Models for Multi-LoRA.""" import os +import numpy as np from typing import Optional, Union import mindspore as ms @@ -33,6 +34,7 @@ from vllm.model_executor.models.utils import WeightsMapper from vllm.utils import is_pin_memory_available from vllm_mindspore.lora.layers import BaseLayerWithLoRA +from vllm_mindspore.utils import is_310p _GLOBAL_LORA_ID = 0 @@ -197,7 +199,10 @@ def from_local_checkpoint( check_unexpected_modules(f) for module in f.keys(): # noqa # vllm-mindspore add numpy to tensor - tensors[module] = mint.Tensor(f.get_tensor(module)) + np_data = f.get_tensor(module) + if is_310p() and str(np_data.dtype) == "bfloat16": + np_data = np_data.astype(np.float32).astype(np.float16) + tensors[module] = mint.Tensor(np_data) elif os.path.isfile(lora_bin_file_path): # When a bin file is provided, we rely on config to find unexpected # modules. 
diff --git a/vllm_mindspore/lora/punica_wrapper/punica_npu.py b/vllm_mindspore/lora/punica_wrapper/punica_npu.py index 0a60baf2..d9295f66 100644 --- a/vllm_mindspore/lora/punica_wrapper/punica_npu.py +++ b/vllm_mindspore/lora/punica_wrapper/punica_npu.py @@ -19,215 +19,110 @@ # isort: skip_file """Punica wrapper for NPU.""" -from typing import Callable +from typing import Callable, Optional -from mindspore import mint +from mindspore import mint, nn, Parameter, ops, dtype from mindspore.common import dtype as mstype +from mindspore.common.initializer import initializer +from mindspore.ops.auto_generate import grouped_matmul_v4 from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase from vllm_mindspore.lora.ops.torch_ops.lora_ops import ( bgmv_expand, bgmv_expand_slice, bgmv_shrink, sgmv_expand, - sgmv_expand_slice, sgmv_shrink) + sgmv_expand_slice, sgmv_shrink, sort_lora_by_token_count, einsum_ms) +from vllm_mindspore.model_executor.utils import get_model_context # The platforms that are compatible with the PyTorch-native implementation can # inherit this class -class PunicaWrapperNPU(PunicaWrapperBase): +class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): """ - PunicaWrapperNPU is designed to manage and provide metadata for the punica - kernel. The main function is to maintain the state information for + PunicaWrapperAtlas is designed to manage and provide metadata for the punica + kernel. The main function is to maintain the state information for Multi-LoRA, and to provide the interface for the pytorch punica ops. """ def __init__(self, max_num_batched_tokens, max_batches, device, **kwargs): + nn.Cell.__init__(self) PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device) - - def _shrink_prefill( - self, - y, - x, - w_t_all, - scale, - ): - sgmv_shrink( # type: ignore - x, - w_t_all, - y, - *self.prefill_metadata, - scale, - ) - - def _shrink_decode( - self, - y, - x, - w_t_all, - scale, - ): - bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) - - def _expand_prefill( - self, - y, - x, - w_t_all, - add_inputs, - ): - sgmv_expand( # type: ignore - x, - w_t_all, - y, - *self.prefill_metadata, - add_inputs, - ) - - def _expand_decode( - self, - y, - x, - w_t_all, - add_inputs, - ): - bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_inputs) - - def _expand_slice_prefill( - self, - y, - x, - w_t_all, - y_offset, - y_slice_size, - add_inputs, - ): - sgmv_expand_slice( # type: ignore - x, - w_t_all, - y, - *self.prefill_metadata, - y_offset, - y_slice_size, - add_inputs, - ) - - def _expand_slice_decode( - self, - y, - x, - w_t_all, - y_offset, - y_slice_size, - add_inputs, - ): - bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, - y_slice_size, add_inputs) - - def _apply_expand( - self, - y, - x, - w_t_all, - y_offset, - y_slice_size, - add_inputs, - ): - """ - Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` - computation, which is suitable for the - GEMM of lora'b. - """ - - expand_slice_fun: Callable = (self._expand_slice_prefill - if self.is_prefill else - self._expand_slice_decode) - expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_inputs) - - def _apply_shrink(self, y, x, w_t_all, scale): - """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the - GEMM of lora'a. - When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `_shrink_prefill` function should be called. 
- Otherwise, it is the decode stage, and the _shrink_decode function - should be called. - """ - y_org = y - y = y.view(-1, y.shape[-1]) - shrink_fun: Callable = (self._shrink_prefill - if self.is_prefill else self._shrink_decode) - shrink_fun(y, x, w_t_all, scale) - y.view_as(y_org) - - def add_shrink(self, y, x, lora_a_stacked, scale, **kwargs): - """ - Performs GEMM for multiple slices of lora_a. - When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `_shrink_prefill` function should be called. - Otherwise, it is the decode stage, and the _shrink_decode function - should be called. - - Semantics: - for i in range(len(lora_a_stacked)): - y[i] += (x @ lora_a_stacked[i]) * scale - - Args: - y (Union[Tuple[ms.Tensor, ...], ms.Tensor]): Output tensors - x (ms.Tensor): Input tensor - lora_a_stacked (Tuple[ms.Tensor, ...]): lora_a's weights - scale (float): Scaling factor for the operation - """ - - x = x.view(-1, x.shape[-1]) - # TODO fuse these kernels - for slice_idx in range(len(lora_a_stacked)): - self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], - scale) - - def add_expand(self, - y, - x, - lora_b_stacked, - lora_bias_stacked, - output_slices, - offset_start=0, - add_inputs=True, - **kwargs) -> None: - """ - Performs GEMM and bias addition for multiple slices of lora_b. - - Semantics: - for i in range(len(lora_b_stacked)): - slice = output_slices[i] - y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + - lora_bias_stacked[i] - offset += slice - - Args: - y (ms.Tensor): Output tensor. - x (Union[Tuple[ms.Tensor, ...], ms.Tensor]): Input tensors - lora_b_stacked (Tuple[ms.Tensor, ...]): lora_b's weight - lora_bias_stacked (Optional[Tuple[ms.Tensor, ...]]): - bias's weight - output_slices (Tuple[int, ...]): Every slice's size - add_inputs (bool): Defaults to True. 
- """ - y_org = y - y = y.view(-1, y.shape[-1]) - offset_left = offset_start - if lora_bias_stacked is not None: - self._apply_bias(self.token_lora_indices, y, output_slices, - lora_bias_stacked) - for slice_idx in range(len(lora_b_stacked)): - self._apply_expand( - y, - x[slice_idx], - lora_b_stacked[slice_idx], - offset_left, - output_slices[slice_idx], - add_inputs=add_inputs, - ) - offset_left += output_slices[slice_idx] - y.view_as(y_org) + self.max_loras = kwargs["max_loras"] + self.group_list = Parameter(initializer("ones", self.max_loras, dtype.int64), name="group_list") + self.lora_indices = Parameter(initializer("ones", self.max_loras, dtype.int64), name="lora_indices") + + def sgmv_shrink(self, + inputs, + lora_a_weights, + group_list, + scaling, + ): + outputs = grouped_matmul_v4([inputs], [lora_a_weights], + group_list=group_list, + split_item=3, + group_type=0, + group_list_type=1)[0] + return outputs * scaling + + def bgmv_shrink(self, + inputs, + lora_a_weights, + lora_indices_tensor, + scaling=1.0): + selected_loras = lora_a_weights[lora_indices_tensor] + inputs = inputs.astype(lora_a_weights[0].dtype) + selected_loras = selected_loras.squeeze(1) + outputs = einsum_ms(inputs, selected_loras) + return scaling * outputs + + def sgmv_expand_slice(self, + inputs, + lora_b_weights, + group_list + ): + outputs = grouped_matmul_v4([inputs], [lora_b_weights], + group_list=group_list, + split_item=3, + group_type=0, + group_list_type=1)[0] + return outputs + + def bgmv_expand_slice(self, + inputs, + lora_b_weights, + lora_indices_tensor): + selected_loras = lora_b_weights[lora_indices_tensor] + inputs = inputs.astype(lora_b_weights[0].dtype) + selected_loras = selected_loras.squeeze(1) + outputs = einsum_ms(inputs, selected_loras) + return outputs + + def update_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: list[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, + **kwargs): + self._update_base_metadata(mapping, lora_index_to_id, max_loras, + vocab_size, extra_vocab_size, + long_lora_context) + if mapping.is_prefill: + # Update metadata required for prefill-related operators. + self._update_prefill_metadata(self.token_lora_indices) + self.is_prefill = True + else: + self.is_prefill = False + _, seq_len, lora_indices, _, _, _ = self.prefill_metadata + sorted_ids, sorted_counts = sort_lora_by_token_count( + lora_indices, seq_len) + group_list = sorted_counts + if len(group_list) < self.max_loras: + new_tensor = mint.zeros(self.max_loras, dtype=group_list.dtype) + new_tensor[:group_list.size(0)] = group_list + group_list = new_tensor + self.group_list.set_data(group_list.astype(dtype.int64)) def add_lora_embedding(self, y, @@ -247,7 +142,7 @@ class PunicaWrapperNPU(PunicaWrapperBase): lora_b_stacked (ms.Tensor): lora_b's weights. add_inputs (bool): Default to True. """ - #No LoRA request, so return directly + # No LoRA request, so return directly if self.no_lora: return # Embedding layer only need expand op @@ -255,65 +150,6 @@ class PunicaWrapperNPU(PunicaWrapperBase): if self.is_prefill else self._expand_decode) expand_fun(y, x, lora_b_stacked, add_inputs) - def add_lora_linear(self, - y, - x, - lora_a_stacked, - lora_b_stacked, - lora_bias_stacked, - scale, - output_slices, - *, - buffer=None, - **kwargs) -> None: - """ - Applicable to linear-related lora. 
- - Semantics: - for i in range(len(lora_a_stacked)): - y[i] += ( - x[i].unsqueeze(0) - @ lora_a_stacked[indices[i], layer_idx, :, :] - @ lora_b_stacked[indices[i], layer_idx, :, :] - * scale - ).squeeze(0)+lora_bias_stacked[i] - - Args: - y (ms.Tensor): Output tensor. Will be changed in-place. - x (ms.Tensor): Input tensor - lora_a_stacked (Tuple[ms.Tensor, ...]): lora_a's weight. - lora_b_stacked (Tuple[ms.Tensor, ...]): lora_b's weight. - lora_bias_stacked (Optional[Tuple[ms.Tensor, ...]]): lora's bias. - scale (float): Scaling factor. - output_slices (Tuple[int, ...]): Every slice's size. - buffer (Optional[Tuple[ms.Tensor, ...]]): Defaults to None. - """ - #No LoRA request, so return directly - if self.no_lora: - return - x = x.reshape(-1, x.shape[-1]) - assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) - if lora_bias_stacked is not None: - assert len(lora_bias_stacked) == len(output_slices) - y = self._apply_bias(self.token_lora_indices, y, output_slices, - lora_bias_stacked) - - if buffer is None: - r = lora_b_stacked[0].shape[-1] - # We set the buffer to be float32 by default, consistent with the - # triton op - buffer = tuple( - mint.zeros((x.shape[0], r), dtype=mstype.float32) - for _ in range(len(output_slices))) - self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs) - self.add_expand(y, - buffer, - lora_b_stacked, - None, - output_slices, - add_inputs=True, - **kwargs) - def add_lora_logits(self, y, x, @@ -325,7 +161,7 @@ class PunicaWrapperNPU(PunicaWrapperBase): **kwargs) -> None: """ Applies lora specifically for LogitsProcessorWithLoRA. - + Semantics: buffer = (x @ lora_a_stacked) * scale y += buffer @ lora_b_stacked @@ -338,7 +174,7 @@ class PunicaWrapperNPU(PunicaWrapperBase): scale (float): Scaling factor. buffer (Optional[ms.Tensor]):Default to None. 
""" - #No LoRA request, so return directly + # No LoRA request, so return directly if self.no_lora: return y_org = y @@ -357,3 +193,30 @@ class PunicaWrapperNPU(PunicaWrapperBase): self.sampler_indices, add_inputs=True) y.view_as(y_org) + + def construct(self, + y, + x, + lora_a_stacked, + lora_b_stacked, + lora_bias_stacked, + scale, + **kwargs): + if self.no_lora: + return + x = x.reshape(-1, x.shape[-1]) + orign_shape = y.shape + y = y.reshape(-1, y.shape[-1]) + if lora_bias_stacked is not None: + selected_loras_bias = lora_bias_stacked[self.token_lora_indices] + y = ops.add(y, selected_loras_bias) + _, seq_len, lora_indices, _, _, _ = self.prefill_metadata + if get_model_context("is_prefill"): + outputs = self.sgmv_shrink(x, lora_a_stacked, self.group_list, scale) + outputs = self.sgmv_expand_slice(outputs, lora_a_stacked, lora_indices) + else: + outputs = self.bgmv_shrink(x, lora_b_stacked, self.group_list, scale) + outputs = self.bgmv_expand_slice(outputs, lora_a_stacked, lora_indices) + outputs = ops.add(y, outputs) + outputs = outputs.reshape(orign_shape) + return outputs diff --git a/vllm_mindspore/lora/utils.py b/vllm_mindspore/lora/utils.py index d9157467..53cc6d41 100644 --- a/vllm_mindspore/lora/utils.py +++ b/vllm_mindspore/lora/utils.py @@ -22,7 +22,7 @@ from vllm.lora.fully_sharded_layers import ( RowParallelLinearWithShardedLoRA) # yapf conflicts with isort for this block -# yapf: disable # noqa: ERA001 +# yapf: disable from vllm_mindspore.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, LinearScalingRotaryEmbeddingWithLoRA, @@ -32,8 +32,9 @@ from vllm_mindspore.lora.layers import (BaseLayerWithLoRA, QKVParallelLinearWithLoRA, RowParallelLinearWithLoRA, VocabParallelEmbeddingWithLoRA) +from vllm_mindspore.model_executor.layers.quantization.sparse_quant_modelslim import A8W8SCLinearMethod -# yapf: enable # noqa: ERA001 +# yapf: enable _all_lora_classes: set[type[BaseLayerWithLoRA]] = { VocabParallelEmbeddingWithLoRA, @@ -50,3 +51,22 @@ _all_lora_classes: set[type[BaseLayerWithLoRA]] = { RowParallelLinearWithShardedLoRA, LinearScalingRotaryEmbeddingWithLoRA, } + +def replace_submodule(model, module_name, new_module): + """Replace a submodule in a model with a new module.""" + parent = model.get_submodule(".".join(module_name.split(".")[:-1])) + target_name = module_name.split(".")[-1] + setattr(parent, target_name, new_module) + new_module.base_layer.weight.name = module_name + ".weight" + new_module.lora_a_stacked.name = module_name + ".lora_a_weight" + new_module.lora_b_stacked.name = module_name + ".lora_b_weight" + if new_module.base_layer.bias is not None: + new_module.base_layer.bias.name = module_name + ".bias" + #new_module.lora_bias_stacked.name = module_name + ".lora_bias" + if isinstance(new_module.base_layer.quant_method, A8W8SCLinearMethod): + new_module.base_layer.index.name = module_name + ".index" + new_module.base_layer.input_scale.name = module_name + ".input_scale" + new_module.base_layer.input_offset.name = module_name + ".input_offset" + new_module.base_layer.deq_scale.name = module_name + ".deq_scale" + new_module.base_layer.quant_bias.name = module_name + ".quant_bias" + return new_module diff --git a/vllm_mindspore/model_executor/model_loader/utils.py b/vllm_mindspore/model_executor/model_loader/utils.py index c8b6bc16..955e3f62 100644 --- a/vllm_mindspore/model_executor/model_loader/utils.py +++ b/vllm_mindspore/model_executor/model_loader/utils.py @@ -193,36 +193,7 @@ def get_ms_model_architecture( return model_cls, arch - -def 
convert_uint64_to_fp32(arr: np.ndarray): - arr_fp32 = arr.view(np.float32) - output = arr_fp32[:, :, 0::2] - return output - - -def np_int4data_pack_to_int8_3d(np_data): - np_data = np_data.astype(np.int8) - np_data &= 0x000F - np_data[::, ::, 0::2] <<= 0 - np_data[::, ::, 1::2] <<= 4 - np_int4_data = np_data[::, ::, 0::2] | np_data[::, ::, 1::2] - return np_int4_data - - -def unpack_int8_to_int4_3d(packed_data): - low_nibbles = (packed_data & 0x0F).astype(np.uint8) - high_nibbles = ((packed_data >> 4) & 0x0F).astype(np.uint8) - - unpacked = np.empty((*packed_data.shape[:2], packed_data.shape[2] * 2), - dtype=np.uint8) - unpacked[..., 0::2] = low_nibbles - unpacked[..., 1::2] = high_nibbles - - return unpacked - - -def process_weights_after_loading(model: nn.Module, model_config: ModelConfig, - target_device: torch.device) -> None: +def process_weights_after_loading(model, model_config) -> None: for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) if isinstance(quant_method, QuantizeMethodBase): -- Gitee From 0a287512604db05746c5c517134329bc88a58272 Mon Sep 17 00:00:00 2001 From: luolihao Date: Wed, 13 Aug 2025 14:55:00 +0800 Subject: [PATCH 09/14] fix bug and support lm_head to Nz --- .../distributed/communication_op.py | 29 ++++- .../model_executor/layers/logits_processor.py | 121 ++++++++++++++---- vllm_mindspore/model_executor/models/qwen2.py | 8 +- 3 files changed, 124 insertions(+), 34 deletions(-) diff --git a/vllm_mindspore/distributed/communication_op.py b/vllm_mindspore/distributed/communication_op.py index 475a282d..604703dd 100644 --- a/vllm_mindspore/distributed/communication_op.py +++ b/vllm_mindspore/distributed/communication_op.py @@ -24,9 +24,10 @@ Implement a unified communication interface for both graph and pynative mode. from typing import Any, Dict, Optional, Union import torch -from mindspore import nn, ops +from mindspore import Tensor, mint, nn, ops from vllm.distributed.parallel_state import ( - get_tensor_model_parallel_world_size, get_tp_group) + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, + get_tp_group) def cpu_broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, @@ -74,3 +75,27 @@ class AllGatherFromModelParallelRegion(nn.Cell): output = self.all_gather_into_tensor(input_) output = ops.swapaxes(output, 0, -1) return output + + +class GatherFromModelParallelRegion(nn.Cell): + "Gather the input from model parallel region and concatenate." + + def __init__(self): + super().__init__() + self.world_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + if self.world_size > 1: + self.tp_group = get_tp_group().device_group._name + + def construct(self, + input_: Tensor, + dst: int = 0, + dim: int = -1) -> Optional[Tensor]: + # Size and dimension. 
+ if self.world_size == 1: + return input_ + output = ops.CollectiveGather(dest_rank=dst, + group=self.tp_group)(mint.transpose(input_, 0, dim)) + if self.tp_rank != dst: + return None + return mint.transpose(output, 0, dim) \ No newline at end of file diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index ee8c8edc..6910804a 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -23,12 +23,14 @@ from concurrent.futures import ThreadPoolExecutor from typing import Optional import vllm.envs as envs -from mindspore import Tensor, mint, nn -from vllm.config import current_platform +from mindspore import Tensor, jit, mint, nn +from vllm.config import current_platform, get_current_vllm_config from vllm.distributed import (tensor_model_parallel_all_gather, tensor_model_parallel_gather) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm_mindspore.distributed.communication_op import ( + AllGatherFromModelParallelRegion, GatherFromModelParallelRegion) from vllm_mindspore.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) @@ -60,6 +62,9 @@ class LogitsProcessor(nn.Cell): scale: A scaling factor to apply to the logits. """ super().__init__() + vllm_config = get_current_vllm_config() + self.vllm_config = vllm_config + self.is_graph_mode = bool(not vllm_config.model_config.enforce_eager) self.scale = scale self.vocab_size = vocab_size # Whether the input is logits (default is hidden states). @@ -71,25 +76,101 @@ class LogitsProcessor(nn.Cell): # Whether to use gather or all-gather to gather the logits. self.use_all_gather = current_platform.use_all_gather() + if self.use_all_gather: + self.tensor_model_parallel_all_gather = AllGatherFromModelParallelRegion() + else: + self.tensor_model_parallel_gather = GatherFromModelParallelRegion() + self.lm_head = None + self.run_model = None + self.cached_input_info = {} + + def set_dynamic_inputs(self): + dyn_hidden_states = Tensor(shape=[None, None], + dtype=self.vllm_config.model_config.dtype) + + if self.cached_input_info["indices"] is None: + dyn_indices = None + else: + dyn_indices_shape = [ + None for _ in range(self.cached_input_info["indices"]["ndim"]) + ] + dyn_indices_dtype = self.cached_input_info["indices"]["dtype"] + dyn_indices = Tensor(shape=dyn_indices_shape, + dtype=dyn_indices_dtype) + + if self.cached_input_info["bias"] is None: + dyn_bias = None + else: + dyn_bias_shape = [ + None for _ in range(self.cached_input_info["bias"]["ndim"]) + ] + dyn_bias_dtype = self.cached_input_info["bias"]["dtype"] + dyn_bias = Tensor(shape=dyn_bias_shape, dtype=dyn_bias_dtype) + + self.set_inputs(dyn_hidden_states, dyn_indices, dyn_bias) + + def __call__( + self, + lm_head: VocabParallelEmbedding, + hidden_states: Tensor, + sampling_metadata: Optional[SamplingMetadata] = None, + embedding_bias: Optional[Tensor] = None, + ) -> Optional[Tensor]: + if self.lm_head is None: + self.lm_head = lm_head + if self.run_model is None: + self.run_model = jit( + function=self.construct, + jit_level='O0') if self.is_graph_mode else self.construct + selected_token_indices = None + if sampling_metadata is not None: + selected_token_indices = sampling_metadata.selected_token_indices + dyn_indices_info = None if selected_token_indices is None else { + "ndim": selected_token_indices.ndim, + "dtype": selected_token_indices.dtype, + } + dyn_bias_info = None if 
embedding_bias is None else { + "ndim": embedding_bias.ndim, + "dtype": embedding_bias.dtype, + } + if self.cached_input_info != {"indices": dyn_indices_info, + "bias": dyn_bias_info}: + self.cached_input_info = { + "indices": dyn_indices_info, + "bias": dyn_bias_info, + } + self.set_dynamic_inputs() + + logits = self.run_model( + hidden_states, + selected_token_indices, + embedding_bias + ) + + if sampling_metadata is not None and \ + sampling_metadata.seq_groups is not None: + logits = _apply_logits_processors(logits, sampling_metadata) + + return logits + def construct( self, - lm_head: VocabParallelEmbedding, hidden_states: Tensor, - sampling_metadata: Optional[SamplingMetadata] = None, + selected_token_indices: Optional[Tensor] = None, embedding_bias: Optional[Tensor] = None, ) -> Optional[Tensor]: if self.logits_as_input: logits = hidden_states else: - if sampling_metadata is not None: - if sampling_metadata.selected_token_indices.numel() <= 0: - return mint.zeros((0, self.vocab_size), - dtype=hidden_states.dtype) - hidden_states = _prune_hidden_states(hidden_states, - sampling_metadata) + if selected_token_indices is not None: + if selected_token_indices.numel() <= 0: + return mint.zeros((0, self.vocab_size), dtype=hidden_states.dtype) + hidden_states = mint.index_select( + hidden_states, 0, selected_token_indices) # Get the logits for the next tokens. - logits = self._get_logits(hidden_states, lm_head, embedding_bias) + logits = self._get_logits( + hidden_states, self.lm_head, embedding_bias) if logits is not None: if self.soft_cap is not None: logits = logits / self.soft_cap @@ -100,9 +181,6 @@ class LogitsProcessor(nn.Cell): logits *= self.scale # Apply logits processors (if any). - if sampling_metadata is not None and \ - sampling_metadata.seq_groups is not None: - logits = _apply_logits_processors(logits, sampling_metadata) return logits @@ -118,10 +196,10 @@ class LogitsProcessor(nn.Cell): bias=embedding_bias) if self.use_all_gather: # Gather is not supported for some devices such as NPUs. - logits = tensor_model_parallel_all_gather(logits) + logits = self.tensor_model_parallel_all_gather(logits) else: # None may be returned for rank > 0 - logits = tensor_model_parallel_gather(logits) + logits = self.tensor_model_parallel_gather(logits) # Remove paddings in vocab (if any). 
if logits is not None: logits = logits[..., :self.org_vocab_size] @@ -134,17 +212,6 @@ class LogitsProcessor(nn.Cell): return s -def _prune_hidden_states( - hidden_states: Tensor, - sampling_metadata: SamplingMetadata, -) -> Tensor: - indices = sampling_metadata.selected_token_indices - if indices is not None and indices.numel() > 0: - return mint.index_select(hidden_states, 0, - sampling_metadata.selected_token_indices) - return hidden_states - - def _apply_logits_processors( logits: Tensor, sampling_metadata: SamplingMetadata, diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index eff9be31..da4a2238 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -379,6 +379,7 @@ class Qwen2Model(nn.Cell): continue param = params_dict[name] + loaded_weight = ms.Tensor(loaded_weight[:], dtype=param.dtype) param.set_data(loaded_weight.contiguous()) def adjust_weight(params_dict): @@ -390,7 +391,7 @@ class Qwen2Model(nn.Cell): "o_proj.weight", "gate_up_proj.weight", "down_proj.weight", - # "lm_head.weight", + "lm_head.weight", ] rank_id = get_rank() @@ -460,9 +461,6 @@ class Qwen2Model(nn.Cell): param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - # Norm type in weights may be f32 - if(loaded_weight.dtype != param.dtype): - loaded_weight = loaded_weight.to(dtype=param.dtype) weight_loader(param, loaded_weight) loaded_params.add(name) @@ -475,7 +473,7 @@ class Qwen2Model(nn.Cell): "o_proj.weight", "gate_up_proj.weight", "down_proj.weight", - # "lm_head.weight", + "lm_head.weight", ] for name, param in params_dict.items(): -- Gitee From 3414ad00c1564f6dd1b0a56dfa29795e430641d2 Mon Sep 17 00:00:00 2001 From: huangzhuo Date: Sat, 23 Aug 2025 10:00:32 +0800 Subject: [PATCH 10/14] fix conflict --- vllm_mindspore/__init__.py | 3 +- vllm_mindspore/lora/layers.py | 91 ++++++++++--------- .../lora/punica_wrapper/punica_npu.py | 13 +-- .../model_executor/model_loader/utils.py | 2 +- .../model_executor/models/model_base.py | 5 +- vllm_mindspore/v1/worker/gpu_model_runner.py | 10 +- 6 files changed, 60 insertions(+), 64 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 7073717d..1f404e28 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -125,10 +125,11 @@ vllm.utils.memory_profiling = ms_memory_profiling import vllm.lora.utils from vllm_mindspore.model_executor.layers.linear import LinearBase -from vllm_mindspore.lora.utils import _all_lora_classes +from vllm_mindspore.lora.utils import _all_lora_classes, replace_submodule vllm.lora.utils._all_lora_classes = _all_lora_classes vllm.lora.utils.LinearBase = LinearBase +vllm.lora.utils.replace_submodule = replace_submodule import vllm.lora.models from vllm_mindspore.lora.models import ( diff --git a/vllm_mindspore/lora/layers.py b/vllm_mindspore/lora/layers.py index 7d565138..f5749280 100644 --- a/vllm_mindspore/lora/layers.py +++ b/vllm_mindspore/lora/layers.py @@ -362,37 +362,11 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): lora_b = self.slice_lora_b(lora_b) if lora_bias is not None: lora_bias = self.slice_bias(lora_bias) - if self.n_slices == 3: - self.lora_a_stacked[index, :, : lora_a[0].shape[1]] = lora_a[0] - self.lora_a_stacked[index, :, lora_a[0].shape[1] : lora_a[0].shape[1] + lora_a[1].shape[1]] = lora_a[1] - self.lora_a_stacked[index, :, lora_a[0].shape[1] + lora_a[1].shape[1] :] = lora_a[2] - self.lora_b_stacked[index, 
:lora_b[0].shape[0], :lora_b[0].shape[1]] = lora_b[0] - self.lora_b_stacked[index, lora_b[0].shape[0] : lora_b[0].shape[0] + lora_b[1].shape[0], lora_b[0].shape[1] : lora_b[0].shape[1] + lora_b[1].shape[1]] = lora_b[1] - self.lora_b_stacked[index, lora_b[0].shape[0] + lora_b[1].shape[0] :, lora_b[0].shape[1] + lora_b[1].shape[1] :] = lora_b[2] - if self.lora_bias_stacked is not None: - assert len(self.lora_bias_stacked) - lora_bias = ops.concat(lora_bias, axis=0) - self.lora_bias_stacked[index] = lora_bias - elif self.n_slices == 2: - self.lora_a_stacked[index, :, : lora_a[0].shape[1]] = lora_a[0] - self.lora_a_stacked[index, :, lora_a[0].shape[1]: lora_a[0].shape[1] + lora_a[1].shape[1]] = lora_a[1] - self.lora_b_stacked[index, :lora_b[0].shape[0], :lora_b[0].shape[1]] = lora_b[0] - self.lora_b_stacked[index, lora_b[0].shape[0]: lora_b[0].shape[0] + lora_b[1].shape[0], - lora_b[0].shape[1]: lora_b[0].shape[1] + lora_b[1].shape[1]] = lora_b[1] - if self.lora_bias_stacked is not None: - assert len(self.lora_bias_stacked) - lora_bias = ops.concat(lora_bias, axis=0) - self.lora_bias_stacked[index] = lora_bias - if self.lora_bias_stacked is not None: - assert len(self.lora_bias_stacked) - lora_bias = ops.concat(lora_bias, axis=0) - self.lora_bias_stacked[index] = lora_bias - else: - self.lora_a_stacked[index] = lora_a - self.lora_b_stacked[index] = lora_b - if self.lora_bias_stacked is not None: - assert len(self.lora_bias_stacked) - self.lora_bias_stacked[index] = lora_bias + self.lora_a_stacked[index, :, :lora_a.shape[1]] = lora_a + self.lora_b_stacked[index, :lora_b.shape[0], :] = lora_b + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + self.lora_bias_stacked[index] = lora_bias def apply(self, x: ms.Tensor, @@ -594,20 +568,16 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): lora_b = self.slice_lora_b(lora_b) if lora_bias is not None: lora_bias = self.slice_bias(lora_bias) - lora_weight_list = [] - for i in range(self.n_slices): - if (lora_a_i := lora_a[i]) is not None and (lora_b_i := lora_b[i]) is not None: - lora_weight_list.append(ops.matmul(lora_a_i.transpose(1, 0), lora_b_i.transpose(1, 0))) - lora_weight = ops.cat(tuple(lora_weight_list), axis=-1) - self.lora_weight[index] = lora_weight - - lora_bias_list = [] - if lora_bias is not None: - for i in range(self.n_slices): - if (lora_bias_i := lora_bias[i]) is not None: - lora_bias_list.appendd(lora_bias_i) - lora_bias = ops.cat(tuple(lora_weight_list), axis=0) - self.lora_bias_stacked[index] = lora_bias + self.lora_a_stacked[index, :, : lora_a[0].shape[1]] = lora_a[0] + self.lora_a_stacked[index, :, self.lora_config.max_lora_rank: self.lora_config.max_lora_rank + lora_a[1].shape[1]] = lora_a[1] + self.lora_b_stacked[index, :lora_b[0].shape[0], :lora_b[0].shape[1]] = lora_b[0] + self.lora_b_stacked[index, self.lora_config.max_lora_rank: self.lora_config.max_lora_rank + lora_b[1].shape[0], + lora_b[0].shape[1]: lora_b[0].shape[1] + lora_b[1].shape[1]] = lora_b[1] + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + lora_bias = ops.concat(lora_bias, axis=0) + self.lora_bias_stacked[index] = lora_bias + @classmethod @_not_fully_sharded_can_replace @@ -726,6 +696,37 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA): self.kv_shard_id, ) + def set_lora( + self, + index: int, + lora_a: ms.Tensor, + lora_b: ms.Tensor, + embeddings_tensor: Optional[ms.Tensor], + lora_bias: Optional[ms.Tensor] = None, + ): + self.reset_lora(index) + + if 
self.tp_size > 1: + lora_a = self.slice_lora_a(lora_a) + lora_b = self.slice_lora_b(lora_b) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) + self.lora_a_stacked[index, :, : lora_a[0].shape[1]] = lora_a[0] + self.lora_a_stacked[index, :, + self.lora_config.max_lora_rank: self.lora_config.max_lora_rank + lora_a[1].shape[1]] = lora_a[1] + self.lora_a_stacked[index, :, + self.lora_config.max_lora_rank * 2: self.lora_config.max_lora_rank * 2 + lora_a[2].shape[1]] = lora_a[2] + self.lora_b_stacked[index, :lora_b[0].shape[0], :lora_b[0].shape[1]] = lora_b[0] + self.lora_b_stacked[index, self.lora_config.max_lora_rank: self.lora_config.max_lora_rank + lora_b[1].shape[0], + lora_b[0].shape[1]: lora_b[0].shape[1] + lora_b[1].shape[1]] = lora_b[1] + self.lora_b_stacked[index, + self.lora_config.max_lora_rank * 2: self.lora_config.max_lora_rank * 2 + lora_b[2].shape[0], + lora_b[0].shape[1] + lora_b[1].shape[1]:] = lora_b[2] + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + lora_bias = ops.concat(lora_bias, axis=0) + self.lora_bias_stacked[index] = lora_bias + @classmethod @_not_fully_sharded_can_replace def can_replace_layer( diff --git a/vllm_mindspore/lora/punica_wrapper/punica_npu.py b/vllm_mindspore/lora/punica_wrapper/punica_npu.py index d9295f66..d5a1394c 100644 --- a/vllm_mindspore/lora/punica_wrapper/punica_npu.py +++ b/vllm_mindspore/lora/punica_wrapper/punica_npu.py @@ -70,8 +70,7 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): scaling=1.0): selected_loras = lora_a_weights[lora_indices_tensor] inputs = inputs.astype(lora_a_weights[0].dtype) - selected_loras = selected_loras.squeeze(1) - outputs = einsum_ms(inputs, selected_loras) + outputs = ops.matmul(inputs.unsqueeze(1), selected_loras).squeeze(1) return scaling * outputs def sgmv_expand_slice(self, @@ -92,8 +91,7 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): lora_indices_tensor): selected_loras = lora_b_weights[lora_indices_tensor] inputs = inputs.astype(lora_b_weights[0].dtype) - selected_loras = selected_loras.squeeze(1) - outputs = einsum_ms(inputs, selected_loras) + outputs = ops.matmul(inputs.unsqueeze(1), selected_loras).squeeze(1) return outputs def update_metadata( @@ -210,13 +208,12 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): if lora_bias_stacked is not None: selected_loras_bias = lora_bias_stacked[self.token_lora_indices] y = ops.add(y, selected_loras_bias) - _, seq_len, lora_indices, _, _, _ = self.prefill_metadata if get_model_context("is_prefill"): outputs = self.sgmv_shrink(x, lora_a_stacked, self.group_list, scale) - outputs = self.sgmv_expand_slice(outputs, lora_a_stacked, lora_indices) + outputs = self.sgmv_expand_slice(outputs, lora_b_stacked, self.group_list) else: - outputs = self.bgmv_shrink(x, lora_b_stacked, self.group_list, scale) - outputs = self.bgmv_expand_slice(outputs, lora_a_stacked, lora_indices) + outputs = self.bgmv_shrink(x, lora_a_stacked, self.token_lora_indices, scale) + outputs = self.bgmv_expand_slice(outputs, lora_b_stacked, self.token_lora_indices) outputs = ops.add(y, outputs) outputs = outputs.reshape(orign_shape) return outputs diff --git a/vllm_mindspore/model_executor/model_loader/utils.py b/vllm_mindspore/model_executor/model_loader/utils.py index 955e3f62..0ace6cea 100644 --- a/vllm_mindspore/model_executor/model_loader/utils.py +++ b/vllm_mindspore/model_executor/model_loader/utils.py @@ -193,7 +193,7 @@ def get_ms_model_architecture( return model_cls, arch -def process_weights_after_loading(model, model_config) 
-> None: +def process_weights_after_loading(model, model_config, target_device) -> None: for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) if isinstance(quant_method, QuantizeMethodBase): diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index dd28a8c6..78f24d62 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -160,6 +160,7 @@ class MsModelBase: config = vllm_config.model_config.hf_config lora_config = vllm_config.lora_config + self.vllm_config = vllm_config self.config = config self.model_config = vllm_config.model_config self.lora_config = lora_config @@ -412,9 +413,9 @@ class NativeModel(MsModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__(vllm_config=vllm_config, prefix=prefix) self.quant_config = vllm_config.quant_config - if vllm_config.lora_config is not None: + #if vllm_config.lora_config is not None: # native model lora only support pynative mode now - vllm_config.model_config.enforce_eager = True + # vllm_config.model_config.enforce_eager = True self.is_eager_mode = vllm_config.model_config.enforce_eager self.prefill_graph = None self.decode_graph = None diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index 56a8a120..e5ffef5e 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -44,7 +44,6 @@ from vllm_mindspore.utils import (create_kv_cache, get_dtype_size, get_valid_dtype, is_310p, FORMAT_TYPE) from vllm_mindspore.v1.kv_cache_interface import MLAQuantFullAttentionSpec - logger = init_logger(__name__) @@ -402,11 +401,11 @@ def _reshape_kv_cache_tensors( Reshape the KV cache tensors to the desired shape and dtype. Args: - kv_cache_config: The KV cache config - kv_cache_raw_tensors: The KV cache buffer of each layer, with + kv_cache_config: The KV cache config + kv_cache_raw_tensors: The KV cache buffer of each layer, with correct size but uninitialized shape. Returns: - Dict[str, Tensor]: A map between layer names to their + Dict[str, Tensor]: A map between layer names to their corresponding memory buffer for KV cache. """ # Determine whether deepseek use mla op @@ -444,9 +443,6 @@ def _reshape_kv_cache_tensors( kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) - if is_310p(): - *dims, second_last, last = kv_cache_shape - kv_cache_shape = (*dims, second_last * last) try: kv_cache_stride_order = self.attn_backends[ i].get_kv_cache_stride_order() -- Gitee From 5d72f0328d44b617c9a2074f91bf45307a2a086a Mon Sep 17 00:00:00 2001 From: zhanzhan1 Date: Tue, 5 Aug 2025 19:40:26 +0800 Subject: [PATCH 11/14] lm_head support jit --- .../distributed/communication_op.py | 4 +- .../model_executor/layers/logits_processor.py | 47 ++++++++----------- 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/vllm_mindspore/distributed/communication_op.py b/vllm_mindspore/distributed/communication_op.py index 604703dd..86a0d797 100644 --- a/vllm_mindspore/distributed/communication_op.py +++ b/vllm_mindspore/distributed/communication_op.py @@ -23,7 +23,6 @@ Implement a unified communication interface for both graph and pynative mode. 
from typing import Any, Dict, Optional, Union import torch - from mindspore import Tensor, mint, nn, ops from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -98,4 +97,5 @@ class GatherFromModelParallelRegion(nn.Cell): group=self.tp_group)(mint.transpose(input_, 0, dim)) if self.tp_rank != dst: return None - return mint.transpose(output, 0, dim) \ No newline at end of file + return mint.transpose(output, 0, dim) + diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index 6910804a..1b1770cc 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -75,7 +75,7 @@ class LogitsProcessor(nn.Cell): self.soft_cap = soft_cap # Whether to use gather or all-gather to gather the logits. self.use_all_gather = current_platform.use_all_gather() - + if self.use_all_gather: self.tensor_model_parallel_all_gather = AllGatherFromModelParallelRegion() else: @@ -85,36 +85,28 @@ class LogitsProcessor(nn.Cell): self.cached_input_info = {} def set_dynamic_inputs(self): - dyn_hidden_states = Tensor(shape=[None, None], - dtype=self.vllm_config.model_config.dtype) - - if self.cached_input_info["indices"] is None: - dyn_indices = None - else: - dyn_indices_shape = [ - None for _ in range(self.cached_input_info["indices"]["ndim"]) - ] - dyn_indices_dtype = self.cached_input_info["indices"]["dtype"] - dyn_indices = Tensor(shape=dyn_indices_shape, - dtype=dyn_indices_dtype) - - if self.cached_input_info["bias"] is None: - dyn_bias = None - else: - dyn_bias_shape = [ - None for _ in range(self.cached_input_info["bias"]["ndim"]) - ] - dyn_bias_dtype = self.cached_input_info["bias"]["dtype"] - dyn_bias = Tensor(shape=dyn_bias_shape, dtype=dyn_bias_dtype) + dyn_hidden_states = Tensor( + shape=[None, None], dtype=self.vllm_config.model_config.dtype) + + dyn_indices_shape = [None for _ in range( + self.cached_input_info["indices"]["ndim"])] + dyn_indices_dtype = self.cached_input_info["indices"]["dtype"] + dyn_indices = None if self.cached_input_info["indices"] is None else \ + Tensor(shape=dyn_indices_shape, dtype=dyn_indices_dtype) + + dyn_bias_shape = [None for _ in range( + self.cached_input_info["bias"]["ndim"])] + dyn_bias_dtype = self.cached_input_info["bias"]["dtype"] + dyn_bias = None if self.cached_input_info["bias"] is None else \ + Tensor(shape=dyn_bias_shape, dtype=dyn_bias_dtype) self.set_inputs(dyn_hidden_states, dyn_indices, dyn_bias) def __call__( - self, - lm_head: VocabParallelEmbedding, - hidden_states: Tensor, - sampling_metadata: Optional[SamplingMetadata] = None, - embedding_bias: Optional[Tensor] = None, + self, + hidden_states: Tensor, + selected_token_indices: Optional[Tensor] = None, + embedding_bias: Optional[Tensor] = None, ) -> Optional[Tensor]: if self.lm_head is None: self.lm_head = lm_head @@ -181,7 +173,6 @@ class LogitsProcessor(nn.Cell): logits *= self.scale # Apply logits processors (if any). 
- return logits def _get_logits( -- Gitee From ff2500ef32eb3ae10b4605c0f42b1ef420342618 Mon Sep 17 00:00:00 2001 From: zhanzhan1 Date: Tue, 5 Aug 2025 20:14:33 +0800 Subject: [PATCH 12/14] bugfix --- .../model_executor/layers/logits_processor.py | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index 1b1770cc..27c19106 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -85,20 +85,27 @@ class LogitsProcessor(nn.Cell): self.cached_input_info = {} def set_dynamic_inputs(self): - dyn_hidden_states = Tensor( - shape=[None, None], dtype=self.vllm_config.model_config.dtype) - - dyn_indices_shape = [None for _ in range( - self.cached_input_info["indices"]["ndim"])] - dyn_indices_dtype = self.cached_input_info["indices"]["dtype"] - dyn_indices = None if self.cached_input_info["indices"] is None else \ - Tensor(shape=dyn_indices_shape, dtype=dyn_indices_dtype) + dyn_hidden_states = Tensor(shape=[None, None], + dtype=self.vllm_config.model_config.dtype) + + if self.cached_input_info["indices"] is None: + dyn_indices = None + else: + dyn_indices_shape = [ + None for _ in range(self.cached_input_info["indices"]["ndim"]) + ] + dyn_indices_dtype = self.cached_input_info["indices"]["dtype"] + dyn_indices = Tensor(shape=dyn_indices_shape, + dtype=dyn_indices_dtype) - dyn_bias_shape = [None for _ in range( - self.cached_input_info["bias"]["ndim"])] - dyn_bias_dtype = self.cached_input_info["bias"]["dtype"] - dyn_bias = None if self.cached_input_info["bias"] is None else \ - Tensor(shape=dyn_bias_shape, dtype=dyn_bias_dtype) + if self.cached_input_info["bias"] is None: + dyn_bias = None + else: + dyn_bias_shape = [ + None for _ in range(self.cached_input_info["bias"]["ndim"]) + ] + dyn_bias_dtype = self.cached_input_info["bias"]["dtype"] + dyn_bias = Tensor(shape=dyn_bias_shape, dtype=dyn_bias_dtype) self.set_inputs(dyn_hidden_states, dyn_indices, dyn_bias) -- Gitee From d4d6e4ae54426122160e0d8796759815ba2b6b74 Mon Sep 17 00:00:00 2001 From: huangzhuo Date: Mon, 25 Aug 2025 11:00:32 +0800 Subject: [PATCH 13/14] qwen2.5 enable V0 --- vllm_mindspore/__init__.py | 3 -- vllm_mindspore/config.py | 4 -- .../lora/punica_wrapper/punica_npu.py | 54 +++++++++---------- .../model_executor/layers/logits_processor.py | 39 +++++++------- .../quantization/smooth_quant_modelslim.py | 7 +-- .../model_executor/models/model_base.py | 4 +- 6 files changed, 53 insertions(+), 58 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 1f404e28..3dac7aca 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -300,7 +300,6 @@ from .config import ( _verify_quantization, _verify_args, vllm_config_post_init, - vllm_config_get_quantization_config, model_post_init, _get_and_verify_dtype, stateless_init_dp_group, @@ -309,8 +308,6 @@ from .config import ( vllm.config.ModelConfig._verify_quantization = _verify_quantization vllm.config.VllmConfig.__post_init__ = vllm_config_post_init -vllm.config.VllmConfig._get_quantization_config = staticmethod( - vllm_config_get_quantization_config) vllm.config.SchedulerConfig._verify_args = _verify_args vllm.config.CompilationConfig.model_post_init = model_post_init vllm.config._get_and_verify_dtype = _get_and_verify_dtype diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 5f7da8c5..c9c16690 100644 
--- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -46,10 +46,6 @@ def _verify_quantization(self) -> None: return -def vllm_config_get_quantization_config(model_config, load_config): - return None - - def vllm_config_post_init(self): """Verify configs are valid & consistent with each other.""" if self.model_config is not None: diff --git a/vllm_mindspore/lora/punica_wrapper/punica_npu.py b/vllm_mindspore/lora/punica_wrapper/punica_npu.py index d5a1394c..76e46a29 100644 --- a/vllm_mindspore/lora/punica_wrapper/punica_npu.py +++ b/vllm_mindspore/lora/punica_wrapper/punica_npu.py @@ -24,12 +24,12 @@ from typing import Callable, Optional from mindspore import mint, nn, Parameter, ops, dtype from mindspore.common import dtype as mstype from mindspore.common.initializer import initializer -from mindspore.ops.auto_generate import grouped_matmul_v4 +from mindspore.ops.auto_generate import grouped_matmul_v4, GroupedMatmul from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase - +import vllm.envs as envs from vllm_mindspore.lora.ops.torch_ops.lora_ops import ( bgmv_expand, bgmv_expand_slice, bgmv_shrink, sgmv_expand, - sgmv_expand_slice, sgmv_shrink, sort_lora_by_token_count, einsum_ms) + sgmv_expand_slice, sgmv_shrink, einsum_ms) from vllm_mindspore.model_executor.utils import get_model_context @@ -47,21 +47,22 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device) self.max_loras = kwargs["max_loras"] + self.gmm = GroupedMatmul(split_item=3, group_type=0) self.group_list = Parameter(initializer("ones", self.max_loras, dtype.int64), name="group_list") self.lora_indices = Parameter(initializer("ones", self.max_loras, dtype.int64), name="lora_indices") def sgmv_shrink(self, - inputs, - lora_a_weights, - group_list, - scaling, - ): + inputs, + lora_a_weights, + group_list, + scaling, + ): outputs = grouped_matmul_v4([inputs], [lora_a_weights], group_list=group_list, split_item=3, group_type=0, group_list_type=1)[0] - return outputs * scaling + return outputs def bgmv_shrink(self, inputs, @@ -71,7 +72,7 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): selected_loras = lora_a_weights[lora_indices_tensor] inputs = inputs.astype(lora_a_weights[0].dtype) outputs = ops.matmul(inputs.unsqueeze(1), selected_loras).squeeze(1) - return scaling * outputs + return outputs def sgmv_expand_slice(self, inputs, @@ -106,21 +107,16 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size, long_lora_context) + self._update_prefill_metadata(self.token_lora_indices) if mapping.is_prefill: # Update metadata required for prefill-related operators. 
- self._update_prefill_metadata(self.token_lora_indices) self.is_prefill = True else: self.is_prefill = False _, seq_len, lora_indices, _, _, _ = self.prefill_metadata - sorted_ids, sorted_counts = sort_lora_by_token_count( - lora_indices, seq_len) - group_list = sorted_counts - if len(group_list) < self.max_loras: - new_tensor = mint.zeros(self.max_loras, dtype=group_list.dtype) - new_tensor[:group_list.size(0)] = group_list - group_list = new_tensor - self.group_list.set_data(group_list.astype(dtype.int64)) + new_tensor = ops.zeros(self.max_loras, dtype=self.group_list.dtype) + new_tensor[lora_indices] = seq_len + self.group_list.set_data(new_tensor.astype(dtype.int64)) def add_lora_embedding(self, y, @@ -182,7 +178,7 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): if buffer is None: # We set the buffer to be float32 by default, consistent with the # triton op - buffer = mint.zeros((x.shape[0], r), dtype=mstype.float32) + buffer = ops.zeros((x.shape[0], r), dtype=mstype.float32) # LogitsProcessorWithLoRA always using bgmv. bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale) bgmv_expand(buffer, @@ -201,19 +197,23 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): scale, **kwargs): if self.no_lora: - return + return y x = x.reshape(-1, x.shape[-1]) orign_shape = y.shape y = y.reshape(-1, y.shape[-1]) if lora_bias_stacked is not None: selected_loras_bias = lora_bias_stacked[self.token_lora_indices] y = ops.add(y, selected_loras_bias) - if get_model_context("is_prefill"): - outputs = self.sgmv_shrink(x, lora_a_stacked, self.group_list, scale) - outputs = self.sgmv_expand_slice(outputs, lora_b_stacked, self.group_list) + if not envs.VLLM_USE_V1: + shrink_outputs = self.sgmv_shrink(x, lora_a_stacked, self.group_list, scale) + expand_outputs = self.sgmv_expand_slice(shrink_outputs, lora_b_stacked, self.group_list) else: - outputs = self.bgmv_shrink(x, lora_a_stacked, self.token_lora_indices, scale) - outputs = self.bgmv_expand_slice(outputs, lora_b_stacked, self.token_lora_indices) - outputs = ops.add(y, outputs) + if get_model_context("is_prefill"): + shrink_outputs = self.sgmv_shrink(x, lora_a_stacked, self.group_list, scale) + expand_outputs = self.sgmv_expand_slice(shrink_outputs, lora_b_stacked, self.group_list) + else: + shrink_outputs = self.bgmv_shrink(x, lora_a_stacked, self.token_lora_indices, scale) + expand_outputs = self.bgmv_expand_slice(shrink_outputs, lora_b_stacked, self.token_lora_indices) + outputs = ops.add(y, expand_outputs) outputs = outputs.reshape(orign_shape) return outputs diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index 27c19106..6f829079 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -76,10 +76,10 @@ class LogitsProcessor(nn.Cell): # Whether to use gather or all-gather to gather the logits. 
self.use_all_gather = current_platform.use_all_gather() - if self.use_all_gather: - self.tensor_model_parallel_all_gather = AllGatherFromModelParallelRegion() - else: - self.tensor_model_parallel_gather = GatherFromModelParallelRegion() + # if self.use_all_gather: + self.tensor_model_parallel_all_gather = AllGatherFromModelParallelRegion() + # else: + # self.tensor_model_parallel_gather = GatherFromModelParallelRegion() self.lm_head = None self.run_model = None self.cached_input_info = {} @@ -111,8 +111,9 @@ class LogitsProcessor(nn.Cell): def __call__( self, + lm_head: VocabParallelEmbedding, hidden_states: Tensor, - selected_token_indices: Optional[Tensor] = None, + sampling_metadata: Optional[SamplingMetadata] = None, embedding_bias: Optional[Tensor] = None, ) -> Optional[Tensor]: if self.lm_head is None: @@ -139,12 +140,14 @@ class LogitsProcessor(nn.Cell): "bias": dyn_bias_info, } self.set_dynamic_inputs() - - logits = self.run_model( - hidden_states, - selected_token_indices, - embedding_bias - ) + if selected_token_indices is not None and selected_token_indices.numel() <= 0: + logits = mint.zeros((0, self.vocab_size), dtype=hidden_states.dtype) + else: + logits = self.run_model( + hidden_states, + selected_token_indices, + embedding_bias + ) if sampling_metadata is not None and \ sampling_metadata.seq_groups is not None: @@ -157,13 +160,11 @@ class LogitsProcessor(nn.Cell): hidden_states: Tensor, selected_token_indices: Optional[Tensor] = None, embedding_bias: Optional[Tensor] = None, - ) -> Optional[Tensor]: + ) -> Optional[Tensor]: if self.logits_as_input: logits = hidden_states else: if selected_token_indices is not None: - if selected_token_indices.numel() <= 0: - return mint.zeros((0, self.vocab_size), dtype=hidden_states.dtype) hidden_states = mint.index_select( hidden_states, 0, selected_token_indices) @@ -192,12 +193,12 @@ class LogitsProcessor(nn.Cell): logits = lm_head.quant_method.apply(lm_head, hidden_states, bias=embedding_bias) - if self.use_all_gather: + # if self.use_all_gather: # Gather is not supported for some devices such as NPUs. - logits = self.tensor_model_parallel_all_gather(logits) - else: - # None may be returned for rank > 0 - logits = self.tensor_model_parallel_gather(logits) + logits = self.tensor_model_parallel_all_gather(logits) + # else: + # # None may be returned for rank > 0 + # logits = self.tensor_model_parallel_gather(logits) # Remove paddings in vocab (if any). 
if logits is not None: logits = logits[..., :self.org_vocab_size] diff --git a/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py b/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py index cad2f322..6003a667 100644 --- a/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py +++ b/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py @@ -18,6 +18,7 @@ from typing import Any, Optional import mindspore import numpy as np +import regex as re from mindspore import Parameter, Tensor, ops from mindspore.common.initializer import initializer from mindspore.ops.auto_generate import (DynamicQuantExt, GroupedMatmul, @@ -157,6 +158,7 @@ class A8W8LinearMethod(LinearMethodBase): def __init__(self, quant_config: SmoothQuantModelSlimConfig): self.quant_config = quant_config self.quant = QuantV2() + self.bias_add = ops.Add() def create_weights(self, layer: mindspore.nn.Cell, @@ -374,8 +376,6 @@ class A8W8LinearMethod(LinearMethodBase): else: qx = self.matmul(qx, weight, deq_scale, None, layer.quant_bias, None) - if bias is not None: - qx = self.bias_add(qx, bias) qx = qx.reshape(output_shape) return qx @@ -386,6 +386,7 @@ class A8W8DYNLinearMethod(LinearMethodBase): def __init__(self, quant_config: SmoothQuantModelSlimConfig): self.quant_config = quant_config self.quant = DynamicQuantExt() + self.bias_add = ops.Add() def create_weights(self, layer: mindspore.nn.Cell, @@ -508,6 +509,6 @@ class A8W8DYNLinearMethod(LinearMethodBase): else: qx = self.matmul(qx, weight, weight_scale, None, None, qx_scale) if bias is not None: - qx = mint.add(qx, bias) + qx = ops.add(qx, bias) qx = qx.reshape(output_shape) return qx diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 78f24d62..b149e94b 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -397,8 +397,8 @@ class MsModelBase: model_inputs = {} model_inputs["input_ids"] = input_ids model_inputs["batch_valid_length"] = ms.from_numpy(seq_lens_np) - model_inputs["block_tables"] = attn_metadata.block_tables - model_inputs["slot_mapping"] = attn_metadata.slot_mapping + model_inputs["block_tables"] = attn_metadata.block_tables + 0 + model_inputs["slot_mapping"] = attn_metadata.slot_mapping + 0 model_inputs["position_ids"] = position_ids model_inputs["q_seq_lens"] = q_seq_lens model_inputs["attention_mask"] = attention_mask -- Gitee From 0e7e072fd1f285b0be2fb823c6681163ca5dd0fd Mon Sep 17 00:00:00 2001 From: huangzhuo Date: Wed, 3 Sep 2025 15:11:20 +0800 Subject: [PATCH 14/14] v0 add new graph for lora --- vllm_mindspore/__init__.py | 2 + .../lora/punica_wrapper/punica_npu.py | 15 +- .../models/mf_models/mf_model_base.py | 217 +++++++++++ .../models/mf_models/weight_processor.py | 342 ------------------ .../model_executor/models/model_base.py | 82 ++++- vllm_mindspore/model_executor/utils.py | 3 +- vllm_mindspore/worker/model_runner.py | 14 +- vllm_mindspore/worker/worker.py | 32 +- 8 files changed, 339 insertions(+), 368 deletions(-) create mode 100644 vllm_mindspore/model_executor/models/mf_models/mf_model_base.py delete mode 100644 vllm_mindspore/model_executor/models/mf_models/weight_processor.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 3dac7aca..e23fe457 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -232,12 +232,14 @@ V0Worker.init_device = 
wrapper_worker_init_device(V0Worker.init_device) from vllm_mindspore.worker.model_runner import ( _get_cuda_graph_pad_size, _dummy_run, + profile_run, _get_supported_attention_backends, ) vllm.worker.model_runner.ModelInputForGPUBuilder._get_cuda_graph_pad_size = ( _get_cuda_graph_pad_size) vllm.worker.model_runner.GPUModelRunnerBase._dummy_run = _dummy_run +vllm.worker.model_runner.GPUModelRunnerBase.profile_run = profile_run import vllm.worker.multi_step_model_runner diff --git a/vllm_mindspore/lora/punica_wrapper/punica_npu.py b/vllm_mindspore/lora/punica_wrapper/punica_npu.py index 76e46a29..f0ad5642 100644 --- a/vllm_mindspore/lora/punica_wrapper/punica_npu.py +++ b/vllm_mindspore/lora/punica_wrapper/punica_npu.py @@ -30,7 +30,7 @@ import vllm.envs as envs from vllm_mindspore.lora.ops.torch_ops.lora_ops import ( bgmv_expand, bgmv_expand_slice, bgmv_shrink, sgmv_expand, sgmv_expand_slice, sgmv_shrink, einsum_ms) -from vllm_mindspore.model_executor.utils import get_model_context +from vllm_mindspore.model_executor.utils import get_model_context, set_model_context # The platforms that are compatible with the PyTorch-native implementation can @@ -52,11 +52,11 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): self.lora_indices = Parameter(initializer("ones", self.max_loras, dtype.int64), name="lora_indices") def sgmv_shrink(self, - inputs, - lora_a_weights, - group_list, - scaling, - ): + inputs, + lora_a_weights, + group_list, + scaling, + ): outputs = grouped_matmul_v4([inputs], [lora_a_weights], group_list=group_list, split_item=3, @@ -117,6 +117,7 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): new_tensor = ops.zeros(self.max_loras, dtype=self.group_list.dtype) new_tensor[lora_indices] = seq_len self.group_list.set_data(new_tensor.astype(dtype.int64)) + set_model_context("no_lora", self.no_lora) def add_lora_embedding(self, y, @@ -196,7 +197,7 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): lora_bias_stacked, scale, **kwargs): - if self.no_lora: + if get_model_context("no_lora"): return y x = x.reshape(-1, x.shape[-1]) orign_shape = y.shape diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py new file mode 100644 index 00000000..20969730 --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -0,0 +1,217 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Copyright 2025 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from abc import abstractmethod +from collections.abc import Iterable +from typing import Optional, Union + +import mindspore as ms +from mindformers.core.context import build_mf_context +from mindformers.core.parallel_config import build_parallel_config +from mindformers.tools.register.config import MindFormerConfig +from mindformers.tools.utils import is_pynative +from mindspore import Tensor, nn +from mindspore.common.api import _pynative_executor +from mindspore.communication import get_rank +from vllm.config import VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import get_dp_group +from vllm.forward_context import get_forward_context +from vllm.logger import init_logger +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from vllm_mindspore.model_executor.models.attention_mask import ( + LowerTriangularMask) +from vllm_mindspore.model_executor.models.model_base import MsModelBase +from vllm_mindspore.model_executor.models.utils import is_use_ringmla + +try: + # Need to apply dllm pd patch on vllm to use pd disagg related functions + from vllm.attention.layer import (maybe_save_kv_layer_to_connector, + wait_for_kv_layer_from_connector) + from vllm.distributed.kv_transfer import is_v1_kv_transfer_group + kv_transfer_supported = True +except: # noqa: E722 + kv_transfer_supported = False + +logger = init_logger(__name__) + + +class MfModelBase(MsModelBase): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__(vllm_config=vllm_config, prefix=prefix) + + model_config_path = os.getenv("MINDFORMERS_MODEL_CONFIG") + if model_config_path is None: + raise RuntimeError('For "MindFormers" model backend, environments ' + 'MINDFORMERS_MODEL_CONFIG should be set!') + + self.mf_config = MindFormerConfig(model_config_path) + self.rank_id = get_rank() + self.dp_size = get_dp_group() + + self.kv_transfer_config = vllm_config.kv_transfer_config + build_mf_context(self.mf_config) + build_parallel_config(self.mf_config) + self.mf_config.model.model_config.parallel_config = ( + self.mf_config.parallel_config) + self.mf_config.model.model_config.parallel_config.model_parallel = ( + get_tensor_model_parallel_world_size()) + self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 + self.use_ringmla = is_use_ringmla(vllm_config, self.mf_config) + self.is_chunked = False + self._generate_model_config() + if not hasattr(self, 'mf_model_config'): + raise RuntimeError('mf_model_config not initialized') + self.casual_mask = LowerTriangularMask( + dtype=self.mf_model_config.compute_dtype, + max_model_len=self.model_config.max_model_len) + self.network, self.lm_head = self._create_network() + + affinity_config = self.mf_config.get('context', + {}).get('affinity_cpu_list', {}) + if isinstance(affinity_config, dict): + ms.runtime.set_cpu_affinity(True, affinity_config) + + self._set_dynamic_inputs() + + @property + def ready_lm_head(self) -> nn.Cell: + if self.lm_head is None: + raise RuntimeError("lm_head not initialized") + return self.lm_head + + @abstractmethod + def _generate_model_config(self): + raise NotImplementedError( + "Function _generate_model_config should be Implemented!") + + @abstractmethod + def _create_network(self): + raise NotImplementedError( + "Function _create_network should be Implemented!") + + # DLLM + def is_decoder_task(self) -> bool: + if self.kv_transfer_config is None: + return False + + 
return self.kv_transfer_config.is_kv_consumer + + # DLLM + def is_prefill_task(self) -> bool: + if self.kv_transfer_config is None: + return False + + return self.kv_transfer_config.is_kv_producer + + def _set_dynamic_inputs(self): + self.network.set_dynamic_inputs() + if not hasattr(self, 'mf_model_config'): + raise RuntimeError('mf_model_config not initialized') + dynamic_hidden_states = Tensor( + shape=[None, None], dtype=self.mf_model_config.compute_dtype) + self.ready_lm_head.set_inputs(dynamic_hidden_states) + + def prepare_inputs(self, input_ids, positions): + return self.prepare_base_inputs(input_ids, positions) + + def update_model_inputs(self, model_inputs, **kwargs): + return model_inputs + + # DLLM + def connector_send_kvcache(self): + logger.debug("reached connector_send_kvcache") + _pynative_executor.sync() + forward_context = get_forward_context() + if not hasattr(self, 'mf_model_config'): + raise RuntimeError('mf_model_config not initialized') + for i in range(self.mf_model_config.num_layers): + kv_cache = self.kv_caches[i] + k_cache = kv_cache.kv_cache[forward_context.virtual_engine][0] + v_cache = kv_cache.kv_cache[forward_context.virtual_engine][1] + maybe_save_kv_layer_to_connector(str(i), (k_cache, v_cache)) + + # DLLM + def connector_wait_for_kv_layer(self): + logger.debug("reached connector_wait_for_kv_layer") + if not hasattr(self, 'mf_model_config'): + raise RuntimeError('mf_model_config not initialized') + for i in range(self.mf_model_config.num_layers): + wait_for_kv_layer_from_connector("key." + str(i)) + + def forward(self, + input_ids: Tensor, + positions: Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[Tensor] = None, + **kwargs) -> Union[Tensor, IntermediateTensors]: + model_inputs, is_prefill = self.prepare_inputs(input_ids, positions) + model_inputs = self.update_model_inputs(model_inputs, **kwargs) + + if is_prefill: + self.network.phase = "prefill" + if not self.set_flags or is_pynative(): + self.network.add_flags_custom(is_first_iteration=True) + hidden_states = self.network(**model_inputs) + self.network.phase = "increment" + if not self.set_flags or is_pynative(): + self.network.add_flags_custom(is_first_iteration=False) + self.set_flags = True + if kv_transfer_supported and is_v1_kv_transfer_group(): + self.connector_send_kvcache() + # DLLM + else: + if kv_transfer_supported: + if is_v1_kv_transfer_group() and self.is_prefill_task(): + self.connector_send_kvcache() + + if is_v1_kv_transfer_group() and self.is_decoder_task(): + self.connector_wait_for_kv_layer() + logger.debug("connector_wait_for_kv_layer success") + hidden_states = self.network(**model_inputs) + + return hidden_states + + def compute_logits( + self, + hidden_states: Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[Tensor]: + if sampling_metadata is not None: + selected_token_indices = sampling_metadata.selected_token_indices + if (selected_token_indices is not None + and selected_token_indices.numel() <= 0): + if not hasattr(self, 'mf_model_config'): + raise RuntimeError('mf_model_config not initialized') + logits = ms.mint.zeros( + (0, self.mf_model_config.vocab_size), + dtype=self.mf_model_config.compute_dtype) + else: + hidden_states = hidden_states.index_select( + 0, selected_token_indices) + logits = self.ready_lm_head(hidden_states) + logits = logits.view(-1, logits.shape[-1]) + else: + logits = self.ready_lm_head(hidden_states) + logits = logits.view(-1, logits.shape[-1]) + return logits + + def load_weights(self, 
                      weights: Iterable[tuple[str, Tensor]]) -> set[str]:
+        raise NotImplementedError("load_weight not implemented.")
diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py
deleted file mode 100644
index 5036323c..00000000
--- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py
+++ /dev/null
@@ -1,342 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# Copyright 2025 Huawei Technologies Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-transform huggingface safetensor.
-"""
-
-import os
-from enum import Enum
-
-from mindformers.parallel_core.inference import parallel_state as ps
-from mindspore.communication.management import get_group_size, get_rank
-from safetensors import safe_open
-from vllm_mindspore.utils import is_310p
-
-class EPMethod(Enum):
-    """
-    EP method enums
-    """
-    DEFAULT = 'default'
-    ALLTOALL = 'alltoall'
-    ALLGATHER = 'allgather'
-
-
-class BaseWeightProcessor:
-    r"""
-    Provide model weight load and shards.
-    Args:
-        config (MF Config): The config of Infer model.
-        network (InferenceModelForCausalLM): The network of infer model.
-
-    """
-
-    def __init__(self, config, network, is_quant, vllm_config):
-        self.vllm_config = vllm_config
-        self.is_310p = is_310p()
-        self.config = config
-        self.network = network
-        self.is_quant = is_quant
-        self.global_rank_id = get_rank()
-        self.global_group_size = get_group_size()
-        self.tp_group_size = ps.get_tensor_model_parallel_world_size()
-        self.dp_group_size = ps.get_data_parallel_world_size()
-        self.num_router_experts = self.config.moe_config.expert_num if \
-            self.config.moe_config.expert_num else 1
-        self.moe_ep_size = ps.get_moe_expert_parallel_world_size()
-        self.moe_tp_size = ps.get_moe_tensor_parallel_world_size()
-        self.tp_dp_size = ps.get_tensor_and_data_parallel_world_size()
-        self.ep_method = EPMethod.DEFAULT
-        if self.dp_group_size > 1\
-                and self.moe_ep_size == self.global_group_size:
-            self.ep_method = EPMethod.ALLTOALL
-        elif self.dp_group_size > 1:
-            self.ep_method = EPMethod.ALLGATHER
-        self.tp_rank_id = ps.get_tensor_model_parallel_rank()
-        self.tp_dp_rank_id = ps.get_tensor_and_data_parallel_rank()
-
-        self.ep_group_nums = self.num_router_experts // self.moe_ep_size
-        self.moe_ep_rank_id = ps.get_moe_expert_parallel_rank()
-        self.moe_tp_rank_id = ps.get_moe_tensor_parallel_rank()
-        self.ep_start = self.moe_ep_rank_id * self.ep_group_nums
-        self.ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums
-
-        self.parameter_dict = {}
-        self.file_handles = {}
-
-    def get_file_handles(self, filename):
-        if filename not in self.file_handles:
-            fp = safe_open(filename, framework="np")
-            self.file_handles[filename] = fp
-        return self.file_handles[filename]
-
-    def release_file_handles(self):
-        del self.file_handles
-
-    def get_safetensor_from_file(self, hf_param_name, src_hf_dir,
-                                 hf_weight_map):
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-
-        np_data = sf_file.get_tensor(hf_param_name)
-        return np_data, qint4
-
-    def get_safetensor_from_file_split_tp_group(self,
-                                                hf_param_name,
-                                                src_hf_dir,
-                                                hf_weight_map,
-                                                split_axis=0):
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-
-        np_data = sf_file.get_slice(hf_param_name)
-        shape = np_data.get_shape()
-        if split_axis == 0:
-            split_size = shape[0] // self.tp_group_size
-            start = self.tp_rank_id * split_size
-            stop = (self.tp_rank_id + 1) * split_size
-            split_data = np_data[start:stop]
-        elif split_axis == 1:
-            split_size = shape[1] // self.tp_group_size
-            start = self.tp_rank_id * split_size
-            stop = (self.tp_rank_id + 1) * split_size
-            split_data = np_data[:, start:stop]
-        elif split_axis == 2:
-            split_size = shape[2] // self.tp_group_size
-            start = self.tp_rank_id * split_size
-            stop = (self.tp_rank_id + 1) * split_size
-            split_data = np_data[:, :, start:stop]
-        else:
-            raise ValueError(
-                "split_axis:{} is not supported.".format(split_axis))
-        return split_data, qint4
-
-    def get_safetensor_from_file_split_tpdp_group(self,
-                                                  hf_param_name,
-                                                  src_hf_dir,
-                                                  hf_weight_map,
-                                                  split_axis=0):
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-
-        np_data = sf_file.get_slice(hf_param_name)
-        shape = np_data.get_shape()
-        if split_axis == 0:
-            split_size = shape[0] // self.tp_dp_size
-            start = self.tp_dp_rank_id * split_size
-            stop = (self.tp_dp_rank_id + 1) * split_size
-            split_data = np_data[start:stop]
-        elif split_axis == 1:
-            split_size = shape[1] // self.tp_dp_size
-            start = self.tp_dp_rank_id * split_size
-            stop = (self.tp_dp_rank_id + 1) * split_size
-            split_data = np_data[:, start:stop]
-        elif split_axis == 2:
-            split_size = shape[2] // self.tp_dp_size
-            start = self.tp_dp_rank_id * split_size
-            stop = (self.tp_dp_rank_id + 1) * split_size
-            split_data = np_data[:, :, start:stop]
-        else:
-            raise ValueError(
-                "split_axis:{} is not supported.".format(split_axis))
-        return split_data, qint4
-
-    def get_safetensor_from_file_split_global_group(self,
-                                                    hf_param_name,
-                                                    src_hf_dir,
-                                                    hf_weight_map,
-                                                    split_axis=0):
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-
-        np_data = sf_file.get_slice(hf_param_name)
-        shape = np_data.get_shape()
-        if split_axis == 0:
-            split_size = shape[0] // self.global_group_size
-            start = self.global_rank_id * split_size
-            stop = (self.global_rank_id + 1) * split_size
-            split_data = np_data[start:stop]
-        elif split_axis == 1:
-            split_size = shape[1] // self.global_group_size
-            start = self.global_rank_id * split_size
-            stop = (self.global_rank_id + 1) * split_size
-            split_data = np_data[:, start:stop]
-        elif split_axis == 2:
-            split_size = shape[2] // self.global_group_size
-            start = self.global_rank_id * split_size
-            stop = (self.global_rank_id + 1) * split_size
-            split_data = np_data[:, :, start:stop]
-        else:
-            raise ValueError(
-                "split_axis:{} is not supported.".format(split_axis))
-
-        return split_data, qint4
-
-    def get_safetensor_from_file_split_moe_tp_group(self,
-                                                    hf_param_name,
-                                                    src_hf_dir,
-                                                    hf_weight_map,
-                                                    split_axis=0):
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-
-        np_data = sf_file.get_slice(hf_param_name)
-        shape = np_data.get_shape()
-        if split_axis == 0:
-            split_size = shape[0] // self.moe_tp_size
-            start = self.moe_tp_rank_id * split_size
-            stop = (self.moe_tp_rank_id + 1) * split_size
-            split_data = np_data[start:stop]
-        elif split_axis == 1:
-            split_size = shape[1] // self.moe_tp_size
-            start = self.moe_tp_rank_id * split_size
-            stop = (self.moe_tp_rank_id + 1) * split_size
-            split_data = np_data[:, start:stop]
-        else:
-            raise ValueError(
-                "split_axis:{} is not supported.".format(split_axis))
-
-        return split_data, qint4
-
-    def get_routed_safetensor_3_dim(self,
-                                    hf_param_name,
-                                    src_hf_dir,
-                                    hf_weight_map,
-                                    split_ep=False,
-                                    split_tp=False,
-                                    tp_axis=-1):
-        '''get_routed_safetensor_3_dim'''
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-        if not split_tp and not split_ep:
-            np_data = sf_file.get_tensor(hf_param_name)
-            return np_data, qint4
-
-        np_data = sf_file.get_slice(hf_param_name)
-        if not split_tp and split_ep:
-            split_data = np_data[self.ep_start:self.ep_stop, :, :]
-            return split_data, qint4
-
-        shape = np_data.get_shape()
-        if tp_axis == 1:
-            split_size = shape[1] // self.moe_tp_size
-            start = self.moe_tp_rank_id * split_size
-            stop = (self.moe_tp_rank_id + 1) * split_size
-            split_data = np_data[
-                self.ep_start:self.ep_stop,
-                start:stop, :] if split_ep else np_data[:, start:stop, :]
-        elif tp_axis == 2:
-            split_size = shape[2] // self.moe_tp_size
-            start = self.moe_tp_rank_id * split_size
-            stop = (self.moe_tp_rank_id + 1) * split_size
-            split_data = np_data[
-                self.ep_start:self.ep_stop, :,
-                start:stop] if split_ep else np_data[:, :, start:stop]
-        else:
-            raise ValueError("tp_axis:{} is not supported.".format(tp_axis))
-        return split_data, qint4
-
-    def get_routed_safetensor_2_dim(self,
-                                    hf_param_name,
-                                    src_hf_dir,
-                                    hf_weight_map,
-                                    split_ep=False,
-                                    split_tp=False,
-                                    tp_axis=-1):
-        '''get_moe_routed_safetensor_2_dim'''
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-        if not split_tp and not split_ep:
-            np_data = sf_file.get_tensor(hf_param_name)
-            return np_data, qint4
-
-        np_data = sf_file.get_slice(hf_param_name)
-        if not split_tp and split_ep:
-            split_data = np_data[self.ep_start:self.ep_stop, :]
-            return split_data, qint4
-
-        shape = np_data.get_shape()
-        if tp_axis == 1:
-            split_size = shape[1] // self.moe_tp_size
-            start = self.moe_tp_rank_id * split_size
-            stop = (self.moe_tp_rank_id + 1) * split_size
-            split_data = np_data[
-                self.ep_start:self.ep_stop,
-                start:stop] if split_ep else np_data[:, start:stop]
-        else:
-            raise ValueError(
-                "split_tp is True but tp_axis:{} is not supported.".format(
-                    tp_axis))
-        return split_data, qint4
-
-    def split_weight_by_rank(self, weight, split_axis=0):
-        if self.tp_group_size == 1:
-            return weight
-
-        shape = weight.shape
-        if split_axis == 0:
-            split_size = shape[0] // self.tp_group_size
-            start = self.tp_rank_id * split_size
-            stop = (self.tp_rank_id + 1) * split_size
-            split_data = weight[start:stop]
-        elif split_axis == 1:
-            split_size = shape[1] // self.tp_group_size
-            start = self.tp_rank_id * split_size
-            stop = (self.tp_rank_id + 1) * split_size
-            split_data = weight[:, start:stop]
-        else:
-            raise ValueError(
-                "split_axis:{} is not supported.".format(split_axis))
-        return split_data
-
-    def load_safetensors_shard(self, src_hf_dir):
-        """ load safetensors and shards """
-        raise NotImplementedError(
-            "load_safetensors_shard method is not implemented.")
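
For reference, every sharding helper in the file deleted above computes its slice the same way: divide the split axis evenly by the size of the parallel group and keep the contiguous block that belongs to the local rank. A minimal standalone sketch of that arithmetic follows; it is an illustration only, with NumPy standing in for the safetensors slice object and a hypothetical function name, not code carried by this patch.

import numpy as np

def shard(weight: np.ndarray, rank: int, group_size: int,
          split_axis: int = 0) -> np.ndarray:
    # Keep the contiguous block of `weight` owned by `rank` along `split_axis`.
    if group_size == 1:
        return weight
    dim = weight.shape[split_axis]
    if dim % group_size != 0:
        raise ValueError("axis {} of size {} is not divisible by group size {}"
                         .format(split_axis, dim, group_size))
    split_size = dim // group_size
    index = [slice(None)] * weight.ndim
    index[split_axis] = slice(rank * split_size, (rank + 1) * split_size)
    return weight[tuple(index)]

# Example: rank 1 of a 4-way tensor-parallel group takes rows 256:512.
assert shard(np.zeros((1024, 768)), rank=1, group_size=4).shape == (256, 768)
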
self.model._set_jit_graph_name("prefill_lora") + self.set_model_inputs(input_ids, positions, + intermediate_tensors, inputs_embeds) + model_output = self.lora_prefill_graph(**model_inputs) else: - self.model.phase = "increment" - if self.decode_graph is None: + # self.model.phase = "increment" + # if self.decode_graph is None: + # set_model_context("is_prefill", False) + # self.model._set_jit_graph_name("decode") + # self.set_model_inputs(input_ids, positions, + # intermediate_tensors, inputs_embeds) + # self.decode_graph = ms.jit(function=self.model, jit_level="O0") + # model_output = self.decode_graph(**model_inputs) + if self.decode_graph is None or self.lora_decode_graph is None: set_model_context("is_prefill", False) + if get_model_context("no_lora"): + self.model.phase = "increment" + self.model._set_jit_graph_name("decode") + self.set_model_inputs(input_ids, positions, + intermediate_tensors, inputs_embeds) + self.decode_graph = ms.jit(function=self.model, jit_level="O0") + else: + self.model.phase = "increment_lora" + self.model._set_jit_graph_name("decode_lora") + self.set_model_inputs(input_ids, positions, + intermediate_tensors, inputs_embeds) + self.lora_decode_graph = ms.jit(function=self.model, + jit_level="O0") + if get_model_context("no_lora"): + self.model.phase = "increment" self.model._set_jit_graph_name("decode") self.set_model_inputs(input_ids, positions, - intermediate_tensors, inputs_embeds) - self.decode_graph = ms.jit(function=self.model, jit_level="O0") - model_output = self.decode_graph(**model_inputs) + intermediate_tensors, inputs_embeds) + model_output = self.decode_graph(**model_inputs) + else: + self.model.phase = "increment_lora" + self.model._set_jit_graph_name("decode_lora") + self.set_model_inputs(input_ids, positions, + intermediate_tensors, inputs_embeds) + model_output = self.lora_decode_graph(**model_inputs) + return model_output diff --git a/vllm_mindspore/model_executor/utils.py b/vllm_mindspore/model_executor/utils.py index b7187afe..3752ed69 100644 --- a/vllm_mindspore/model_executor/utils.py +++ b/vllm_mindspore/model_executor/utils.py @@ -33,7 +33,8 @@ def set_weight_attrs( setattr(weight, key, value) -_native_model_context = {"is_prefill": True} +_native_model_context = {"is_prefill": True, + "no_lora": True} def set_model_context(key, value): diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index 1c37be98..ca3199ba 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -42,10 +42,18 @@ def _get_cuda_graph_pad_size(self, # No need to use cuda graph for mindspore. return -1 +def profile_run(self) -> None: + max_num_batched_tokens = \ + self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + self._dummy_run(max_num_batched_tokens, max_num_seqs) + if self.lora_config: + self._dummy_run(max_num_batched_tokens, max_num_seqs, True) def _dummy_run(self, max_num_batched_tokens: int, - max_num_seqs: int = 1) -> None: + max_num_seqs: int = 1, + use_lora: bool = False) -> None: with self.set_in_profile_run(): # Enable top-k sampling to reflect the accurate memory usage. sampling_params = \ @@ -57,7 +65,7 @@ def _dummy_run(self, # passed in, which contains a lora from the lora warmup path. 
diff --git a/vllm_mindspore/model_executor/utils.py b/vllm_mindspore/model_executor/utils.py
index b7187afe..3752ed69 100644
--- a/vllm_mindspore/model_executor/utils.py
+++ b/vllm_mindspore/model_executor/utils.py
@@ -33,7 +33,8 @@ def set_weight_attrs(
         setattr(weight, key, value)
 
 
-_native_model_context = {"is_prefill": True}
+_native_model_context = {"is_prefill": True,
+                         "no_lora": True}
 
 
 def set_model_context(key, value):
diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py
index 1c37be98..ca3199ba 100644
--- a/vllm_mindspore/worker/model_runner.py
+++ b/vllm_mindspore/worker/model_runner.py
@@ -42,10 +42,18 @@ def _get_cuda_graph_pad_size(self,
     # No need to use cuda graph for mindspore.
     return -1
 
+def profile_run(self) -> None:
+    max_num_batched_tokens = \
+        self.scheduler_config.max_num_batched_tokens
+    max_num_seqs = self.scheduler_config.max_num_seqs
+    self._dummy_run(max_num_batched_tokens, max_num_seqs)
+    if self.lora_config:
+        self._dummy_run(max_num_batched_tokens, max_num_seqs, True)
 
 def _dummy_run(self,
                max_num_batched_tokens: int,
-               max_num_seqs: int = 1) -> None:
+               max_num_seqs: int = 1,
+               use_lora: bool = False) -> None:
     with self.set_in_profile_run():
         # Enable top-k sampling to reflect the accurate memory usage.
         sampling_params = \
@@ -57,7 +65,7 @@ def _dummy_run(self,
         # passed in, which contains a lora from the lora warmup path.
         dummy_lora_requests: List[LoRARequest] = []
         dummy_lora_requests_per_seq: List[LoRARequest] = []
-        if self.lora_config:
+        if use_lora:
             assert self.lora_manager is not None
             with self.lora_manager.dummy_lora_cache():
                 for idx in range(self.lora_config.max_loras):
@@ -172,7 +180,7 @@ def _dummy_run(self,
             self.execute_model(model_input, kv_caches, intermediate_tensors)
             torch.cuda.synchronize()
 
-        if self.lora_config:
+        if use_lora:
             # Remove dummy loras.
             assert self.lora_manager is not None
             self.remove_all_loras()
diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py
index 81db9bc4..f1d32c94 100644
--- a/vllm_mindspore/worker/worker.py
+++ b/vllm_mindspore/worker/worker.py
@@ -15,14 +15,15 @@
 # limitations under the License.
 """Adapted functions for mindspore in Worker."""
 
+from typing import List
 import math
 import subprocess
 import os
-import subprocess
 import psutil
 import torch
 
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SequenceGroupMetadata
@@ -172,7 +173,8 @@ def _prepare_input_for_warmup(model_config,
                               model_runner,
                               cache_engine,
                               is_prefill,
-                              is_mtp_model=False):
+                              is_mtp_model=False,
+                              use_lora=False):
     bs = 1
     seq_len = model_runner.scheduler_config.max_num_batched_tokens \
         if is_prefill else 1
@@ -182,6 +184,20 @@
         i for i in range(math.ceil(seq_len / cache_engine.block_size))
     ]
 
+    dummy_lora_requests: List[LoRARequest] = []
+    if use_lora:
+        assert model_runner.lora_manager is not None
+        LORA_WARMUP_RANK = 8
+        with model_runner.lora_manager.dummy_lora_cache():
+            dummy_lora_request = LoRARequest(
+                lora_name="warmup_for_decode",
+                lora_int_id=1,
+                lora_path="/not/a/real/path",
+            )
+            model_runner.lora_manager.add_dummy_lora(dummy_lora_request,
+                                                     rank=LORA_WARMUP_RANK)
+        dummy_lora_requests = dummy_lora_request
+
     # adapter multi modal warm up
     seq_data = dummy_data.seq_data
     if seq_len == 1:
@@ -194,7 +210,7 @@
             seq_data={idx: seq_data},
             sampling_params=SamplingParams(),
             block_tables={idx: block_tables_num},
-            lora_request=None,
+            lora_request=dummy_lora_requests if use_lora else None,
             multi_modal_data=None,
             multi_modal_placeholders=None,
         ) for idx in range(bs)
@@ -243,6 +259,16 @@ def _warm_up_model(self) -> None:
 
     torch.cuda.synchronize()
 
+    if self.vllm_config.lora_config is not None:
+        # warmup for lora prefill
+        #model_input, _ = _prepare_input_for_warmup(self.model_config, self.model_runner, self.cache_engine[0], True, False, True)
+        #self.model_runner.execute_model(model_input, kv_cache, None)
+        #torch.cuda.synchronize()
+        # warmup for lora decode
+        model_input, _ = _prepare_input_for_warmup(self.model_config, self.model_runner, self.cache_engine[0], False, False, True)
+        self.model_runner.execute_model(model_input, kv_cache, None)
+        torch.cuda.synchronize()
+
     # Reset the seed to ensure that the random state is not affected by
     # the model initialization and profiling.
     set_random_seed(self.model_config.seed)
-- 
Gitee