From 1baa59e16bfce151630381ed514159e7b1d61843 Mon Sep 17 00:00:00 2001 From: HighCloud Date: Fri, 4 Jul 2025 15:04:51 +0800 Subject: [PATCH 01/14] support native qwq --- vllm_mindspore/__init__.py | 10 ++ .../distributed/communication_op.py | 10 ++ vllm_mindspore/distributed/parallel_state.py | 93 +++++++++++++++++++ .../model_executor/layers/linear.py | 1 + .../model_loader/weight_utils.py | 13 +-- .../model_executor/models/model_base.py | 55 +++++++++-- vllm_mindspore/model_executor/models/qwen2.py | 30 +++++- vllm_mindspore/utils.py | 93 +++++++++++++++++++ vllm_mindspore/v1/worker/gpu_model_runner.py | 14 ++- vllm_mindspore/worker/cache_engine.py | 18 +++- vllm_mindspore/worker/model_runner.py | 5 +- 11 files changed, 320 insertions(+), 22 deletions(-) create mode 100644 vllm_mindspore/distributed/parallel_state.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index dbd26f9b..31e84d38 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -351,6 +351,16 @@ RejectionSampler._smallest_positive_value = _smallest_positive_value RejectionSampler._smallest_positive_value.__set_name__( RejectionSampler, "_smallest_positive_value") +import vllm.distributed.communication_op +import vllm.worker.worker_base +from vllm_mindspore.distributed.communication_op import cpu_broadcast_tensor_dict +vllm.distributed.communication_op.broadcast_tensor_dict = cpu_broadcast_tensor_dict +vllm.worker.worker_base.broadcast_tensor_dict = cpu_broadcast_tensor_dict + +import vllm.distributed.parallel_state +from vllm_mindspore.distributed.parallel_state import gc_broadcast_tensor_dict +vllm.distributed.parallel_state.GroupCoordinator.broadcast_tensor_dict = gc_broadcast_tensor_dict + ######### for multi-model from vllm_mindspore.inputs.registry import call_hf_processor from vllm.inputs.registry import InputProcessingContext diff --git a/vllm_mindspore/distributed/communication_op.py b/vllm_mindspore/distributed/communication_op.py index c933dc4a..475a282d 100644 --- a/vllm_mindspore/distributed/communication_op.py +++ b/vllm_mindspore/distributed/communication_op.py @@ -21,10 +21,20 @@ Implement a unified communication interface for both graph and pynative mode. """ +from typing import Any, Dict, Optional, Union +import torch + from mindspore import nn, ops from vllm.distributed.parallel_state import ( get_tensor_model_parallel_world_size, get_tp_group) +def cpu_broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[torch.Tensor, + Any]]] = None, + src: int = 0): + if not torch.distributed.is_initialized(): + return tensor_dict + return get_tp_group().broadcast_tensor_dict(tensor_dict, src, group=get_tp_group().cpu_group) + class ReduceFromModelParallelRegion(nn.Cell): "All reduce the input from the model parallel region." 
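Note on the __init__.py hunk above: it relies on plain module-attribute rebinding, and it patches broadcast_tensor_dict both in vllm.distributed.communication_op (where the function is defined) and in vllm.worker.worker_base, presumably because the worker module imports the name directly and would otherwise keep a reference to the original device-group implementation, while the new cpu_broadcast_tensor_dict wrapper routes the broadcast through the TP group's CPU communicator. A minimal, self-contained sketch of why both rebinds are needed (toy stand-in modules and names, not vLLM or vllm_mindspore code):

    # Toy illustration of the double monkey-patch used above
    # (hypothetical stand-in modules; not vLLM code).
    import types

    comm = types.ModuleType("comm")      # stands in for vllm.distributed.communication_op
    worker = types.ModuleType("worker")  # stands in for vllm.worker.worker_base

    def device_broadcast(tensor_dict, src=0):
        return ("device", tensor_dict, src)

    comm.broadcast_tensor_dict = device_broadcast
    # `from comm import broadcast_tensor_dict` copies the binding, so the worker
    # module keeps its own reference to the original function.
    worker.broadcast_tensor_dict = comm.broadcast_tensor_dict

    def cpu_broadcast(tensor_dict, src=0):
        return ("cpu", tensor_dict, src)

    # Rebinding only the defining module leaves the worker's copy untouched...
    comm.broadcast_tensor_dict = cpu_broadcast
    print(worker.broadcast_tensor_dict({"a": 1}))  # -> ('device', {'a': 1}, 0)

    # ...which is why the patch also rebinds the name where it was imported.
    worker.broadcast_tensor_dict = cpu_broadcast
    print(worker.broadcast_tensor_dict({"a": 1}))  # -> ('cpu', {'a': 1}, 0)
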
diff --git a/vllm_mindspore/distributed/parallel_state.py b/vllm_mindspore/distributed/parallel_state.py new file mode 100644 index 00000000..697196fa --- /dev/null +++ b/vllm_mindspore/distributed/parallel_state.py @@ -0,0 +1,93 @@ +import torch +import torch.distributed +from torch.distributed import ProcessGroup + +from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union) +from vllm.distributed.parallel_state import _split_tensor_dict, TensorMetadata +from vllm_mindspore.utils import atlas_inference + +def gc_broadcast_tensor_dict( + self, + tensor_dict: Optional[Dict[str, Union[torch.Tensor, Any]]] = None, + src: int = 0, + group: Optional[ProcessGroup] = None, + metadata_group: Optional[ProcessGroup] = None + ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: + """Broadcast the input tensor dictionary. + NOTE: `src` is the local rank of the source rank. + """ + # Bypass the function if we are using only 1 GPU. + if (not torch.distributed.is_initialized() or self.world_size == 1): + return tensor_dict + + if not atlas_inference(): + group = self.device_group + metadata_group = self.cpu_group + assert src < self.world_size, f"Invalid src rank ({src})" + + rank_in_group = self.rank_in_group + if rank_in_group == src: + metadata_list: List[Tuple[Any, Any]] = [] + assert isinstance( + tensor_dict, + dict), (f"Expecting a dictionary, got {type(tensor_dict)}") + metadata_list, tensor_list = _split_tensor_dict(tensor_dict) + # `metadata_list` lives in CPU memory. + # `broadcast_object_list` has serialization & deserialization, + # all happening on CPU. Therefore, we can use the CPU group. + self.broadcast_object(metadata_list, src=src) + async_handles = [] + for tensor in tensor_list: + if tensor.numel() == 0: + # Skip broadcasting empty tensors. + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + handle = torch.distributed.broadcast(tensor, + src=self.ranks[src], + group=metadata_group, + async_op=True) + else: + # use group for GPU tensors + handle = torch.distributed.broadcast(tensor, + src=self.ranks[src], + group=group, + async_op=True) + async_handles.append(handle) + for async_handle in async_handles: + async_handle.wait() + + else: + metadata_list = self.broadcast_object(None, src=src) + tensor_dict = {} + async_handles = [] + for key, value in metadata_list: + if isinstance(value, TensorMetadata): + tensor = torch.empty(value.size, + dtype=value.dtype, + device=value.device) + if tensor.numel() == 0: + # Skip broadcasting empty tensors. 
+ tensor_dict[key] = tensor + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + handle = torch.distributed.broadcast( + tensor, + src=self.ranks[src], + group=metadata_group, + async_op=True) + else: + # use group for GPU tensors + handle = torch.distributed.broadcast( + tensor, + src=self.ranks[src], + group=group, + async_op=True) + async_handles.append(handle) + tensor_dict[key] = tensor + else: + tensor_dict[key] = value + for async_handle in async_handles: + async_handle.wait() + return tensor_dict diff --git a/vllm_mindspore/model_executor/layers/linear.py b/vllm_mindspore/model_executor/layers/linear.py index c81f0e32..fc56df74 100644 --- a/vllm_mindspore/model_executor/layers/linear.py +++ b/vllm_mindspore/model_executor/layers/linear.py @@ -607,6 +607,7 @@ class RowParallelLinear(LinearBase): def weight_loader(self, param, loaded_weight): tp_rank = get_tensor_model_parallel_rank() + param_data = param.data input_dim = getattr(param, "input_dim", None) shard_size = self.input_size_per_partition start_idx = tp_rank * shard_size diff --git a/vllm_mindspore/model_executor/model_loader/weight_utils.py b/vllm_mindspore/model_executor/model_loader/weight_utils.py index 6bf2dd4c..e02de0ab 100644 --- a/vllm_mindspore/model_executor/model_loader/weight_utils.py +++ b/vllm_mindspore/model_executor/model_loader/weight_utils.py @@ -25,6 +25,8 @@ import mindspore as ms from mindspore import Parameter from safetensors import safe_open from tqdm.auto import tqdm +from vllm_mindspore.utils import atlas_inference +import numpy as np from vllm.model_executor.model_loader.weight_utils import (_BAR_FORMAT, enable_tqdm) @@ -66,12 +68,11 @@ def safetensors_weights_iterator( ): with safe_open(st_file, framework="np") as f: for name in f.keys(): # noqa: SIM118 - # Return a lightweight PySafeSlice object that uses file - # pointer offset internally to read Safetensor on demand, - # avoiding memory explosion. 
Actual data can be obtained - # through slicing operation like param[start:end] - param = f.get_slice(name) - yield name, param + # TODO: use slice + x = f.get_tensor(name) + x = x.astype(np.float16) \ + if (str(x.dtype) == 'bfloat16' and atlas_inference()) else x + yield name, ms.tensor(x) def default_weight_loader(param: Parameter, loaded_weight: Any) -> None: diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index f0e5621e..5fc07fd2 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -36,7 +36,7 @@ from vllm_mindspore.model_executor.models.utils import is_use_ringmla from vllm_mindspore.model_executor.utils import set_model_context from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE, create_kv_cache from vllm_mindspore.v1.attention.backends.ms_attn import MsAttentionMetadata - +from vllm_mindspore.utils import atlas_inference, FORMAT_TYPE class AttentionWrapper: @@ -47,12 +47,33 @@ class AttentionWrapper: vllm_config.parallel_config) head_size = vllm_config.model_config.get_head_size() num_block = 0 - self.kv_shape = [num_block, block_size, num_kv_heads, head_size] - self.kv_cache = [ - (create_kv_cache(self.kv_shape, vllm_config.model_config.dtype), - create_kv_cache(self.kv_shape, vllm_config.model_config.dtype)) - for _ in range(vllm_config.parallel_config.pipeline_parallel_size) - ] + + if atlas_inference(): + self.kv_shape = [num_block, block_size, num_kv_heads * head_size] + self.kv_cache = [ + ( + ops.auto_generate.format_cast( + ms.mint.zeros( + self.kv_shape, dtype=vllm_config.model_config.dtype + ), + FORMAT_TYPE['nz'], + ), + ops.auto_generate.format_cast( + ms.mint.zeros( + self.kv_shape, dtype=vllm_config.model_config.dtype + ), + FORMAT_TYPE['nz'], + ), + ) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + else: + self.kv_shape = [num_block, block_size, num_kv_heads, head_size] + self.kv_cache = [ + (create_kv_cache(self.kv_shape, vllm_config.model_config.dtype), + create_kv_cache(self.kv_shape, vllm_config.model_config.dtype)) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] self.attn_type = AttentionType.DECODER @@ -74,7 +95,22 @@ class MLAAttentionWrapper(AttentionWrapper): if kv_cache_dtype is None: kv_cache_dtype = vllm_config.model_config.dtype self.dtype = kv_cache_dtype - if not self.use_ringmla: + self.use_mla_op = bool( + vllm_config.additional_config + and vllm_config.additional_config.get('use_mla_op') == 1) + if atlas_inference(): + self.kv_cache = [ + ( + ops.auto_generate.format_cast( + ms.mint.zeros( + self.kv_shape, dtype=vllm_config.model_config.dtype + ), + FORMAT_TYPE['nz'], + ), + ) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + elif not self.use_mla_op or not self.use_ringmla: self.kv_cache = [ ( create_kv_cache( @@ -445,7 +481,8 @@ class NativeModel(MsModelBase): block_size = self.cache_config.block_size num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() - kv_cache_shape = (None, block_size, num_kv_heads, head_size) + kv_cache_shape = (None, block_size, num_kv_heads * head_size) if atlas_inference() \ + else (None, block_size, num_kv_heads, head_size) kv_cache_dtype = (self.model_config.dtype if self.cache_config.cache_dtype == "auto" else diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index db8e31c0..04115596 
100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -35,7 +35,8 @@ if TYPE_CHECKING: else: Qwen2Config = None -from mindspore import Parameter, Tensor, mint, nn +from mindspore import Parameter, Tensor, mint, nn, ops +import mindspore as ms from vllm.attention.backends.abstract import AttentionType from vllm.config import CacheConfig, VllmConfig @@ -45,6 +46,7 @@ from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.sequence import IntermediateTensors from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm_mindspore.utils import atlas_inference, FORMAT_TYPE from vllm_mindspore.attention import Attention from vllm_mindspore.model_executor.layers.activation import SiluAndMul from vllm_mindspore.model_executor.layers.layernorm import RMSNorm @@ -408,9 +410,35 @@ class Qwen2Model(nn.Cell): param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) + # Norm type in weights may be f32 + if(loaded_weight.dtype != param.dtype): + loaded_weight = loaded_weight.to(dtype=param.dtype) weight_loader(param, loaded_weight) loaded_params.add(name) + def adjust_weight(params_dict): + if not atlas_inference(): + return + + target_keywords = [ + "qkv_proj.weight", + "o_proj.weight", + "gate_up_proj.weight", + "down_proj.weight", + # "lm_head.weight", + ] + + for name, param in params_dict.items(): + if any(name.endswith(keyword) for keyword in target_keywords): + cast_weight = ops.auto_generate.format_cast(param, FORMAT_TYPE['nz']) + ms.runtime.synchronize() + param.set_data(cast_weight) + + if atlas_inference(): + ms.runtime.synchronize() + adjust_weight(params_dict) + ms.runtime.synchronize() + return loaded_params diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index f3a9adbe..4da8b795 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -436,3 +436,96 @@ def ms_memory_profiling( result.non_torch_increase = diff_from_create.non_torch_memory result.profile_time = diff_profile.timestamp result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa + + +def view(self, *shape_or_dtype): + from mindspore._c_expression import typing + if len(shape_or_dtype) == 1 and isinstance(shape_or_dtype[0], typing.Type): + target_dtype = shape_or_dtype[0] + ori_shape = self.shape + target_shape = (-1, ) + if len(ori_shape) > 1: + target_shape = ori_shape[:-1] + target_shape + out = np.frombuffer( + self.numpy(), + torch.ops.creation._TypeDict.get(target_dtype, np.float32)) + if not out.flags.aligned: + out = np.require(out, requirements=["ALIGNED"]) + if target_dtype == ms.bfloat16: + return ms.Tensor.from_numpy(out.astype( + np.float32)).astype(target_dtype).reshape(target_shape) + return ms.Tensor.from_numpy(out).reshape(target_shape) + result = [] + if type(shape_or_dtype) is tuple: + for items in shape_or_dtype: + if not isinstance(items, int): + for item in items: + if not isinstance(item, int): + result.append(item.item()) + else: + result.append(item) + else: + result.append(items) + return ms.ops.reshape(self, result) + +def is_version_ge(current_version, base_version): + """ + return current_version >= base_version. + Check whether the current version is higher than or equal to the base version. + for current_version: 1.8.1, base_version: 1.11.0, it return False. + """ + version_split_char = '.' 
+ if version_split_char not in base_version or version_split_char not in current_version: + raise ValueError("The version string must contain the `.` separator. " + "For example, current_version: 1.8.1, base_version: 1.11.0.") + for x, y in zip(current_version.split(version_split_char), base_version.split(version_split_char)): + if not x.isdigit() or not y.isdigit(): + continue + if int(x) != int(y): + return int(x) >= int(y) + return True + +def get_ascend_soc_version(): + """Get the Ascend SoC version.""" + if is_version_ge(ms.__version__, "2.2.0"): + from mindspore._c_expression import MSContext + return MSContext.get_instance().get_ascend_soc_version() + ascend_chip_type = os.getenv("ASCEND_CHIP_TYPE", "UNSET") + if ascend_chip_type not in ["910a", "910b", "UNSET"]: + raise EnvironmentError(f"ASCEND_CHIP_TYPE should be in ['910a', '910b'], but got {ascend_chip_type}") + if ascend_chip_type == "UNSET": + logger.info("Environment variables need to be set manually to obtain the chip type, " + "which can be set as follows:\n" + "For Atlas 800, run 'export ASCEND_CHIP_TYPE=910a' before the program runs.\n" + "For Atlas 800T A2, run 'export ASCEND_CHIP_TYPE=910b' before the program runs.\n" + "To detect the chip type automatically, MindSpore 2.2 or later is recommended.") + return ascend_chip_type + +def atlas_inference(): + device = get_ascend_soc_version() + return device in ['310p', 'ascend310p'] + +def check_ready(): + from mindspore import set_context + + # Common environment variables for prediction. + set_context(jit_config={"jit_level": "O0", "infer_boost": "on"}) + default_env = { + "MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST": + "FlashAttentionScore,PagedAttention", + } + if atlas_inference(): + default_env["MS_ENABLE_INTERNAL_BOOST"] = "off" + env_setup(default_env) + + if os.getenv("MS_MEMPOOL_BLOCK_SIZE"): + set_context( + mempool_block_size=f"{os.environ['MS_MEMPOOL_BLOCK_SIZE']}GB") + + if is_mindformers_model_backend(): + logger.info("Run with MindFormers backend!") + elif is_mindone_model_backend(): + logger.info("Run with MindONE backend!") + else: + logger.info("Run with native model backend!") + register_connector() diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index 54fb277c..2ee11fde 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -41,9 +41,10 @@ from vllm_mindspore.model_executor.layers.rotary_embedding import ( InferMRotaryEmbedding as MRotaryEmbedding) from vllm_mindspore.model_executor.models.utils import is_use_ringmla from vllm_mindspore.utils import (create_kv_cache, get_dtype_size, - get_valid_dtype, is_310p) + get_valid_dtype, is_310p, FORMAT_TYPE) from vllm_mindspore.v1.kv_cache_interface import MLAQuantFullAttentionSpec + logger = init_logger(__name__) @@ -443,6 +444,9 @@ def _reshape_kv_cache_tensors( kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) + if atlas_inference(): + *dims, second_last, last = kv_cache_shape + kv_cache_shape = (*dims, second_last * last) try: kv_cache_stride_order = self.attn_backends[ i].get_kv_cache_stride_order() @@ -483,6 +487,14 @@ def _reshape_kv_cache_tensors( cache_block_nz = ops.auto_generate.format_cast( cache_block, 29) kv_cache_layer.append(cache_block_nz) + elif atlas_inference(): + from mindspore.common.api import _pynative_executor + cache_block_nz = ops.auto_generate.format_cast(cache_block, FORMAT_TYPE['nz'])
+ _pynative_executor.sync() + import gc + del cache_block + gc.collect() + kv_cache_layer.append(cache_block_nz) else: kv_cache_layer.append(cache_block) kv_caches[layer_name] = mutable(tuple(kv_cache_layer)) diff --git a/vllm_mindspore/worker/cache_engine.py b/vllm_mindspore/worker/cache_engine.py index b57b8833..7675379c 100644 --- a/vllm_mindspore/worker/cache_engine.py +++ b/vllm_mindspore/worker/cache_engine.py @@ -22,17 +22,26 @@ # isort:skip_file import mindspore as ms -from mindspore import mutable, mint +from mindspore import mutable, mint, ops from typing import List from vllm.logger import init_logger -from vllm_mindspore.utils import MsKVCache, get_valid_dtype +from vllm_mindspore.utils import MsKVCache, get_valid_dtype, atlas_inference, FORMAT_TYPE logger = init_logger(__name__) def create_block(shape, dtype, name=None, device=None): - blocks = mint.empty(shape, dtype=dtype, device=device) + from mindspore.common.api import _pynative_executor + blocks = mint.empty(*shape, dtype=dtype, device=device) + if device == "Ascend" and atlas_inference(): + blocks_nz = ops.auto_generate.format_cast(blocks, FORMAT_TYPE['nz']) + _pynative_executor.sync() + import gc + del blocks + gc.collect() + ms.hal.empty_cache() + return blocks_nz return blocks @@ -44,6 +53,9 @@ def ms_allocate_kv_cache( """Allocates KV cache on the specified device.""" kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size) + if atlas_inference(): + *dims, second_last, last = kv_cache_shape + kv_cache_shape = (*dims, second_last * last) kv_cache: List[MsKVCache] = [] self.dtype = get_valid_dtype(self.dtype) diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index 7fd89fc5..6ab97c1b 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -28,7 +28,7 @@ from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.sequence import SequenceGroupMetadata -from vllm_mindspore.utils import STR_DTYPE_TO_TENSOR_DTYPE +from vllm_mindspore.utils import STR_DTYPE_TO_TENSOR_DTYPE, atlas_inference logger = init_logger(__name__) @@ -140,7 +140,8 @@ def _dummy_run(self, block_size = self.cache_config.block_size num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() - kv_shape = [0, block_size, num_kv_heads, head_size] + kv_shape = [0, block_size, num_kv_heads * head_size] if atlas_inference() else \ + [0, block_size, num_kv_heads, head_size] kv_caches = mutable([ mutable( ( -- Gitee From 11dfb2b2431d27781bfac97f25ee241171d7d65a Mon Sep 17 00:00:00 2001 From: one_east Date: Thu, 24 Jul 2025 20:31:06 +0800 Subject: [PATCH 02/14] CPU bind for 910B and 910C --- vllm_mindspore/worker/worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index cef21c02..24b1ed02 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -16,6 +16,7 @@ """Adapted functions for mindspore in Worker.""" import math +import subprocess import os import subprocess -- Gitee From 04053ec77184c93b3231d1d8d063d2a19f3e2f8a Mon Sep 17 00:00:00 2001 From: HighCloud Date: Wed, 30 Jul 2025 15:24:10 +0800 Subject: [PATCH 03/14] cpu bind support 310p --- vllm_mindspore/worker/worker.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index 24b1ed02..81db9bc4 100644 
--- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -159,6 +159,9 @@ def wrapper_worker_bind_cpu(fun): # Bind CPU with wrapper when workers are initializing. # Support 910B, 910C and 310P. local_rank = kwargs.get("local_rank") + parallel_config = kwargs.get("vllm_config").parallel_config + local_rank = (parallel_config.data_parallel_rank_local * + parallel_config.world_size + local_rank) bind_cpu(local_rank) fun(*arg, **kwargs) -- Gitee From 12d6427217264a940855c8ec729e2cf8906edcd6 Mon Sep 17 00:00:00 2001 From: superxf Date: Wed, 23 Jul 2025 15:28:07 +0800 Subject: [PATCH 04/14] support qwq --- vllm_mindspore/__init__.py | 40 ++++++ vllm_mindspore/config.py | 3 + vllm_mindspore/engine/arg_utils.py | 123 +++++++++++++++++- .../model_executor/layers/linear.py | 43 ++++-- .../layers/quantization/__init__.py | 49 +++++++ .../layers/quantization/base_config.py | 3 + .../quantization/smooth_quant_modelslim.py | 39 +++--- .../model_loader/default_loader.py | 99 ++++++++++++++ .../model_executor/model_loader/utils.py | 55 ++++++++ .../model_loader/weight_utils.py | 109 +++++++++++++++- vllm_mindspore/model_executor/models/qwen2.py | 5 + 11 files changed, 527 insertions(+), 41 deletions(-) create mode 100644 vllm_mindspore/model_executor/model_loader/default_loader.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 31e84d38..7073717d 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -561,6 +561,46 @@ sys.modules["vllm.entrypoints.openai.tool_parsers.deepseekv3_tool_parser"] = ( from vllm_mindspore.entrypoints.__main__ import ( patch_server_run_api_server_worker_proc, ) +from vllm_mindspore.model_executor.model_loader.utils import ( + process_weights_after_loading) + +vllm.model_executor.model_loader.utils.process_weights_after_loading = ( + process_weights_after_loading) +vllm.model_executor.model_loader.base_loader.process_weights_after_loading = ( + process_weights_after_loading) + +from vllm_mindspore.model_executor.layers.quantization import ( + get_quantization_config) + +vllm.model_executor.layers.quantization.get_quantization_config = ( + get_quantization_config) +vllm.config.get_quantization_config = get_quantization_config +vllm.model_executor.model_loader.weight_utils.get_quantization_config = ( + get_quantization_config) + +from vllm_mindspore.model_executor.model_loader.weight_utils import ( + get_quant_config) + +vllm.model_executor.model_loader.weight_utils.get_quant_config = ( + get_quant_config) +vllm.config.get_quant_config = get_quant_config + +from vllm_mindspore.model_executor.layers.quantization import ( + QuantizationMethods) + +vllm.model_executor.layers.quantization.QuantizationMethods = ( + QuantizationMethods) + +from vllm_mindspore.engine.arg_utils import get_kwargs + +vllm.engine.arg_utils.get_kwargs = get_kwargs + +from vllm_mindspore.model_executor.model_loader.default_loader import ( + _prepare_weights) +from vllm.model_executor.model_loader.default_loader import DefaultModelLoader + +DefaultModelLoader._prepare_weights = _prepare_weights + patch_server_run_api_server_worker_proc() from vllm_mindspore.model_executor.models.registry import _normalize_archs diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 7f93164b..5f7da8c5 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -264,6 +264,9 @@ def _get_and_verify_dtype( if torch_dtype in _STR_DTYPE_TO_TORCH_DTYPE: torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[torch_dtype] + if is_310p() and torch_dtype == 
torch.bfloat16: + return torch.float16 + return torch_dtype diff --git a/vllm_mindspore/engine/arg_utils.py b/vllm_mindspore/engine/arg_utils.py index f7a3d6aa..9803e158 100644 --- a/vllm_mindspore/engine/arg_utils.py +++ b/vllm_mindspore/engine/arg_utils.py @@ -19,19 +19,132 @@ # limitations under the License. """Adaption for arguments utils.""" +import argparse +import json import threading -from typing import get_args +from dataclasses import MISSING, fields, is_dataclass +from typing import Any, Literal, get_origin import torch import vllm.envs as envs -from vllm.config import (GuidedDecodingBackendV1, LoadFormat, ModelConfig, - ParallelConfig, SchedulerConfig) -from vllm.engine.arg_utils import (EngineArgs, _raise_or_fallback, - _warn_or_fallback) from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext logger = init_logger(__name__) +from pydantic import TypeAdapter, ValidationError +from vllm.config import (ConfigType, GuidedDecodingBackendV1, LoadFormat, + ModelConfig, ParallelConfig, SchedulerConfig) +from vllm.engine.arg_utils import (EngineArgs, TypeHint, _raise_or_fallback, + _warn_or_fallback, contains_type, get_args, + get_attr_docs, get_type, get_type_hints, + human_readable_int, is_not_builtin, + literal_to_kwargs, optional_type, + parse_type, union_dict_and_str) + +from vllm_mindspore.model_executor.layers.quantization import ( + QUANTIZATION_METHODS) + + +def get_kwargs(cls: ConfigType) -> dict[str, Any]: + cls_docs = get_attr_docs(cls) + kwargs = {} + for field in fields(cls): + type_hints: set[TypeHint] = get_type_hints(field.type) + + # If the field is a dataclass, we can use the model_validate_json + generator = (th for th in type_hints if is_dataclass(th)) + dataclass_cls = next(generator, None) + + # Get the default value of the field + if field.default is not MISSING: + default = field.default + elif field.default_factory is not MISSING: + default = field.default_factory() + + # Get the help text for the field + name = field.name + help = cls_docs[name].strip() + # Escape % for argparse + help = help.replace("%", "%%") + + # Initialise the kwargs dictionary for the field + kwargs[name] = {"default": default, "help": help} + + # Set other kwargs based on the type hints + json_tip = """\n\nShould either be a valid JSON string or JSON keys + passed individually. For example, the following sets of arguments are + equivalent:\n\n + - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n + - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n\n""" + if dataclass_cls is not None: + + def parse_dataclass(val: str, cls=dataclass_cls) -> Any: + try: + if hasattr(cls, "from_cli"): + return cls.from_cli(val) + return TypeAdapter(cls).validate_json(val) + except ValidationError as e: + raise argparse.ArgumentTypeError(repr(e)) from e + + kwargs[name]["type"] = parse_dataclass + kwargs[name]["help"] += json_tip + elif contains_type(type_hints, bool): + # Creates --no- and -- flags + kwargs[name]["action"] = argparse.BooleanOptionalAction + elif contains_type(type_hints, Literal): + kwargs[name].update(literal_to_kwargs(type_hints)) + elif contains_type(type_hints, tuple): + type_hint = get_type(type_hints, tuple) + types = get_args(type_hint) + tuple_type = types[0] + assert all(t is tuple_type for t in types if t is not Ellipsis), ( + "All non-Ellipsis tuple elements must be of the same " + f"type. 
Got {types}.") + kwargs[name]["type"] = tuple_type + kwargs[name]["nargs"] = "+" if Ellipsis in types else len(types) + elif contains_type(type_hints, list): + type_hint = get_type(type_hints, list) + types = get_args(type_hint) + assert len(types) == 1, ( + "List type must have exactly one type. Got " + f"{type_hint} with types {types}") + kwargs[name]["type"] = types[0] + kwargs[name]["nargs"] = "+" + elif contains_type(type_hints, int): + kwargs[name]["type"] = int + # Special case for large integers + if name in {"max_model_len", "max_num_batched_tokens"}: + kwargs[name]["type"] = human_readable_int + elif contains_type(type_hints, float): + kwargs[name]["type"] = float + elif (contains_type(type_hints, dict) + and (contains_type(type_hints, str) + or any(is_not_builtin(th) for th in type_hints))): + kwargs[name]["type"] = union_dict_and_str + elif contains_type(type_hints, dict): + kwargs[name]["type"] = parse_type(json.loads) + kwargs[name]["help"] += json_tip + elif (contains_type(type_hints, str) + or any(is_not_builtin(th) for th in type_hints)): + kwargs[name]["type"] = str + else: + raise ValueError( + f"Unsupported type {type_hints} for argument {name}.") + + # If the type hint was a sequence of literals, use the helper function + # to update the type and choices + if get_origin(kwargs[name].get("type")) is Literal: + kwargs[name].update(literal_to_kwargs({kwargs[name]["type"]})) + + # If None is in type_hints, make the argument optional. + # But not if it's a bool, argparse will handle this better. + if type(None) in type_hints and not contains_type(type_hints, bool): + kwargs[name]["type"] = optional_type(kwargs[name]["type"]) + if kwargs[name].get("choices"): + kwargs[name]["choices"].append("None") + if field.name == "quantization": + kwargs[name]["choices"] = QUANTIZATION_METHODS + return kwargs def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: diff --git a/vllm_mindspore/model_executor/layers/linear.py b/vllm_mindspore/model_executor/layers/linear.py index fc56df74..ecf6298c 100644 --- a/vllm_mindspore/model_executor/layers/linear.py +++ b/vllm_mindspore/model_executor/layers/linear.py @@ -18,7 +18,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """ Linear methods for quantized linear layers. 
""" - from abc import abstractmethod from typing import Optional, Union @@ -343,14 +342,23 @@ class MergedColumnParallelLinear(ColumnParallelLinear): assert loaded_shard_id < len(self.output_sizes) shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size shard_size = self.output_sizes[loaded_shard_id] // tp_size - - start_idx = tp_rank * shard_size - loaded_weight = split_loaded_weight(loaded_weight, output_dim, - start_idx, shard_size) - - assert loaded_weight.shape == (shard_size, param.shape[1]) - param[shard_offset:shard_offset + - shard_size, :] = ms.from_numpy(loaded_weight) + param_data = param.data + param_data = param_data.narrow(output_dim, shard_offset, + shard_size) + start_idx = tp_rank * shard_size + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size).contiguous() + assert param_data.shape == loaded_weight.shape + if len(loaded_weight.shape) == 2: + param[shard_offset:shard_offset + + shard_size, :] = loaded_weight + else: + param[shard_offset:shard_offset + shard_size] = loaded_weight + else: + assert param.shape == loaded_weight.shape + if loaded_weight.dtype == ms.float32 and param.dtype == ms.float16: + loaded_weight = loaded_weight.astype(ms.float16) + param.set_data(loaded_weight.contiguous()) class QKVParallelLinear(ColumnParallelLinear): @@ -433,6 +441,11 @@ class QKVParallelLinear(ColumnParallelLinear): loaded_weight, loaded_shard_id: Optional[str] = None): output_dim = getattr(param, "output_dim", None) + if output_dim is None: + if loaded_weight.dtype == ms.float32 and param.dtype == ms.float16: + loaded_weight = loaded_weight.astype(ms.float16) + param.set_data(loaded_weight.contiguous()) + return tp_rank = get_tensor_model_parallel_rank() # QKV loaded weight is already fused on disk (qkv safetensors). @@ -482,11 +495,13 @@ class QKVParallelLinear(ColumnParallelLinear): start_idx, shard_size) loaded_weight = ms.from_numpy(loaded_weight) - if param.name.endswith("weight"): - assert loaded_weight.shape == (shard_size, param.shape[1]) - if param.name.endswith("bias"): - assert loaded_weight.shape == (shard_size, ) - param[shard_offset:shard_offset + shard_size] = loaded_weight + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size).contiguous() + assert param_data.shape == loaded_weight.shape + if len(loaded_weight.shape) == 2: + param[shard_offset:shard_offset + shard_size, :] = loaded_weight + else: + param[shard_offset:shard_offset + shard_size] = loaded_weight class RowParallelLinear(LinearBase): diff --git a/vllm_mindspore/model_executor/layers/quantization/__init__.py b/vllm_mindspore/model_executor/layers/quantization/__init__.py index e69de29b..6c9e2e41 100644 --- a/vllm_mindspore/model_executor/layers/quantization/__init__.py +++ b/vllm_mindspore/model_executor/layers/quantization/__init__.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +from typing import Literal, get_args + +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + +QuantizationMethods = Literal["smoothquant"] +QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods)) + +# The customized quantization methods which will be added to this dict. +_CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {} + + +def get_quantization_config(quantization: str) -> type[QuantizationConfig]: + if quantization not in QUANTIZATION_METHODS: + raise ValueError(f"Invalid quantization method: {quantization}") + + # lazy import to avoid triggering `torch.compile` too early + from .smooth_quant_modelslim import SmoothQuantModelSlimConfig + method_to_config: dict[str, type[QuantizationConfig]] = { + "smoothquant": SmoothQuantModelSlimConfig + } + # Update the `method_to_config` with customized quantization methods. + method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) + + return method_to_config[quantization] + + +__all__ = [ + "QuantizationConfig", "get_quantization_config", "QUANTIZATION_METHODS", + "QuantizationMethods" +] diff --git a/vllm_mindspore/model_executor/layers/quantization/base_config.py b/vllm_mindspore/model_executor/layers/quantization/base_config.py index 37144a43..5728702d 100644 --- a/vllm_mindspore/model_executor/layers/quantization/base_config.py +++ b/vllm_mindspore/model_executor/layers/quantization/base_config.py @@ -142,6 +142,9 @@ class QuantizationConfig(ABC): """ raise NotImplementedError + def get_cache_scale(self, name: str) -> Optional[str]: + return None + def method_has_implemented_embedding( method_class: type[QuantizeMethodBase]) -> bool: diff --git a/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py b/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py index 96697046..cad2f322 100644 --- a/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py +++ b/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py @@ -14,12 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import re from typing import Any, Optional import mindspore import numpy as np -from mindspore import Parameter, Tensor, mint +from mindspore import Parameter, Tensor, ops from mindspore.common.initializer import initializer from mindspore.ops.auto_generate import (DynamicQuantExt, GroupedMatmul, GroupedMatmulV4, QuantBatchMatmul) @@ -107,7 +106,7 @@ class SmoothQuantModelSlimConfig(QuantizationConfig): return BaseKVCacheMethod(self) if isinstance(layer, LinearBase): - if quant_config and quant_config.lower() == 'w8a8': + if quant_config and quant_config.lower() == 'w8a8s': return A8W8LinearMethod(self) if quant_config and quant_config.lower() == 'w8a8_dyn': self.dynamic_quant = True @@ -224,12 +223,12 @@ class A8W8LinearMethod(LinearMethodBase): self.params_dtype), name="input_offset") if self.is_310p: - quant_bias_ = Parameter(initializer( + quant_bias = Parameter(initializer( 'zeros', (self.output_size_per_partition // self.quant_config.pack_factor, ), mindspore.int32), - name="quant_bias_") + name="quant_bias") else: - quant_bias_ = None + quant_bias = None set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) set_weight_attrs(weight_scale, {"output_dim": 0}) @@ -240,11 +239,12 @@ class A8W8LinearMethod(LinearMethodBase): set_weight_attrs(deq_scale, extra_weight_attrs) set_weight_attrs(input_scale, extra_weight_attrs) set_weight_attrs(input_offset, extra_weight_attrs) - if quant_bias_ is not None: - set_weight_attrs(quant_bias_, extra_weight_attrs) - layer.insert_param_to_cell("quant_bias_", quant_bias_) + if quant_bias is not None: + set_weight_attrs(quant_bias, extra_weight_attrs) + set_weight_attrs(quant_bias, {"output_dim": 0}) + layer.insert_param_to_cell("quant_bias", quant_bias) else: - layer.quant_bias_ = None + layer.quant_bias = None layer.insert_param_to_cell("weight", weight) layer.insert_param_to_cell("weight_scale", weight_scale) @@ -285,7 +285,7 @@ class A8W8LinearMethod(LinearMethodBase): input_offset = Parameter(initializer('zeros', input_scale_shape, self.params_dtype), name="input_offset") - quant_bias_ = None + quant_bias = None set_weight_attrs(weight, { "ep_dim": 0, "input_dim": 1, @@ -300,11 +300,11 @@ class A8W8LinearMethod(LinearMethodBase): set_weight_attrs(deq_scale, extra_weight_attrs) set_weight_attrs(input_scale, extra_weight_attrs) set_weight_attrs(input_offset, extra_weight_attrs) - if quant_bias_ is not None: - set_weight_attrs(quant_bias_, extra_weight_attrs) - layer.insert_param_to_cell("quant_bias_", quant_bias_) + if quant_bias is not None: + set_weight_attrs(quant_bias, extra_weight_attrs) + layer.insert_param_to_cell("quant_bias", quant_bias) else: - layer.quant_bias_ = None + layer.quant_bias = None layer.insert_param_to_cell("weight", weight) layer.insert_param_to_cell("weight_scale", weight_scale) @@ -313,8 +313,7 @@ class A8W8LinearMethod(LinearMethodBase): layer.insert_param_to_cell("input_offset", input_offset) def process_weights_after_loading(self, layer: mindspore.nn.Cell) -> None: - input_offset = np.array([0]) - params_dtype = layer.params_dtype + input_offset = layer.input_offset.asnumpy() layer.input_offset = Parameter(Tensor(input_offset, dtype=mindspore.int8), name=layer.input_offset.name) @@ -335,7 +334,7 @@ class A8W8LinearMethod(LinearMethodBase): layer.weight_scale = Parameter(Tensor( weight_scale, dtype=layer.weight_scale.dtype), name=layer.weight_scale.name) - if not self.is_310p and params_dtype is mindspore.bfloat16: + if not self.is_310p and self.params_dtype is mindspore.bfloat16: deq_scale = 
layer.deq_scale.asnumpy().astype(np.int32).view( np.float32) layer.deq_scale = Parameter(Tensor(deq_scale, @@ -373,10 +372,10 @@ class A8W8LinearMethod(LinearMethodBase): group_type=0, group_list_type=0 if cumsum_flag else 1)[0] else: - qx = self.matmul(qx, weight, deq_scale, None, layer.quant_bias_, + qx = self.matmul(qx, weight, deq_scale, None, layer.quant_bias, None) if bias is not None: - qx = mint.add(qx, bias) + qx = self.bias_add(qx, bias) qx = qx.reshape(output_shape) return qx diff --git a/vllm_mindspore/model_executor/model_loader/default_loader.py b/vllm_mindspore/model_executor/model_loader/default_loader.py new file mode 100644 index 00000000..dbd6ea8b --- /dev/null +++ b/vllm_mindspore/model_executor/model_loader/default_loader.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: Apache-2.0 +import glob +import os +from typing import Optional + +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME +from vllm.config import LoadFormat +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.model_executor.model_loader.weight_utils import ( + download_safetensors_index_file_from_hf, download_weights_from_hf, + filter_duplicate_safetensors_files, filter_files_not_needed_for_inference) + + +def _prepare_weights( + self, + model_name_or_path: str, + revision: Optional[str], + fall_back_to_pt: bool, + allow_patterns_overrides: Optional[list[str]], +) -> tuple[str, list[str], bool]: + """Prepare weights for the model. + + If the model is not local, it will be downloaded.""" + model_name_or_path = (self._maybe_download_from_modelscope( + model_name_or_path, revision) or model_name_or_path) + + is_local = os.path.isdir(model_name_or_path) + load_format = self.load_config.load_format + use_safetensors = False + index_file = SAFE_WEIGHTS_INDEX_NAME + # Some quantized models use .pt files for storing the weights. + if load_format == LoadFormat.AUTO: + allow_patterns = ["*.safetensors", "*.bin"] + elif (load_format == LoadFormat.SAFETENSORS + or load_format == LoadFormat.FASTSAFETENSORS): + use_safetensors = True + allow_patterns = ["*.safetensors"] + elif load_format == LoadFormat.MISTRAL: + use_safetensors = True + allow_patterns = ["consolidated*.safetensors"] + index_file = "consolidated.safetensors.index.json" + elif load_format == LoadFormat.PT: + allow_patterns = ["*.pt"] + elif load_format == LoadFormat.NPCACHE: + allow_patterns = ["*.bin"] + else: + raise ValueError(f"Unknown load_format: {load_format}") + + if fall_back_to_pt: + allow_patterns += ["*.pt"] + + if allow_patterns_overrides is not None: + allow_patterns = allow_patterns_overrides + + if not is_local: + hf_folder = download_weights_from_hf( + model_name_or_path, + self.load_config.download_dir, + allow_patterns, + revision, + ignore_patterns=self.load_config.ignore_patterns, + ) + else: + hf_folder = model_name_or_path + hf_weights_files: list[str] = [] + for pattern in allow_patterns: + hf_weights_files += glob.glob(os.path.join(hf_folder, pattern)) + if len(hf_weights_files) == 0: + tp_rank = get_tensor_model_parallel_rank() + hf_weights_files += glob.glob( + os.path.join(hf_folder, f"rank_{tp_rank}", pattern)) + if len(hf_weights_files) > 0: + if pattern == "*.safetensors": + use_safetensors = True + break + if use_safetensors: + # For models like Mistral-7B-Instruct-v0.3 + # there are both sharded safetensors files and a consolidated + # safetensors file. Using both breaks. + # Here, we download the `model.safetensors.index.json` and filter + # any files not found in the index. 
+ if not is_local: + download_safetensors_index_file_from_hf( + model_name_or_path, + index_file, + self.load_config.download_dir, + revision, + ) + hf_weights_files = filter_duplicate_safetensors_files( + hf_weights_files, hf_folder, index_file) + else: + hf_weights_files = filter_files_not_needed_for_inference( + hf_weights_files) + + if len(hf_weights_files) == 0: + raise RuntimeError( + f"Cannot find any model weights with `{model_name_or_path}`") + + return hf_folder, hf_weights_files, use_safetensors diff --git a/vllm_mindspore/model_executor/model_loader/utils.py b/vllm_mindspore/model_executor/model_loader/utils.py index fb155027..c8b6bc16 100644 --- a/vllm_mindspore/model_executor/model_loader/utils.py +++ b/vllm_mindspore/model_executor/model_loader/utils.py @@ -22,8 +22,12 @@ from mindspore import nn from vllm.config import ModelConfig, ModelImpl from vllm.model_executor.model_loader.utils import logger +from vllm.attention import Attention + from vllm.model_executor.models import ModelRegistry +from vllm_mindspore.model_executor.layers.quantization.base_config import ( + QuantizeMethodBase) from vllm_mindspore.model_executor.models.registry import ( AUTO_SELECT_FIXED_MODEL, MindSporeModelRegistry, mcore_support_list, mf_supported, mindone_supported) @@ -188,3 +192,54 @@ def get_ms_model_architecture( raise RecursionError("MindSpore unsupported reward model task now!") return model_cls, arch + + +def convert_uint64_to_fp32(arr: np.ndarray): + arr_fp32 = arr.view(np.float32) + output = arr_fp32[:, :, 0::2] + return output + + +def np_int4data_pack_to_int8_3d(np_data): + np_data = np_data.astype(np.int8) + np_data &= 0x000F + np_data[::, ::, 0::2] <<= 0 + np_data[::, ::, 1::2] <<= 4 + np_int4_data = np_data[::, ::, 0::2] | np_data[::, ::, 1::2] + return np_int4_data + + +def unpack_int8_to_int4_3d(packed_data): + low_nibbles = (packed_data & 0x0F).astype(np.uint8) + high_nibbles = ((packed_data >> 4) & 0x0F).astype(np.uint8) + + unpacked = np.empty((*packed_data.shape[:2], packed_data.shape[2] * 2), + dtype=np.uint8) + unpacked[..., 0::2] = low_nibbles + unpacked[..., 1::2] = high_nibbles + + return unpacked + + +def process_weights_after_loading(model: nn.Module, model_config: ModelConfig, + target_device: torch.device) -> None: + for _, module in model.named_modules(): + quant_method = getattr(module, "quant_method", None) + if isinstance(quant_method, QuantizeMethodBase): + # # When quant methods need to process weights after loading + # # (for repacking, quantizing, etc), they expect parameters + # # to be on the global target device. This scope is for the + # # case where cpu offloading is used, where we will move the + # # parameters onto device for processing and back off after. + # with device_loading_context(module, target_device): + quant_method.process_weights_after_loading(module) + + # Currently only used by MLA. + # NOTE: This intentionally happens after other modules so we can easily + # decompress the weights for MLA. 
+ for _, module in model.named_modules(): + if isinstance(module, Attention) and \ + hasattr(module, "process_weights_after_loading"): + # TODO(lucas): see if there is a way to unify the signatures + # of process_weights_after_loading + module.process_weights_after_loading(model_config.dtype) diff --git a/vllm_mindspore/model_executor/model_loader/weight_utils.py b/vllm_mindspore/model_executor/model_loader/weight_utils.py index e02de0ab..4e535c01 100644 --- a/vllm_mindspore/model_executor/model_loader/weight_utils.py +++ b/vllm_mindspore/model_executor/model_loader/weight_utils.py @@ -17,16 +17,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import glob +import json +import os from collections.abc import Generator from typing import Any +import huggingface_hub import mindspore as ms +import numpy as np +from huggingface_hub import snapshot_download from mindspore import Parameter from safetensors import safe_open from tqdm.auto import tqdm +from vllm.config import LoadConfig +from vllm.model_executor.model_loader.weight_utils import (DisabledTqdm, + get_lock) + +from vllm_mindspore.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm_mindspore.platforms.ascend import ModelConfig from vllm_mindspore.utils import atlas_inference -import numpy as np from vllm.model_executor.model_loader.weight_utils import (_BAR_FORMAT, enable_tqdm) @@ -79,3 +90,97 @@ def default_weight_loader(param: Parameter, loaded_weight: Any) -> None: """Default weight loader.""" loaded_weight = loaded_weight[:] param.set_data(ms.Tensor(loaded_weight, dtype=param.dtype)) + + +def get_quant_config(model_config: ModelConfig, + load_config: LoadConfig) -> QuantizationConfig: + + from vllm_mindspore.model_executor.layers.quantization import ( + get_quantization_config) + quant_cls = get_quantization_config(model_config.quantization) + + # GGUF doesn't have config file + if model_config.quantization == "gguf": + return quant_cls.from_config({}) + + # Read the quantization config from the HF model config, if available. + hf_quant_config = getattr(model_config.hf_config, "quantization_config", + None) + # some vision model may keep quantization_config in their text_config + hf_text_config = getattr(model_config.hf_config, "text_config", None) + if hf_quant_config is None and hf_text_config is not None: + hf_quant_config = getattr(hf_text_config, "quantization_config", None) + if hf_quant_config is None: + # compressed-tensors uses a compressions_config + hf_quant_config = getattr(model_config.hf_config, "compression_config", + None) + if hf_quant_config is not None: + if os.path.isdir(model_config.model): + quant_config_file = os.path.join( + model_config.model, + quant_cls.get_config_filenames()[0]) + with open(quant_config_file) as f: + quant_config = json.load(f) + return quant_cls.from_config(hf_quant_config | quant_config) + + # In case of bitsandbytes/QLoRA, get quant config from the adapter model. 
+ if model_config.quantization == "bitsandbytes": + if (not load_config.model_loader_extra_config + or "qlora_adapter_name_or_path" + not in load_config.model_loader_extra_config): + return quant_cls.from_config({"adapter_name_or_path": ""}) + model_name_or_path = load_config.model_loader_extra_config[ + "qlora_adapter_name_or_path"] + + else: + model_name_or_path = model_config.model + is_local = os.path.isdir(model_name_or_path) + if not is_local: + # Download the config files. + with get_lock(model_name_or_path, load_config.download_dir): + hf_folder = snapshot_download( + model_name_or_path, + revision=model_config.revision, + allow_patterns="*.json", + cache_dir=load_config.download_dir, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + tqdm_class=DisabledTqdm, + ) + else: + hf_folder = model_name_or_path + + possible_config_filenames = quant_cls.get_config_filenames() + + # If the quantization config is not found, use the default config. + if not possible_config_filenames: + return quant_cls() + + config_files = glob.glob(os.path.join(hf_folder, "*.json")) + + quant_config_files = [ + f for f in config_files if any( + f.endswith(x) for x in possible_config_filenames) + ] + if len(quant_config_files) == 0: + raise ValueError( + f"Cannot find the config file for {model_config.quantization}") + if len(quant_config_files) > 1: + raise ValueError( + f"Found multiple config files for {model_config.quantization}: " + f"{quant_config_files}") + + quant_config_file = quant_config_files[0] + with open(quant_config_file) as f: + config = json.load(f) + + if model_config.quantization == "bitsandbytes": + config["adapter_name_or_path"] = model_name_or_path + elif model_config.quantization == "modelopt": + if config["producer"]["name"] == "modelopt": + return quant_cls.from_config(config) + else: + raise ValueError( + f"Unsupported quantization config" + f" found for {model_config.quantization} in {f}.") + + return quant_cls.from_config(config) diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 04115596..b9ddd507 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -45,6 +45,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.sequence import IntermediateTensors from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.distributed import get_tensor_model_parallel_rank from vllm_mindspore.utils import atlas_inference, FORMAT_TYPE from vllm_mindspore.attention import Attention @@ -377,6 +378,10 @@ class Qwen2Model(nn.Cell): ] for name, loaded_weight in weights: + if get_tensor_model_parallel_rank( + ) > 0 and "o_proj.quant_bias" in name: + continue + if "rotary_emb.inv_freq" in name: continue if ("rotary_emb.cos_cached" in name -- Gitee From 2449b6755a07fe78724fe0b918bd9d13a8914af1 Mon Sep 17 00:00:00 2001 From: superxf Date: Wed, 30 Jul 2025 16:47:52 +0800 Subject: [PATCH 05/14] fix new branch --- .../model_executor/layers/linear.py | 43 ++++++------------- .../model_loader/weight_utils.py | 23 +++++++++- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/vllm_mindspore/model_executor/layers/linear.py b/vllm_mindspore/model_executor/layers/linear.py index ecf6298c..109118cf 100644 --- a/vllm_mindspore/model_executor/layers/linear.py +++ b/vllm_mindspore/model_executor/layers/linear.py @@ -342,23 +342,14 @@ class 
MergedColumnParallelLinear(ColumnParallelLinear): assert loaded_shard_id < len(self.output_sizes) shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size shard_size = self.output_sizes[loaded_shard_id] // tp_size - param_data = param.data - param_data = param_data.narrow(output_dim, shard_offset, - shard_size) - start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size).contiguous() - assert param_data.shape == loaded_weight.shape - if len(loaded_weight.shape) == 2: - param[shard_offset:shard_offset + - shard_size, :] = loaded_weight - else: - param[shard_offset:shard_offset + shard_size] = loaded_weight - else: - assert param.shape == loaded_weight.shape - if loaded_weight.dtype == ms.float32 and param.dtype == ms.float16: - loaded_weight = loaded_weight.astype(ms.float16) - param.set_data(loaded_weight.contiguous()) + + start_idx = tp_rank * shard_size + loaded_weight = split_loaded_weight(loaded_weight, output_dim, + start_idx, shard_size) + if param.name.endswith("weight"): + assert loaded_weight.shape == (shard_size, param.shape[1]) + param[shard_offset:shard_offset + + shard_size] = ms.from_numpy(loaded_weight) class QKVParallelLinear(ColumnParallelLinear): @@ -441,11 +432,6 @@ class QKVParallelLinear(ColumnParallelLinear): loaded_weight, loaded_shard_id: Optional[str] = None): output_dim = getattr(param, "output_dim", None) - if output_dim is None: - if loaded_weight.dtype == ms.float32 and param.dtype == ms.float16: - loaded_weight = loaded_weight.astype(ms.float16) - param.set_data(loaded_weight.contiguous()) - return tp_rank = get_tensor_model_parallel_rank() # QKV loaded weight is already fused on disk (qkv safetensors). @@ -495,13 +481,11 @@ class QKVParallelLinear(ColumnParallelLinear): start_idx, shard_size) loaded_weight = ms.from_numpy(loaded_weight) - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size).contiguous() - assert param_data.shape == loaded_weight.shape - if len(loaded_weight.shape) == 2: - param[shard_offset:shard_offset + shard_size, :] = loaded_weight - else: - param[shard_offset:shard_offset + shard_size] = loaded_weight + if param.name.endswith("weight"): + assert loaded_weight.shape == (shard_size, param.shape[1]) + if param.name.endswith("bias"): + assert loaded_weight.shape == (shard_size, ) + param[shard_offset:shard_offset + shard_size] = loaded_weight class RowParallelLinear(LinearBase): @@ -622,7 +606,6 @@ class RowParallelLinear(LinearBase): def weight_loader(self, param, loaded_weight): tp_rank = get_tensor_model_parallel_rank() - param_data = param.data input_dim = getattr(param, "input_dim", None) shard_size = self.input_size_per_partition start_idx = tp_rank * shard_size diff --git a/vllm_mindspore/model_executor/model_loader/weight_utils.py b/vllm_mindspore/model_executor/model_loader/weight_utils.py index 4e535c01..4a0fdcf8 100644 --- a/vllm_mindspore/model_executor/model_loader/weight_utils.py +++ b/vllm_mindspore/model_executor/model_loader/weight_utils.py @@ -31,7 +31,9 @@ from mindspore import Parameter from safetensors import safe_open from tqdm.auto import tqdm from vllm.config import LoadConfig -from vllm.model_executor.model_loader.weight_utils import (DisabledTqdm, +from vllm.model_executor.model_loader.weight_utils import (_BAR_FORMAT, + DisabledTqdm, + enable_tqdm, get_lock) from vllm_mindspore.model_executor.layers.quantization.base_config import ( @@ -63,6 +65,11 @@ def split_loaded_weight(loaded_weight, shard_dim, start_idx, shard_size): 
loaded_weight = loaded_weight[:, :, start_idx:end_idx] else: raise ValueError("shard_dim:{} is not supported.".format(shard_dim)) + loaded_weight = ( + loaded_weight.astype(np.float16) + if (str(loaded_weight.dtype) == 'bfloat16' and is_310p()) + else loaded_weight + ) return loaded_weight @@ -79,16 +86,30 @@ def safetensors_weights_iterator( ): with safe_open(st_file, framework="np") as f: for name in f.keys(): # noqa: SIM118 +<<<<<<< HEAD # TODO: use slice x = f.get_tensor(name) x = x.astype(np.float16) \ if (str(x.dtype) == 'bfloat16' and atlas_inference()) else x yield name, ms.tensor(x) +======= + # Return a lightweight PySafeSlice object that uses file + # pointer offset internally to read Safetensor on demand, + # avoiding memory explosion. Actual data can be obtained + # through slicing operation like param[start:end] + param = f.get_slice(name) + yield name, param +>>>>>>> 8858529 (fix new branch) def default_weight_loader(param: Parameter, loaded_weight: Any) -> None: """Default weight loader.""" loaded_weight = loaded_weight[:] + loaded_weight = ( + loaded_weight.astype(np.float16) + if (str(loaded_weight.dtype) == 'bfloat16' and is_310p()) + else loaded_weight + ) param.set_data(ms.Tensor(loaded_weight, dtype=param.dtype)) -- Gitee From 3f4b5d6255f08cec22eefd50909a02beb63ecb6d Mon Sep 17 00:00:00 2001 From: HighCloud Date: Tue, 22 Jul 2025 14:36:53 +0800 Subject: [PATCH 06/14] change atlas_inference to is_310p --- vllm_mindspore/distributed/parallel_state.py | 5 +- .../model_loader/weight_utils.py | 10 +- .../models/mf_models/mf_model_base.py | 217 ----------- .../models/mf_models/weight_processor.py | 342 ++++++++++++++++++ .../model_executor/models/model_base.py | 13 +- vllm_mindspore/model_executor/models/qwen2.py | 6 +- vllm_mindspore/utils.py | 2 + vllm_mindspore/v1/worker/gpu_model_runner.py | 4 +- vllm_mindspore/worker/cache_engine.py | 7 +- vllm_mindspore/worker/model_runner.py | 6 +- 10 files changed, 367 insertions(+), 245 deletions(-) delete mode 100644 vllm_mindspore/model_executor/models/mf_models/mf_model_base.py create mode 100644 vllm_mindspore/model_executor/models/mf_models/weight_processor.py diff --git a/vllm_mindspore/distributed/parallel_state.py b/vllm_mindspore/distributed/parallel_state.py index 697196fa..a3ef9fd8 100644 --- a/vllm_mindspore/distributed/parallel_state.py +++ b/vllm_mindspore/distributed/parallel_state.py @@ -4,7 +4,8 @@ from torch.distributed import ProcessGroup from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union) from vllm.distributed.parallel_state import _split_tensor_dict, TensorMetadata -from vllm_mindspore.utils import atlas_inference +from vllm_mindspore.utils import is_310p + def gc_broadcast_tensor_dict( self, @@ -20,7 +21,7 @@ def gc_broadcast_tensor_dict( if (not torch.distributed.is_initialized() or self.world_size == 1): return tensor_dict - if not atlas_inference(): + if not is_310p(): group = self.device_group metadata_group = self.cpu_group assert src < self.world_size, f"Invalid src rank ({src})" diff --git a/vllm_mindspore/model_executor/model_loader/weight_utils.py b/vllm_mindspore/model_executor/model_loader/weight_utils.py index 4a0fdcf8..3642f234 100644 --- a/vllm_mindspore/model_executor/model_loader/weight_utils.py +++ b/vllm_mindspore/model_executor/model_loader/weight_utils.py @@ -39,7 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import (_BAR_FORMAT, from vllm_mindspore.model_executor.layers.quantization.base_config import ( QuantizationConfig) from 
vllm_mindspore.platforms.ascend import ModelConfig -from vllm_mindspore.utils import atlas_inference +from vllm_mindspore.utils import is_310p from vllm.model_executor.model_loader.weight_utils import (_BAR_FORMAT, enable_tqdm) @@ -86,20 +86,12 @@ def safetensors_weights_iterator( ): with safe_open(st_file, framework="np") as f: for name in f.keys(): # noqa: SIM118 -<<<<<<< HEAD - # TODO: use slice - x = f.get_tensor(name) - x = x.astype(np.float16) \ - if (str(x.dtype) == 'bfloat16' and atlas_inference()) else x - yield name, ms.tensor(x) -======= # Return a lightweight PySafeSlice object that uses file # pointer offset internally to read Safetensor on demand, # avoiding memory explosion. Actual data can be obtained # through slicing operation like param[start:end] param = f.get_slice(name) yield name, param ->>>>>>> 8858529 (fix new branch) def default_weight_loader(param: Parameter, loaded_weight: Any) -> None: diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py deleted file mode 100644 index 20969730..00000000 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ /dev/null @@ -1,217 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Copyright 2025 Huawei Technologies Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from abc import abstractmethod -from collections.abc import Iterable -from typing import Optional, Union - -import mindspore as ms -from mindformers.core.context import build_mf_context -from mindformers.core.parallel_config import build_parallel_config -from mindformers.tools.register.config import MindFormerConfig -from mindformers.tools.utils import is_pynative -from mindspore import Tensor, nn -from mindspore.common.api import _pynative_executor -from mindspore.communication import get_rank -from vllm.config import VllmConfig -from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.distributed.parallel_state import get_dp_group -from vllm.forward_context import get_forward_context -from vllm.logger import init_logger -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors - -from vllm_mindspore.model_executor.models.attention_mask import ( - LowerTriangularMask) -from vllm_mindspore.model_executor.models.model_base import MsModelBase -from vllm_mindspore.model_executor.models.utils import is_use_ringmla - -try: - # Need to apply dllm pd patch on vllm to use pd disagg related functions - from vllm.attention.layer import (maybe_save_kv_layer_to_connector, - wait_for_kv_layer_from_connector) - from vllm.distributed.kv_transfer import is_v1_kv_transfer_group - kv_transfer_supported = True -except: # noqa: E722 - kv_transfer_supported = False - -logger = init_logger(__name__) - - -class MfModelBase(MsModelBase): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: - super().__init__(vllm_config=vllm_config, prefix=prefix) - - model_config_path = os.getenv("MINDFORMERS_MODEL_CONFIG") - if model_config_path is None: - raise RuntimeError('For "MindFormers" model backend, environments ' - 'MINDFORMERS_MODEL_CONFIG should be set!') - - self.mf_config = MindFormerConfig(model_config_path) - self.rank_id = get_rank() - self.dp_size = get_dp_group() - - self.kv_transfer_config = vllm_config.kv_transfer_config - build_mf_context(self.mf_config) - build_parallel_config(self.mf_config) - self.mf_config.model.model_config.parallel_config = ( - self.mf_config.parallel_config) - self.mf_config.model.model_config.parallel_config.model_parallel = ( - get_tensor_model_parallel_world_size()) - self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 - self.use_ringmla = is_use_ringmla(vllm_config, self.mf_config) - self.is_chunked = False - self._generate_model_config() - if not hasattr(self, 'mf_model_config'): - raise RuntimeError('mf_model_config not initialized') - self.casual_mask = LowerTriangularMask( - dtype=self.mf_model_config.compute_dtype, - max_model_len=self.model_config.max_model_len) - self.network, self.lm_head = self._create_network() - - affinity_config = self.mf_config.get('context', - {}).get('affinity_cpu_list', {}) - if isinstance(affinity_config, dict): - ms.runtime.set_cpu_affinity(True, affinity_config) - - self._set_dynamic_inputs() - - @property - def ready_lm_head(self) -> nn.Cell: - if self.lm_head is None: - raise RuntimeError("lm_head not initialized") - return self.lm_head - - @abstractmethod - def _generate_model_config(self): - raise NotImplementedError( - "Function _generate_model_config should be Implemented!") - - @abstractmethod - def _create_network(self): - raise NotImplementedError( - "Function _create_network should be Implemented!") - - # DLLM - def is_decoder_task(self) -> bool: - if self.kv_transfer_config is None: - return False - - 
return self.kv_transfer_config.is_kv_consumer - - # DLLM - def is_prefill_task(self) -> bool: - if self.kv_transfer_config is None: - return False - - return self.kv_transfer_config.is_kv_producer - - def _set_dynamic_inputs(self): - self.network.set_dynamic_inputs() - if not hasattr(self, 'mf_model_config'): - raise RuntimeError('mf_model_config not initialized') - dynamic_hidden_states = Tensor( - shape=[None, None], dtype=self.mf_model_config.compute_dtype) - self.ready_lm_head.set_inputs(dynamic_hidden_states) - - def prepare_inputs(self, input_ids, positions): - return self.prepare_base_inputs(input_ids, positions) - - def update_model_inputs(self, model_inputs, **kwargs): - return model_inputs - - # DLLM - def connector_send_kvcache(self): - logger.debug("reached connector_send_kvcache") - _pynative_executor.sync() - forward_context = get_forward_context() - if not hasattr(self, 'mf_model_config'): - raise RuntimeError('mf_model_config not initialized') - for i in range(self.mf_model_config.num_layers): - kv_cache = self.kv_caches[i] - k_cache = kv_cache.kv_cache[forward_context.virtual_engine][0] - v_cache = kv_cache.kv_cache[forward_context.virtual_engine][1] - maybe_save_kv_layer_to_connector(str(i), (k_cache, v_cache)) - - # DLLM - def connector_wait_for_kv_layer(self): - logger.debug("reached connector_wait_for_kv_layer") - if not hasattr(self, 'mf_model_config'): - raise RuntimeError('mf_model_config not initialized') - for i in range(self.mf_model_config.num_layers): - wait_for_kv_layer_from_connector("key." + str(i)) - - def forward(self, - input_ids: Tensor, - positions: Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[Tensor] = None, - **kwargs) -> Union[Tensor, IntermediateTensors]: - model_inputs, is_prefill = self.prepare_inputs(input_ids, positions) - model_inputs = self.update_model_inputs(model_inputs, **kwargs) - - if is_prefill: - self.network.phase = "prefill" - if not self.set_flags or is_pynative(): - self.network.add_flags_custom(is_first_iteration=True) - hidden_states = self.network(**model_inputs) - self.network.phase = "increment" - if not self.set_flags or is_pynative(): - self.network.add_flags_custom(is_first_iteration=False) - self.set_flags = True - if kv_transfer_supported and is_v1_kv_transfer_group(): - self.connector_send_kvcache() - # DLLM - else: - if kv_transfer_supported: - if is_v1_kv_transfer_group() and self.is_prefill_task(): - self.connector_send_kvcache() - - if is_v1_kv_transfer_group() and self.is_decoder_task(): - self.connector_wait_for_kv_layer() - logger.debug("connector_wait_for_kv_layer success") - hidden_states = self.network(**model_inputs) - - return hidden_states - - def compute_logits( - self, - hidden_states: Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[Tensor]: - if sampling_metadata is not None: - selected_token_indices = sampling_metadata.selected_token_indices - if (selected_token_indices is not None - and selected_token_indices.numel() <= 0): - if not hasattr(self, 'mf_model_config'): - raise RuntimeError('mf_model_config not initialized') - logits = ms.mint.zeros( - (0, self.mf_model_config.vocab_size), - dtype=self.mf_model_config.compute_dtype) - else: - hidden_states = hidden_states.index_select( - 0, selected_token_indices) - logits = self.ready_lm_head(hidden_states) - logits = logits.view(-1, logits.shape[-1]) - else: - logits = self.ready_lm_head(hidden_states) - logits = logits.view(-1, logits.shape[-1]) - return logits - - def load_weights(self, 
weights: Iterable[tuple[str, Tensor]]) -> set[str]: - raise NotImplementedError("load_weight not implemented.") diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py new file mode 100644 index 00000000..5036323c --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -0,0 +1,342 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Copyright 2025 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +transform huggingface safetensor. +""" + +import os +from enum import Enum + +from mindformers.parallel_core.inference import parallel_state as ps +from mindspore.communication.management import get_group_size, get_rank +from safetensors import safe_open +from vllm_mindspore.utils import is_310p + +class EPMethod(Enum): + """ + EP method enums + """ + DEFAULT = 'default' + ALLTOALL = 'alltoall' + ALLGATHER = 'allgather' + + +class BaseWeightProcessor: + r""" + Provide model weight load and shards. + Args: + config (MF Config): The config of Infer model. + network (InferenceModelForCausalLM): The network of infer model. + + """ + + def __init__(self, config, network, is_quant, vllm_config): + self.vllm_config = vllm_config + self.is_310p = is_310p() + self.config = config + self.network = network + self.is_quant = is_quant + self.global_rank_id = get_rank() + self.global_group_size = get_group_size() + self.tp_group_size = ps.get_tensor_model_parallel_world_size() + self.dp_group_size = ps.get_data_parallel_world_size() + self.num_router_experts = self.config.moe_config.expert_num if \ + self.config.moe_config.expert_num else 1 + self.moe_ep_size = ps.get_moe_expert_parallel_world_size() + self.moe_tp_size = ps.get_moe_tensor_parallel_world_size() + self.tp_dp_size = ps.get_tensor_and_data_parallel_world_size() + self.ep_method = EPMethod.DEFAULT + if self.dp_group_size > 1\ + and self.moe_ep_size == self.global_group_size: + self.ep_method = EPMethod.ALLTOALL + elif self.dp_group_size > 1: + self.ep_method = EPMethod.ALLGATHER + self.tp_rank_id = ps.get_tensor_model_parallel_rank() + self.tp_dp_rank_id = ps.get_tensor_and_data_parallel_rank() + + self.ep_group_nums = self.num_router_experts // self.moe_ep_size + self.moe_ep_rank_id = ps.get_moe_expert_parallel_rank() + self.moe_tp_rank_id = ps.get_moe_tensor_parallel_rank() + self.ep_start = self.moe_ep_rank_id * self.ep_group_nums + self.ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums + + self.parameter_dict = {} + self.file_handles = {} + + def get_file_handles(self, filename): + if filename not in self.file_handles: + fp = safe_open(filename, framework="np") + self.file_handles[filename] = fp + return self.file_handles[filename] + + def release_file_handles(self): + del self.file_handles + + def get_safetensor_from_file(self, hf_param_name, src_hf_dir, + hf_weight_map): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = 
self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 + + def get_safetensor_from_file_split_tp_group(self, + hf_param_name, + src_hf_dir, + hf_weight_map, + split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + elif split_axis == 2: + split_size = shape[2] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = np_data[:, :, start:stop] + else: + raise ValueError( + "split_axis:{} is not supported.".format(split_axis)) + return split_data, qint4 + + def get_safetensor_from_file_split_tpdp_group(self, + hf_param_name, + src_hf_dir, + hf_weight_map, + split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.tp_dp_size + start = self.tp_dp_rank_id * split_size + stop = (self.tp_dp_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.tp_dp_size + start = self.tp_dp_rank_id * split_size + stop = (self.tp_dp_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + elif split_axis == 2: + split_size = shape[2] // self.tp_dp_size + start = self.tp_dp_rank_id * split_size + stop = (self.tp_dp_rank_id + 1) * split_size + split_data = np_data[:, :, start:stop] + else: + raise ValueError( + "split_axis:{} is not supported.".format(split_axis)) + return split_data, qint4 + + def get_safetensor_from_file_split_global_group(self, + hf_param_name, + src_hf_dir, + hf_weight_map, + split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.global_group_size + start = self.global_rank_id * split_size + stop = (self.global_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.global_group_size + start = self.global_rank_id * split_size + stop = (self.global_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + elif split_axis == 2: + split_size = shape[2] // self.global_group_size + start = self.global_rank_id * split_size + stop = (self.global_rank_id + 1) * split_size + split_data = np_data[:, :, 
start:stop] + else: + raise ValueError( + "split_axis:{} is not supported.".format(split_axis)) + + return split_data, qint4 + + def get_safetensor_from_file_split_moe_tp_group(self, + hf_param_name, + src_hf_dir, + hf_weight_map, + split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + else: + raise ValueError( + "split_axis:{} is not supported.".format(split_axis)) + + return split_data, qint4 + + def get_routed_safetensor_3_dim(self, + hf_param_name, + src_hf_dir, + hf_weight_map, + split_ep=False, + split_tp=False, + tp_axis=-1): + '''get_routed_safetensor_3_dim''' + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + if not split_tp and not split_ep: + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 + + np_data = sf_file.get_slice(hf_param_name) + if not split_tp and split_ep: + split_data = np_data[self.ep_start:self.ep_stop, :, :] + return split_data, qint4 + + shape = np_data.get_shape() + if tp_axis == 1: + split_size = shape[1] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[ + self.ep_start:self.ep_stop, + start:stop, :] if split_ep else np_data[:, start:stop, :] + elif tp_axis == 2: + split_size = shape[2] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[ + self.ep_start:self.ep_stop, :, + start:stop] if split_ep else np_data[:, :, start:stop] + else: + raise ValueError("tp_axis:{} is not supported.".format(tp_axis)) + return split_data, qint4 + + def get_routed_safetensor_2_dim(self, + hf_param_name, + src_hf_dir, + hf_weight_map, + split_ep=False, + split_tp=False, + tp_axis=-1): + '''get_moe_routed_safetensor_2_dim''' + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata( + ) is not None and hf_param_name in sf_file.metadata(): + qint4 = True + if not split_tp and not split_ep: + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 + + np_data = sf_file.get_slice(hf_param_name) + if not split_tp and split_ep: + split_data = np_data[self.ep_start:self.ep_stop, :] + return split_data, qint4 + + shape = np_data.get_shape() + if tp_axis == 1: + split_size = shape[1] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[ + self.ep_start:self.ep_stop, + start:stop] if split_ep else np_data[:, start:stop] + else: + raise ValueError( + "split_tp is True but tp_axis:{} is not supported.".format( + 
tp_axis)) + return split_data, qint4 + + def split_weight_by_rank(self, weight, split_axis=0): + if self.tp_group_size == 1: + return weight + + shape = weight.shape + if split_axis == 0: + split_size = shape[0] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = weight[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = weight[:, start:stop] + else: + raise ValueError( + "split_axis:{} is not supported.".format(split_axis)) + return split_data + + def load_safetensors_shard(self, src_hf_dir): + """ load safetensors and shards """ + raise NotImplementedError( + "load_safetensors_shard method is not implemented.") diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 5fc07fd2..dd28a8c6 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -34,9 +34,9 @@ from vllm_mindspore.model_executor.models.attention_mask import ( LowerTriangularMask) from vllm_mindspore.model_executor.models.utils import is_use_ringmla from vllm_mindspore.model_executor.utils import set_model_context -from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE, create_kv_cache +from vllm_mindspore.utils import FORMAT_TYPE, STR_DTYPE_TO_MS_DTYPE, is_310p, create_kv_cache + from vllm_mindspore.v1.attention.backends.ms_attn import MsAttentionMetadata -from vllm_mindspore.utils import atlas_inference, FORMAT_TYPE class AttentionWrapper: @@ -48,7 +48,7 @@ class AttentionWrapper: head_size = vllm_config.model_config.get_head_size() num_block = 0 - if atlas_inference(): + if is_310p(): self.kv_shape = [num_block, block_size, num_kv_heads * head_size] self.kv_cache = [ ( @@ -98,7 +98,7 @@ class MLAAttentionWrapper(AttentionWrapper): self.use_mla_op = bool( vllm_config.additional_config and vllm_config.additional_config.get('use_mla_op') == 1) - if atlas_inference(): + if is_310p(): self.kv_cache = [ ( ops.auto_generate.format_cast( @@ -481,8 +481,9 @@ class NativeModel(MsModelBase): block_size = self.cache_config.block_size num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() - kv_cache_shape = (None, block_size, num_kv_heads * head_size) if atlas_inference() \ - else (None, block_size, num_kv_heads, head_size) + kv_cache_shape = (None, block_size, num_kv_heads * head_size) \ + if is_310p() else (None, block_size, num_kv_heads, + head_size) kv_cache_dtype = (self.model_config.dtype if self.cache_config.cache_dtype == "auto" else diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index b9ddd507..fa4dd43e 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -47,7 +47,7 @@ from vllm.sequence import IntermediateTensors from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.distributed import get_tensor_model_parallel_rank -from vllm_mindspore.utils import atlas_inference, FORMAT_TYPE +from vllm_mindspore.utils import is_310p, FORMAT_TYPE from vllm_mindspore.attention import Attention from vllm_mindspore.model_executor.layers.activation import SiluAndMul from vllm_mindspore.model_executor.layers.layernorm import RMSNorm @@ -422,7 +422,7 @@ class Qwen2Model(nn.Cell): loaded_params.add(name) def 
adjust_weight(params_dict): - if not atlas_inference(): + if not is_310p(): return target_keywords = [ @@ -439,7 +439,7 @@ class Qwen2Model(nn.Cell): ms.runtime.synchronize() param.set_data(cast_weight) - if atlas_inference(): + if is_310p(): ms.runtime.synchronize() adjust_weight(params_dict) ms.runtime.synchronize() diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index 4da8b795..20aa3878 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -312,6 +312,8 @@ def check_ready(): "MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST": "FlashAttentionScore,PagedAttention", } + if atlas_inference(): + default_env["MS_ENABLE_INTERNAL_BOOST"] = "off" env_setup(default_env) if os.getenv("MS_MEMPOOL_BLOCK_SIZE"): diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index 2ee11fde..56a8a120 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -444,7 +444,7 @@ def _reshape_kv_cache_tensors( kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) - if atlas_inference(): + if is_310p(): *dims, second_last, last = kv_cache_shape kv_cache_shape = (*dims, second_last * last) try: @@ -487,7 +487,7 @@ def _reshape_kv_cache_tensors( cache_block_nz = ops.auto_generate.format_cast( cache_block, 29) kv_cache_layer.append(cache_block_nz) - elif atlas_inference(): + elif is_310p(): from mindspore.common.api import _pynative_executor cache_block_nz = ops.auto_generate.format_cast(cache_block, FORMAT_TYPE['nz']) _pynative_executor.sync() diff --git a/vllm_mindspore/worker/cache_engine.py b/vllm_mindspore/worker/cache_engine.py index 7675379c..e8c20397 100644 --- a/vllm_mindspore/worker/cache_engine.py +++ b/vllm_mindspore/worker/cache_engine.py @@ -26,7 +26,8 @@ from mindspore import mutable, mint, ops from typing import List from vllm.logger import init_logger -from vllm_mindspore.utils import MsKVCache, get_valid_dtype, atlas_inference, FORMAT_TYPE +from vllm_mindspore.utils import (MsKVCache, get_valid_dtype, is_310p, + FORMAT_TYPE) logger = init_logger(__name__) @@ -34,7 +35,7 @@ logger = init_logger(__name__) def create_block(shape, dtype, name=None, device=None): from mindspore.common.api import _pynative_executor blocks = mint.empty(*shape, dtype=dtype, device=device) - if device == "Ascend" and atlas_inference(): + if device == "Ascend" and is_310p(): blocks_nz = ops.auto_generate.format_cast(blocks, FORMAT_TYPE['nz']) _pynative_executor.sync() import gc @@ -53,7 +54,7 @@ def ms_allocate_kv_cache( """Allocates KV cache on the specified device.""" kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size) - if atlas_inference(): + if is_310p(): *dims, second_last, last = kv_cache_shape kv_cache_shape = (*dims, second_last * last) kv_cache: List[MsKVCache] = [] diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index 6ab97c1b..1c37be98 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -28,7 +28,7 @@ from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.sequence import SequenceGroupMetadata -from vllm_mindspore.utils import STR_DTYPE_TO_TENSOR_DTYPE, atlas_inference +from vllm_mindspore.utils import STR_DTYPE_TO_TENSOR_DTYPE, is_310p logger = init_logger(__name__) @@ -140,8 +140,8 @@ def 
_dummy_run(self, block_size = self.cache_config.block_size num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() - kv_shape = [0, block_size, num_kv_heads * head_size] if atlas_inference() else \ - [0, block_size, num_kv_heads, head_size] + kv_shape = [0, block_size, num_kv_heads * head_size] \ + if is_310p() else [0, block_size, num_kv_heads, head_size] kv_caches = mutable([ mutable( ( -- Gitee From 51a9dd20459b45830f4a408fa6caa9b12340ebc1 Mon Sep 17 00:00:00 2001 From: luolihao Date: Thu, 24 Jul 2025 19:08:26 +0800 Subject: [PATCH 07/14] support qwq w8a8sc --- .../layers/quantization/__init__.py | 9 +- .../quantization/sparse_quant_modelslim.py | 182 ++++++++++++++++++ vllm_mindspore/model_executor/models/qwen2.py | 50 ++++- 3 files changed, 238 insertions(+), 3 deletions(-) create mode 100644 vllm_mindspore/model_executor/layers/quantization/sparse_quant_modelslim.py diff --git a/vllm_mindspore/model_executor/layers/quantization/__init__.py b/vllm_mindspore/model_executor/layers/quantization/__init__.py index 6c9e2e41..3c6c2da9 100644 --- a/vllm_mindspore/model_executor/layers/quantization/__init__.py +++ b/vllm_mindspore/model_executor/layers/quantization/__init__.py @@ -21,7 +21,10 @@ from typing import Literal, get_args from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -QuantizationMethods = Literal["smoothquant"] +QuantizationMethods = Literal[ + "smoothquant", + "sparsequant" +] QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods)) # The customized quantization methods which will be added to this dict. @@ -34,8 +37,10 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: # lazy import to avoid triggering `torch.compile` too early from .smooth_quant_modelslim import SmoothQuantModelSlimConfig + from .sparse_quant_modelslim import SparseQuantModelSlimConfig method_to_config: dict[str, type[QuantizationConfig]] = { - "smoothquant": SmoothQuantModelSlimConfig + "smoothquant": SmoothQuantModelSlimConfig, + "sparsequant": SparseQuantModelSlimConfig } # Update the `method_to_config` with customized quantization methods. method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) diff --git a/vllm_mindspore/model_executor/layers/quantization/sparse_quant_modelslim.py b/vllm_mindspore/model_executor/layers/quantization/sparse_quant_modelslim.py new file mode 100644 index 00000000..f6ede5ed --- /dev/null +++ b/vllm_mindspore/model_executor/layers/quantization/sparse_quant_modelslim.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +from typing import Any, Optional, Dict + +import torch +import numpy as np +import mindspore + +from mindspore.common.initializer import initializer +from mindspore import Parameter, ops, Tensor +from mindspore.ops.operations._infer_ops import QuantV2 +from mindspore.communication import get_rank +from vllm_mindspore.model_executor.layers.linear import LinearMethodBase, UnquantizedLinearMethod, LinearBase + +from .base_config import QuantizationConfig + + + +class SparseQuantModelSlimConfig(QuantizationConfig): + '''Config class for SparseQuant.''' + + def __init__( + self, + full_config: Dict[str, Any], + weight_bits: Optional[int] = 8, + group_size: Optional[int] = 1, + zero_point: Optional[bool] = True, + dynamic_quant: Optional[bool] = False, + kv_cache_bits: Optional[int] = 16, + modules_to_not_convert: Optional[list[str]] = None, + ) -> None: + super().__init__() + self.full_config = full_config + self.weight_bits = weight_bits + self.group_size = group_size + self.zero_point = zero_point + self.dynamic_quant = dynamic_quant + self.kv_cache_bits = kv_cache_bits + self.modules_to_not_convert = modules_to_not_convert or [] + + if self.weight_bits != 8: + raise ValueError( + "Currently, only 8-bit weight quantization is supported for " + f"A8W8SC, but got {self.weight_bits} bits.") + self.pack_factor = 8 // self.weight_bits + + def __repr__(self) -> str: + return (f"SparseConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, " + f"zero_point={self.zero_point}, " + f"modules_to_not_convert={self.modules_to_not_convert})") + + @staticmethod + def get_config_filenames() -> list[str]: + return [ + "quant_model_description.json" + ] + + @classmethod + def get_min_capability(cls) -> int: + """Minimum GPU capability to support the quantization method. + + E.g., 70 for Volta, 75 for Turing, 80 for Ampere. + This requirement is due to the custom CUDA kernels used by the + quantization method. 
+ """ + return -1 + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "SparseQuantModelSlimConfig": + return cls(config) + + def get_name(self) -> str: + return "SparseQuant" + + def get_supported_act_dtypes(self) -> list[torch.dtype]: + return [torch.int8, torch.float16, torch.bfloat16] + + def get_quant_method(self, layer: mindspore.nn.Cell, + prefix: str) -> "QuantizeMethodBase": + + rank_id = get_rank() + sparse_quant_description = self.full_config[f'rank_{rank_id}'] + if isinstance(layer, LinearBase) and sparse_quant_description[f"{prefix}.weight"].lower() == "w8a8s": + compress_weight_size = sparse_quant_description[f"{prefix}.weight.shape"] + compress_index_size = sparse_quant_description[f"{prefix}.index.shape"] + + return A8W8SCLinearMethod(self, compress_weight_size[0], compress_index_size[0]) + + return UnquantizedLinearMethod() + + +class A8W8SCLinearMethod(LinearMethodBase): + '''Linear method for A8W8SCLinearMethod.''' + + def __init__(self, quant_config: SparseQuantModelSlimConfig, compress_weight_size=None, compress_index_size=None): + self.quant_config = quant_config + self.compress_weight_size = compress_weight_size + self.compress_index_size = compress_index_size + + self.quant = QuantV2() + self.linear_sparse = ops.auto_generate.QuantLinearSparse() + + def create_weights(self, + layer: mindspore.nn.Cell, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype, + is_group_mm=False, + expert_num_per_partition=1, + **extra_weight_attrs): + if input_size_per_partition % self.quant_config.group_size != 0: + raise ValueError( + "The input size is not aligned with the quantized " + "weight shape. This can be caused by too large " + "tensor parallel size.") + + output_size_per_partition = sum(output_partition_sizes) + self.output_size_per_partition = output_size_per_partition + self.input_size_per_partition = input_size_per_partition + if output_size_per_partition % self.quant_config.pack_factor != 0: + raise ValueError( + "The output size is not aligned with the quantized " + "weight shape. 
This can be caused by too large " + "tensor parallel size.") + + weight = Parameter(initializer('normal', (self.compress_weight_size), mindspore.int8), name="weight") + index = Parameter(initializer('normal', (self.compress_index_size), mindspore.int8), name="index") + deq_scale = Parameter(initializer('normal', (self.output_size_per_partition), mindspore.int64), + name="deq_scale") + quant_bias = Parameter(initializer('zeros', (self.output_size_per_partition), mindspore.int32), + name="quant_bias") + input_scale = Parameter(Tensor(np.ones(self.input_size_per_partition), mindspore.float16), + name="input_scale") + input_offset = Parameter(Tensor(np.zeros(self.input_size_per_partition), mindspore.int8), + name="input_offset") + + layer.insert_param_to_cell("weight", weight) + layer.insert_param_to_cell("index", index) + layer.insert_param_to_cell("deq_scale", deq_scale) + layer.insert_param_to_cell("quant_bias", quant_bias) + layer.insert_param_to_cell("input_scale", input_scale) + layer.insert_param_to_cell("input_offset", input_offset) + + def apply(self, + layer: mindspore.nn.Cell, + x: mindspore.Tensor, + bias: mindspore.Parameter = None, group_list=None, cumsum_flag=False) -> mindspore.Tensor: + weight = layer.weight + index = layer.index + deq_scale = layer.deq_scale + quant_bias = layer.quant_bias + input_scale = layer.input_scale + input_offset = layer.input_offset + + output_shape = x.shape[:-1] + (self.output_size_per_partition,) + x = x.reshape(-1, self.input_size_per_partition) + + x = self.quant(x, input_scale, input_offset, False, "ROUND", mindspore.int8) + x = self.linear_sparse(x, weight, deq_scale, index, quant_bias) + + x = x.reshape(output_shape) + + return x \ No newline at end of file diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index fa4dd43e..eff9be31 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -64,6 +64,7 @@ from vllm_mindspore.model_executor.models.model_base import (NativeModel) from vllm_mindspore.model_executor.models.utils import ( PPMissingLayer, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +from mindspore.communication.management import get_rank class Qwen2MLP(nn.Cell): @@ -365,6 +366,50 @@ class Qwen2Model(nn.Cell): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states + def load_split_weights(self, weights: Iterable[tuple[str, Tensor]], + params_dict: dict[str, Parameter]): + weights_dict = dict(weights) + + for name, loaded_weight in weights_dict.items(): + if get_tensor_model_parallel_rank( + ) > 0 and "o_proj.quant_bias" in name: + continue + + if name not in params_dict: + continue + + param = params_dict[name] + param.set_data(loaded_weight.contiguous()) + + def adjust_weight(params_dict): + if not is_310p(): + return + + target_keywords = [ + "qkv_proj.weight", + "o_proj.weight", + "gate_up_proj.weight", + "down_proj.weight", + # "lm_head.weight", + ] + + rank_id = get_rank() + for name, param in params_dict.items(): + if any(name.endswith(keyword) for keyword in target_keywords): + weight_type = self.quant_config.full_config[f"rank_{rank_id}"][name] + if weight_type.lower() == "w8a8s": + # 压缩后权重不需要转Nz + continue + + cast_weight = ops.auto_generate.format_cast(param, FORMAT_TYPE['nz']) + ms.runtime.synchronize() + param.set_data(cast_weight) + + if is_310p(): + ms.runtime.synchronize() + adjust_weight(params_dict) + ms.runtime.synchronize() + def load_weights(self, weights: 
Iterable[tuple[str, Tensor]], params_dict: dict[str, Parameter]): loaded_params: set[str] = set() @@ -514,7 +559,10 @@ class Qwen2ForCausalLM(NativeModel, SupportsLoRA): def load_weights(self, weights: Iterable[tuple[str, Tensor]]) -> set[str]: params_dict = self.get_params_dict() - self.model.load_weights(weights, params_dict) + if self.vllm_config.model_config.quantization == "sparsequant": + self.model.load_split_weights(weights, params_dict) + else: + self.model.load_weights(weights, params_dict) def compute_logits( self, -- Gitee From 93fb64554ecd426c8b2b8a81d4245f4140e661a6 Mon Sep 17 00:00:00 2001 From: huangzhuo Date: Sat, 23 Aug 2025 09:34:11 +0800 Subject: [PATCH 08/14] graph mode support mutilora --- vllm_mindspore/lora/layers.py | 231 +++++------ vllm_mindspore/lora/models.py | 7 +- .../lora/punica_wrapper/punica_npu.py | 371 ++++++------------ vllm_mindspore/lora/utils.py | 24 +- .../model_executor/model_loader/utils.py | 31 +- 5 files changed, 239 insertions(+), 425 deletions(-) diff --git a/vllm_mindspore/lora/layers.py b/vllm_mindspore/lora/layers.py index d3f4b367..7d565138 100644 --- a/vllm_mindspore/lora/layers.py +++ b/vllm_mindspore/lora/layers.py @@ -24,16 +24,18 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Optional, Union, cast import mindspore as ms -from mindspore import mint +from mindspore import Parameter, ops, mint +from mindspore.common.initializer import initializer +import torch.nn.functional as F from transformers import PretrainedConfig from vllm.adapter_commons.layers import AdapterMapping from vllm.config import LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, - tensor_model_parallel_all_gather, - tensor_model_parallel_all_reduce) + tensor_model_parallel_all_gather) from vllm.distributed.utils import divide +# yapf: enable from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import ( LinearScalingRotaryEmbedding, RotaryEmbedding) @@ -320,49 +322,26 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): self.output_size, self.tp_size)) else: raise NotImplementedError - - self.lora_a_stacked = tuple( - mint.zeros( - ( - max_loras, - 1, - lora_a_out_size, - self.input_size, - ), - dtype=lora_config.lora_dtype, - ) for _ in range(self.n_slices)) - self.lora_b_stacked = tuple( - mint.zeros( - ( - max_loras, - 1, - lora_b_out_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - ) for _ in range(self.n_slices)) + self.lora_a_stacked = Parameter( + initializer('zeros', (max_loras, self.input_size, lora_a_out_size), + lora_config.lora_dtype)) + self.lora_b_stacked = Parameter( + initializer('zeros', (max_loras, lora_a_out_size, lora_b_out_size), + lora_config.lora_dtype)) if lora_config.bias_enabled: lora_bias_out_size = lora_b_out_size - self.lora_bias_stacked = tuple( - mint.zeros( - ( - max_loras, - 1, - lora_bias_out_size, - ), - dtype=lora_config.lora_dtype, - ) for _ in range(self.n_slices)) - self.output_slices = (self.lora_b_stacked[0].shape[2], ) + self.lora_bias_stacked = Parameter( + initializer('zeros', (max_loras, lora_bias_out_size), + lora_config.lora_dtype)) + else: + self.lora_bias_stacked = None def reset_lora(self, index: int): - for s_index in range(self.n_slices): - self.lora_a_stacked[s_index][index] = 0 - self.lora_b_stacked[s_index][index] = 0 - if self.lora_config.bias_enabled: - # Make mypy happy - self.lora_bias_stacked = 
cast(tuple[ms.Tensor, ...], - self.lora_bias_stacked) - self.lora_bias_stacked[s_index][index] = 0 + self.lora_a_stacked[index] = 0 + self.lora_b_stacked[index] = 0 + if self.lora_bias_stacked: + # Make mypy happy + self.lora_bias_stacked[index] = 0 def set_lora( self, @@ -376,8 +355,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers # store weights in a tuple of size 1. These two layers will # override this function. - assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) == - self.n_slices == 1) self.reset_lora(index) if self.tp_size > 1: @@ -385,29 +362,44 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): lora_b = self.slice_lora_b(lora_b) if lora_bias is not None: lora_bias = self.slice_bias(lora_bias) - - self.lora_a_stacked[0][index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[0][index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if lora_bias is not None: - - self.lora_bias_stacked = cast(tuple[ms.Tensor, ...], - self.lora_bias_stacked) - assert len(self.lora_bias_stacked) - self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_( - lora_bias.T, non_blocking=True) + if self.n_slices == 3: + self.lora_a_stacked[index, :, : lora_a[0].shape[1]] = lora_a[0] + self.lora_a_stacked[index, :, lora_a[0].shape[1] : lora_a[0].shape[1] + lora_a[1].shape[1]] = lora_a[1] + self.lora_a_stacked[index, :, lora_a[0].shape[1] + lora_a[1].shape[1] :] = lora_a[2] + self.lora_b_stacked[index, :lora_b[0].shape[0], :lora_b[0].shape[1]] = lora_b[0] + self.lora_b_stacked[index, lora_b[0].shape[0] : lora_b[0].shape[0] + lora_b[1].shape[0], lora_b[0].shape[1] : lora_b[0].shape[1] + lora_b[1].shape[1]] = lora_b[1] + self.lora_b_stacked[index, lora_b[0].shape[0] + lora_b[1].shape[0] :, lora_b[0].shape[1] + lora_b[1].shape[1] :] = lora_b[2] + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + lora_bias = ops.concat(lora_bias, axis=0) + self.lora_bias_stacked[index] = lora_bias + elif self.n_slices == 2: + self.lora_a_stacked[index, :, : lora_a[0].shape[1]] = lora_a[0] + self.lora_a_stacked[index, :, lora_a[0].shape[1]: lora_a[0].shape[1] + lora_a[1].shape[1]] = lora_a[1] + self.lora_b_stacked[index, :lora_b[0].shape[0], :lora_b[0].shape[1]] = lora_b[0] + self.lora_b_stacked[index, lora_b[0].shape[0]: lora_b[0].shape[0] + lora_b[1].shape[0], + lora_b[0].shape[1]: lora_b[0].shape[1] + lora_b[1].shape[1]] = lora_b[1] + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + lora_bias = ops.concat(lora_bias, axis=0) + self.lora_bias_stacked[index] = lora_bias + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + lora_bias = ops.concat(lora_bias, axis=0) + self.lora_bias_stacked[index] = lora_bias + else: + self.lora_a_stacked[index] = lora_a + self.lora_b_stacked[index] = lora_b + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + self.lora_bias_stacked[index] = lora_bias def apply(self, x: ms.Tensor, bias: Optional[ms.Tensor] = None) -> ms.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked, - self.lora_b_stacked, - self.lora_bias_stacked, 1.0, - self.output_slices) + output = self.punica_wrapper(output, x, self.lora_a_stacked, self.lora_b_stacked, + self.lora_bias_stacked, 1.0) return output @@ -540,7 +532,7 @@ class 
MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): model_config: Optional[PretrainedConfig] = None, ) -> None: """ - The main reason for overriding this function is to enhance code + The main reason for overriding this function is to enhance code maintainability. """ self.lora_config = lora_config @@ -548,36 +540,19 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): lora_a_output_size_per_partition = ( lora_config.max_lora_rank if not lora_config.fully_sharded_loras else divide(lora_config.max_lora_rank, self.tp_size)) - self.lora_a_stacked = tuple( - mint.zeros( - ( - max_loras, - 1, - lora_a_output_size_per_partition, - self.input_size, - ), - dtype=lora_config.lora_dtype, - ) for _ in range(self.n_slices)) - self.lora_b_stacked = tuple( - mint.zeros( - ( - max_loras, - 1, - output_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - ) for output_size in self.output_slices) + output_size = sum(self.output_slices) + self.lora_a_stacked = Parameter( + initializer('zeros', (max_loras, self.input_size, lora_a_output_size_per_partition * self.n_slices), + lora_config.lora_dtype)) + self.lora_b_stacked = Parameter( + initializer('zeros', (max_loras, lora_a_output_size_per_partition * self.n_slices, output_size), + lora_config.lora_dtype)) if lora_config.bias_enabled: - self.lora_bias_stacked = tuple( - mint.zeros( - ( - max_loras, - 1, - output_size, - ), - dtype=lora_config.lora_dtype, - ) for output_size in self.output_slices) + self.lora_bias_stacked = Parameter( + initializer('zeros', (max_loras, output_size), + lora_config.lora_dtype)) + else: + self.lora_bias_stacked = None def slice_lora_a( self, lora_a: list[Union[ms.Tensor, @@ -619,26 +594,20 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): lora_b = self.slice_lora_b(lora_b) if lora_bias is not None: lora_bias = self.slice_bias(lora_bias) - + lora_weight_list = [] for i in range(self.n_slices): - if (lora_a_i := lora_a[i]) is not None: - self.lora_a_stacked[i][ - index, 0, :lora_a_i.shape[1], :lora_a_i.shape[0]].copy_( - lora_a_i.T, non_blocking=True) - if (lora_b_i := lora_b[i]) is not None: - self.lora_b_stacked[i][ - index, 0, :lora_b_i.shape[1], :lora_b_i.shape[0]].copy_( - lora_b_i.T, non_blocking=True) + if (lora_a_i := lora_a[i]) is not None and (lora_b_i := lora_b[i]) is not None: + lora_weight_list.append(ops.matmul(lora_a_i.transpose(1, 0), lora_b_i.transpose(1, 0))) + lora_weight = ops.cat(tuple(lora_weight_list), axis=-1) + self.lora_weight[index] = lora_weight + lora_bias_list = [] if lora_bias is not None: - self.lora_bias_stacked = cast(tuple[ms.Tensor, ...], - self.lora_bias_stacked) for i in range(self.n_slices): if (lora_bias_i := lora_bias[i]) is not None: - self.lora_bias_stacked[i][index, - 0, :lora_bias_i.shape[0]].copy_( - lora_bias_i.T, - non_blocking=True) + lora_bias_list.appendd(lora_bias_i) + lora_bias = ops.cat(tuple(lora_weight_list), axis=0) + self.lora_bias_stacked[index] = lora_bias @classmethod @_not_fully_sharded_can_replace @@ -757,18 +726,6 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA): self.kv_shard_id, ) - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - """ - The main reason for overloading this function is to handle inconsistent - weight dimensions in qkv lora. 
- """ - super().create_lora_weights(max_loras, lora_config, model_config) - @classmethod @_not_fully_sharded_can_replace def can_replace_layer( @@ -836,7 +793,7 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): # Matrix multiply. output_parallel = self.apply(input_parallel) if self.base_layer.reduce_results and self.base_layer.tp_size > 1: - output_ = tensor_model_parallel_all_reduce(output_parallel) + output_ = self.base_layer.tensor_model_parallel_all_reduce(output_parallel) else: output_ = output_parallel @@ -1012,24 +969,22 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): return None if self.sharded_to_full_mapping_gpu is not None: - """ - Reindex full logits tensor to ensure 1:1 mapping between - index and token_id - Example for: - org_vocab_size = 4 - added_vocab_size = 2 - pad_to_size = 8 - tp_size = 2 - - indices: [0, 1, 2, 3, 4, 5, 6, 7] - token_id: [0, 1, 4, -1, 2, 3, 5, -1] - - Therefore, the mapping is expected to be: - [0, 1, 4, 6, 2, 3, 5, 7] so that when we reindex, - we get: - indices: [0, 1, 2, 3, 4, 5, 6, 7] - token_id: [0, 1, 2, 3, 4, 5, -1, -1] - """ + # Reindex full logits tensor to ensure 1:1 mapping between + # index and token_id + # Example for: + # org_vocab_size = 4 + # added_vocab_size = 2 + # pad_to_size = 8 + # tp_size = 2 + + # indices: [0, 1, 2, 3, 4, 5, 6, 7] + # token_id: [0, 1, 4, -1, 2, 3, 5, -1] + + # Therefore, the mapping is expected to be: + # [0, 1, 4, 6, 2, 3, 5, 7] so that when we reindex, + # we get: + # indices: [0, 1, 2, 3, 4, 5, 6, 7] + # token_id: [0, 1, 2, 3, 4, 5, -1, -1] logits = logits[:, self.sharded_to_full_mapping_gpu] lora_logits = mint.empty( diff --git a/vllm_mindspore/lora/models.py b/vllm_mindspore/lora/models.py index 621f609a..253242b0 100644 --- a/vllm_mindspore/lora/models.py +++ b/vllm_mindspore/lora/models.py @@ -20,6 +20,7 @@ """Models for Multi-LoRA.""" import os +import numpy as np from typing import Optional, Union import mindspore as ms @@ -33,6 +34,7 @@ from vllm.model_executor.models.utils import WeightsMapper from vllm.utils import is_pin_memory_available from vllm_mindspore.lora.layers import BaseLayerWithLoRA +from vllm_mindspore.utils import is_310p _GLOBAL_LORA_ID = 0 @@ -197,7 +199,10 @@ def from_local_checkpoint( check_unexpected_modules(f) for module in f.keys(): # noqa # vllm-mindspore add numpy to tensor - tensors[module] = mint.Tensor(f.get_tensor(module)) + np_data = f.get_tensor(module) + if is_310p() and str(np_data.dtype) == "bfloat16": + np_data = np_data.astype(np.float32).astype(np.float16) + tensors[module] = mint.Tensor(np_data) elif os.path.isfile(lora_bin_file_path): # When a bin file is provided, we rely on config to find unexpected # modules. 
diff --git a/vllm_mindspore/lora/punica_wrapper/punica_npu.py b/vllm_mindspore/lora/punica_wrapper/punica_npu.py index 0a60baf2..d9295f66 100644 --- a/vllm_mindspore/lora/punica_wrapper/punica_npu.py +++ b/vllm_mindspore/lora/punica_wrapper/punica_npu.py @@ -19,215 +19,110 @@ # isort: skip_file """Punica wrapper for NPU.""" -from typing import Callable +from typing import Callable, Optional -from mindspore import mint +from mindspore import mint, nn, Parameter, ops, dtype from mindspore.common import dtype as mstype +from mindspore.common.initializer import initializer +from mindspore.ops.auto_generate import grouped_matmul_v4 from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase from vllm_mindspore.lora.ops.torch_ops.lora_ops import ( bgmv_expand, bgmv_expand_slice, bgmv_shrink, sgmv_expand, - sgmv_expand_slice, sgmv_shrink) + sgmv_expand_slice, sgmv_shrink, sort_lora_by_token_count, einsum_ms) +from vllm_mindspore.model_executor.utils import get_model_context # The platforms that are compatible with the PyTorch-native implementation can # inherit this class -class PunicaWrapperNPU(PunicaWrapperBase): +class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): """ - PunicaWrapperNPU is designed to manage and provide metadata for the punica - kernel. The main function is to maintain the state information for + PunicaWrapperAtlas is designed to manage and provide metadata for the punica + kernel. The main function is to maintain the state information for Multi-LoRA, and to provide the interface for the pytorch punica ops. """ def __init__(self, max_num_batched_tokens, max_batches, device, **kwargs): + nn.Cell.__init__(self) PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device) - - def _shrink_prefill( - self, - y, - x, - w_t_all, - scale, - ): - sgmv_shrink( # type: ignore - x, - w_t_all, - y, - *self.prefill_metadata, - scale, - ) - - def _shrink_decode( - self, - y, - x, - w_t_all, - scale, - ): - bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) - - def _expand_prefill( - self, - y, - x, - w_t_all, - add_inputs, - ): - sgmv_expand( # type: ignore - x, - w_t_all, - y, - *self.prefill_metadata, - add_inputs, - ) - - def _expand_decode( - self, - y, - x, - w_t_all, - add_inputs, - ): - bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_inputs) - - def _expand_slice_prefill( - self, - y, - x, - w_t_all, - y_offset, - y_slice_size, - add_inputs, - ): - sgmv_expand_slice( # type: ignore - x, - w_t_all, - y, - *self.prefill_metadata, - y_offset, - y_slice_size, - add_inputs, - ) - - def _expand_slice_decode( - self, - y, - x, - w_t_all, - y_offset, - y_slice_size, - add_inputs, - ): - bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, - y_slice_size, add_inputs) - - def _apply_expand( - self, - y, - x, - w_t_all, - y_offset, - y_slice_size, - add_inputs, - ): - """ - Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` - computation, which is suitable for the - GEMM of lora'b. - """ - - expand_slice_fun: Callable = (self._expand_slice_prefill - if self.is_prefill else - self._expand_slice_decode) - expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_inputs) - - def _apply_shrink(self, y, x, w_t_all, scale): - """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the - GEMM of lora'a. - When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `_shrink_prefill` function should be called. 
- Otherwise, it is the decode stage, and the _shrink_decode function - should be called. - """ - y_org = y - y = y.view(-1, y.shape[-1]) - shrink_fun: Callable = (self._shrink_prefill - if self.is_prefill else self._shrink_decode) - shrink_fun(y, x, w_t_all, scale) - y.view_as(y_org) - - def add_shrink(self, y, x, lora_a_stacked, scale, **kwargs): - """ - Performs GEMM for multiple slices of lora_a. - When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `_shrink_prefill` function should be called. - Otherwise, it is the decode stage, and the _shrink_decode function - should be called. - - Semantics: - for i in range(len(lora_a_stacked)): - y[i] += (x @ lora_a_stacked[i]) * scale - - Args: - y (Union[Tuple[ms.Tensor, ...], ms.Tensor]): Output tensors - x (ms.Tensor): Input tensor - lora_a_stacked (Tuple[ms.Tensor, ...]): lora_a's weights - scale (float): Scaling factor for the operation - """ - - x = x.view(-1, x.shape[-1]) - # TODO fuse these kernels - for slice_idx in range(len(lora_a_stacked)): - self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], - scale) - - def add_expand(self, - y, - x, - lora_b_stacked, - lora_bias_stacked, - output_slices, - offset_start=0, - add_inputs=True, - **kwargs) -> None: - """ - Performs GEMM and bias addition for multiple slices of lora_b. - - Semantics: - for i in range(len(lora_b_stacked)): - slice = output_slices[i] - y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + - lora_bias_stacked[i] - offset += slice - - Args: - y (ms.Tensor): Output tensor. - x (Union[Tuple[ms.Tensor, ...], ms.Tensor]): Input tensors - lora_b_stacked (Tuple[ms.Tensor, ...]): lora_b's weight - lora_bias_stacked (Optional[Tuple[ms.Tensor, ...]]): - bias's weight - output_slices (Tuple[int, ...]): Every slice's size - add_inputs (bool): Defaults to True. 
- """ - y_org = y - y = y.view(-1, y.shape[-1]) - offset_left = offset_start - if lora_bias_stacked is not None: - self._apply_bias(self.token_lora_indices, y, output_slices, - lora_bias_stacked) - for slice_idx in range(len(lora_b_stacked)): - self._apply_expand( - y, - x[slice_idx], - lora_b_stacked[slice_idx], - offset_left, - output_slices[slice_idx], - add_inputs=add_inputs, - ) - offset_left += output_slices[slice_idx] - y.view_as(y_org) + self.max_loras = kwargs["max_loras"] + self.group_list = Parameter(initializer("ones", self.max_loras, dtype.int64), name="group_list") + self.lora_indices = Parameter(initializer("ones", self.max_loras, dtype.int64), name="lora_indices") + + def sgmv_shrink(self, + inputs, + lora_a_weights, + group_list, + scaling, + ): + outputs = grouped_matmul_v4([inputs], [lora_a_weights], + group_list=group_list, + split_item=3, + group_type=0, + group_list_type=1)[0] + return outputs * scaling + + def bgmv_shrink(self, + inputs, + lora_a_weights, + lora_indices_tensor, + scaling=1.0): + selected_loras = lora_a_weights[lora_indices_tensor] + inputs = inputs.astype(lora_a_weights[0].dtype) + selected_loras = selected_loras.squeeze(1) + outputs = einsum_ms(inputs, selected_loras) + return scaling * outputs + + def sgmv_expand_slice(self, + inputs, + lora_b_weights, + group_list + ): + outputs = grouped_matmul_v4([inputs], [lora_b_weights], + group_list=group_list, + split_item=3, + group_type=0, + group_list_type=1)[0] + return outputs + + def bgmv_expand_slice(self, + inputs, + lora_b_weights, + lora_indices_tensor): + selected_loras = lora_b_weights[lora_indices_tensor] + inputs = inputs.astype(lora_b_weights[0].dtype) + selected_loras = selected_loras.squeeze(1) + outputs = einsum_ms(inputs, selected_loras) + return outputs + + def update_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: list[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, + **kwargs): + self._update_base_metadata(mapping, lora_index_to_id, max_loras, + vocab_size, extra_vocab_size, + long_lora_context) + if mapping.is_prefill: + # Update metadata required for prefill-related operators. + self._update_prefill_metadata(self.token_lora_indices) + self.is_prefill = True + else: + self.is_prefill = False + _, seq_len, lora_indices, _, _, _ = self.prefill_metadata + sorted_ids, sorted_counts = sort_lora_by_token_count( + lora_indices, seq_len) + group_list = sorted_counts + if len(group_list) < self.max_loras: + new_tensor = mint.zeros(self.max_loras, dtype=group_list.dtype) + new_tensor[:group_list.size(0)] = group_list + group_list = new_tensor + self.group_list.set_data(group_list.astype(dtype.int64)) def add_lora_embedding(self, y, @@ -247,7 +142,7 @@ class PunicaWrapperNPU(PunicaWrapperBase): lora_b_stacked (ms.Tensor): lora_b's weights. add_inputs (bool): Default to True. """ - #No LoRA request, so return directly + # No LoRA request, so return directly if self.no_lora: return # Embedding layer only need expand op @@ -255,65 +150,6 @@ class PunicaWrapperNPU(PunicaWrapperBase): if self.is_prefill else self._expand_decode) expand_fun(y, x, lora_b_stacked, add_inputs) - def add_lora_linear(self, - y, - x, - lora_a_stacked, - lora_b_stacked, - lora_bias_stacked, - scale, - output_slices, - *, - buffer=None, - **kwargs) -> None: - """ - Applicable to linear-related lora. 
- - Semantics: - for i in range(len(lora_a_stacked)): - y[i] += ( - x[i].unsqueeze(0) - @ lora_a_stacked[indices[i], layer_idx, :, :] - @ lora_b_stacked[indices[i], layer_idx, :, :] - * scale - ).squeeze(0)+lora_bias_stacked[i] - - Args: - y (ms.Tensor): Output tensor. Will be changed in-place. - x (ms.Tensor): Input tensor - lora_a_stacked (Tuple[ms.Tensor, ...]): lora_a's weight. - lora_b_stacked (Tuple[ms.Tensor, ...]): lora_b's weight. - lora_bias_stacked (Optional[Tuple[ms.Tensor, ...]]): lora's bias. - scale (float): Scaling factor. - output_slices (Tuple[int, ...]): Every slice's size. - buffer (Optional[Tuple[ms.Tensor, ...]]): Defaults to None. - """ - #No LoRA request, so return directly - if self.no_lora: - return - x = x.reshape(-1, x.shape[-1]) - assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) - if lora_bias_stacked is not None: - assert len(lora_bias_stacked) == len(output_slices) - y = self._apply_bias(self.token_lora_indices, y, output_slices, - lora_bias_stacked) - - if buffer is None: - r = lora_b_stacked[0].shape[-1] - # We set the buffer to be float32 by default, consistent with the - # triton op - buffer = tuple( - mint.zeros((x.shape[0], r), dtype=mstype.float32) - for _ in range(len(output_slices))) - self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs) - self.add_expand(y, - buffer, - lora_b_stacked, - None, - output_slices, - add_inputs=True, - **kwargs) - def add_lora_logits(self, y, x, @@ -325,7 +161,7 @@ class PunicaWrapperNPU(PunicaWrapperBase): **kwargs) -> None: """ Applies lora specifically for LogitsProcessorWithLoRA. - + Semantics: buffer = (x @ lora_a_stacked) * scale y += buffer @ lora_b_stacked @@ -338,7 +174,7 @@ class PunicaWrapperNPU(PunicaWrapperBase): scale (float): Scaling factor. buffer (Optional[ms.Tensor]):Default to None. 
""" - #No LoRA request, so return directly + # No LoRA request, so return directly if self.no_lora: return y_org = y @@ -357,3 +193,30 @@ class PunicaWrapperNPU(PunicaWrapperBase): self.sampler_indices, add_inputs=True) y.view_as(y_org) + + def construct(self, + y, + x, + lora_a_stacked, + lora_b_stacked, + lora_bias_stacked, + scale, + **kwargs): + if self.no_lora: + return + x = x.reshape(-1, x.shape[-1]) + orign_shape = y.shape + y = y.reshape(-1, y.shape[-1]) + if lora_bias_stacked is not None: + selected_loras_bias = lora_bias_stacked[self.token_lora_indices] + y = ops.add(y, selected_loras_bias) + _, seq_len, lora_indices, _, _, _ = self.prefill_metadata + if get_model_context("is_prefill"): + outputs = self.sgmv_shrink(x, lora_a_stacked, self.group_list, scale) + outputs = self.sgmv_expand_slice(outputs, lora_a_stacked, lora_indices) + else: + outputs = self.bgmv_shrink(x, lora_b_stacked, self.group_list, scale) + outputs = self.bgmv_expand_slice(outputs, lora_a_stacked, lora_indices) + outputs = ops.add(y, outputs) + outputs = outputs.reshape(orign_shape) + return outputs diff --git a/vllm_mindspore/lora/utils.py b/vllm_mindspore/lora/utils.py index d9157467..53cc6d41 100644 --- a/vllm_mindspore/lora/utils.py +++ b/vllm_mindspore/lora/utils.py @@ -22,7 +22,7 @@ from vllm.lora.fully_sharded_layers import ( RowParallelLinearWithShardedLoRA) # yapf conflicts with isort for this block -# yapf: disable # noqa: ERA001 +# yapf: disable from vllm_mindspore.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, LinearScalingRotaryEmbeddingWithLoRA, @@ -32,8 +32,9 @@ from vllm_mindspore.lora.layers import (BaseLayerWithLoRA, QKVParallelLinearWithLoRA, RowParallelLinearWithLoRA, VocabParallelEmbeddingWithLoRA) +from vllm_mindspore.model_executor.layers.quantization.sparse_quant_modelslim import A8W8SCLinearMethod -# yapf: enable # noqa: ERA001 +# yapf: enable _all_lora_classes: set[type[BaseLayerWithLoRA]] = { VocabParallelEmbeddingWithLoRA, @@ -50,3 +51,22 @@ _all_lora_classes: set[type[BaseLayerWithLoRA]] = { RowParallelLinearWithShardedLoRA, LinearScalingRotaryEmbeddingWithLoRA, } + +def replace_submodule(model, module_name, new_module): + """Replace a submodule in a model with a new module.""" + parent = model.get_submodule(".".join(module_name.split(".")[:-1])) + target_name = module_name.split(".")[-1] + setattr(parent, target_name, new_module) + new_module.base_layer.weight.name = module_name + ".weight" + new_module.lora_a_stacked.name = module_name + ".lora_a_weight" + new_module.lora_b_stacked.name = module_name + ".lora_b_weight" + if new_module.base_layer.bias is not None: + new_module.base_layer.bias.name = module_name + ".bias" + #new_module.lora_bias_stacked.name = module_name + ".lora_bias" + if isinstance(new_module.base_layer.quant_method, A8W8SCLinearMethod): + new_module.base_layer.index.name = module_name + ".index" + new_module.base_layer.input_scale.name = module_name + ".input_scale" + new_module.base_layer.input_offset.name = module_name + ".input_offset" + new_module.base_layer.deq_scale.name = module_name + ".deq_scale" + new_module.base_layer.quant_bias.name = module_name + ".quant_bias" + return new_module diff --git a/vllm_mindspore/model_executor/model_loader/utils.py b/vllm_mindspore/model_executor/model_loader/utils.py index c8b6bc16..955e3f62 100644 --- a/vllm_mindspore/model_executor/model_loader/utils.py +++ b/vllm_mindspore/model_executor/model_loader/utils.py @@ -193,36 +193,7 @@ def get_ms_model_architecture( return model_cls, arch - -def 
convert_uint64_to_fp32(arr: np.ndarray): - arr_fp32 = arr.view(np.float32) - output = arr_fp32[:, :, 0::2] - return output - - -def np_int4data_pack_to_int8_3d(np_data): - np_data = np_data.astype(np.int8) - np_data &= 0x000F - np_data[::, ::, 0::2] <<= 0 - np_data[::, ::, 1::2] <<= 4 - np_int4_data = np_data[::, ::, 0::2] | np_data[::, ::, 1::2] - return np_int4_data - - -def unpack_int8_to_int4_3d(packed_data): - low_nibbles = (packed_data & 0x0F).astype(np.uint8) - high_nibbles = ((packed_data >> 4) & 0x0F).astype(np.uint8) - - unpacked = np.empty((*packed_data.shape[:2], packed_data.shape[2] * 2), - dtype=np.uint8) - unpacked[..., 0::2] = low_nibbles - unpacked[..., 1::2] = high_nibbles - - return unpacked - - -def process_weights_after_loading(model: nn.Module, model_config: ModelConfig, - target_device: torch.device) -> None: +def process_weights_after_loading(model, model_config) -> None: for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) if isinstance(quant_method, QuantizeMethodBase): -- Gitee From 0a287512604db05746c5c517134329bc88a58272 Mon Sep 17 00:00:00 2001 From: luolihao Date: Wed, 13 Aug 2025 14:55:00 +0800 Subject: [PATCH 09/14] fix bug and support lm_head to Nz --- .../distributed/communication_op.py | 29 ++++- .../model_executor/layers/logits_processor.py | 121 ++++++++++++++---- vllm_mindspore/model_executor/models/qwen2.py | 8 +- 3 files changed, 124 insertions(+), 34 deletions(-) diff --git a/vllm_mindspore/distributed/communication_op.py b/vllm_mindspore/distributed/communication_op.py index 475a282d..604703dd 100644 --- a/vllm_mindspore/distributed/communication_op.py +++ b/vllm_mindspore/distributed/communication_op.py @@ -24,9 +24,10 @@ Implement a unified communication interface for both graph and pynative mode. from typing import Any, Dict, Optional, Union import torch -from mindspore import nn, ops +from mindspore import Tensor, mint, nn, ops from vllm.distributed.parallel_state import ( - get_tensor_model_parallel_world_size, get_tp_group) + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, + get_tp_group) def cpu_broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, @@ -74,3 +75,27 @@ class AllGatherFromModelParallelRegion(nn.Cell): output = self.all_gather_into_tensor(input_) output = ops.swapaxes(output, 0, -1) return output + + +class GatherFromModelParallelRegion(nn.Cell): + "Gather the input from model parallel region and concatenate." + + def __init__(self): + super().__init__() + self.world_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + if self.world_size > 1: + self.tp_group = get_tp_group().device_group._name + + def construct(self, + input_: Tensor, + dst: int = 0, + dim: int = -1) -> Optional[Tensor]: + # Size and dimension. 
+ if self.world_size == 1: + return input_ + output = ops.CollectiveGather(dest_rank=dst, + group=self.tp_group)(mint.transpose(input_, 0, dim)) + if self.tp_rank != dst: + return None + return mint.transpose(output, 0, dim) \ No newline at end of file diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index ee8c8edc..6910804a 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -23,12 +23,14 @@ from concurrent.futures import ThreadPoolExecutor from typing import Optional import vllm.envs as envs -from mindspore import Tensor, mint, nn -from vllm.config import current_platform +from mindspore import Tensor, jit, mint, nn +from vllm.config import current_platform, get_current_vllm_config from vllm.distributed import (tensor_model_parallel_all_gather, tensor_model_parallel_gather) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm_mindspore.distributed.communication_op import ( + AllGatherFromModelParallelRegion, GatherFromModelParallelRegion) from vllm_mindspore.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) @@ -60,6 +62,9 @@ class LogitsProcessor(nn.Cell): scale: A scaling factor to apply to the logits. """ super().__init__() + vllm_config = get_current_vllm_config() + self.vllm_config = vllm_config + self.is_graph_mode = bool(not vllm_config.model_config.enforce_eager) self.scale = scale self.vocab_size = vocab_size # Whether the input is logits (default is hidden states). @@ -71,25 +76,101 @@ class LogitsProcessor(nn.Cell): # Whether to use gather or all-gather to gather the logits. self.use_all_gather = current_platform.use_all_gather() + if self.use_all_gather: + self.tensor_model_parallel_all_gather = AllGatherFromModelParallelRegion() + else: + self.tensor_model_parallel_gather = GatherFromModelParallelRegion() + self.lm_head = None + self.run_model = None + self.cached_input_info = {} + + def set_dynamic_inputs(self): + dyn_hidden_states = Tensor(shape=[None, None], + dtype=self.vllm_config.model_config.dtype) + + if self.cached_input_info["indices"] is None: + dyn_indices = None + else: + dyn_indices_shape = [ + None for _ in range(self.cached_input_info["indices"]["ndim"]) + ] + dyn_indices_dtype = self.cached_input_info["indices"]["dtype"] + dyn_indices = Tensor(shape=dyn_indices_shape, + dtype=dyn_indices_dtype) + + if self.cached_input_info["bias"] is None: + dyn_bias = None + else: + dyn_bias_shape = [ + None for _ in range(self.cached_input_info["bias"]["ndim"]) + ] + dyn_bias_dtype = self.cached_input_info["bias"]["dtype"] + dyn_bias = Tensor(shape=dyn_bias_shape, dtype=dyn_bias_dtype) + + self.set_inputs(dyn_hidden_states, dyn_indices, dyn_bias) + + def __call__( + self, + lm_head: VocabParallelEmbedding, + hidden_states: Tensor, + sampling_metadata: Optional[SamplingMetadata] = None, + embedding_bias: Optional[Tensor] = None, + ) -> Optional[Tensor]: + if self.lm_head is None: + self.lm_head = lm_head + if self.run_model is None: + self.run_model = jit( + function=self.construct, + jit_level='O0') if self.is_graph_mode else self.construct + selected_token_indices = None + if sampling_metadata is not None: + selected_token_indices = sampling_metadata.selected_token_indices + dyn_indices_info = None if selected_token_indices is None else { + "ndim": selected_token_indices.ndim, + "dtype": selected_token_indices.dtype, + } + dyn_bias_info = None if 
embedding_bias is None else { + "ndim": embedding_bias.ndim, + "dtype": embedding_bias.dtype, + } + if self.cached_input_info != {"indices": dyn_indices_info, + "bias": dyn_bias_info}: + self.cached_input_info = { + "indices": dyn_indices_info, + "bias": dyn_bias_info, + } + self.set_dynamic_inputs() + + logits = self.run_model( + hidden_states, + selected_token_indices, + embedding_bias + ) + + if sampling_metadata is not None and \ + sampling_metadata.seq_groups is not None: + logits = _apply_logits_processors(logits, sampling_metadata) + + return logits + def construct( self, - lm_head: VocabParallelEmbedding, hidden_states: Tensor, - sampling_metadata: Optional[SamplingMetadata] = None, + selected_token_indices: Optional[Tensor] = None, embedding_bias: Optional[Tensor] = None, ) -> Optional[Tensor]: if self.logits_as_input: logits = hidden_states else: - if sampling_metadata is not None: - if sampling_metadata.selected_token_indices.numel() <= 0: - return mint.zeros((0, self.vocab_size), - dtype=hidden_states.dtype) - hidden_states = _prune_hidden_states(hidden_states, - sampling_metadata) + if selected_token_indices is not None: + if selected_token_indices.numel() <= 0: + return mint.zeros((0, self.vocab_size), dtype=hidden_states.dtype) + hidden_states = mint.index_select( + hidden_states, 0, selected_token_indices) # Get the logits for the next tokens. - logits = self._get_logits(hidden_states, lm_head, embedding_bias) + logits = self._get_logits( + hidden_states, self.lm_head, embedding_bias) if logits is not None: if self.soft_cap is not None: logits = logits / self.soft_cap @@ -100,9 +181,6 @@ class LogitsProcessor(nn.Cell): logits *= self.scale # Apply logits processors (if any). - if sampling_metadata is not None and \ - sampling_metadata.seq_groups is not None: - logits = _apply_logits_processors(logits, sampling_metadata) return logits @@ -118,10 +196,10 @@ class LogitsProcessor(nn.Cell): bias=embedding_bias) if self.use_all_gather: # Gather is not supported for some devices such as NPUs. - logits = tensor_model_parallel_all_gather(logits) + logits = self.tensor_model_parallel_all_gather(logits) else: # None may be returned for rank > 0 - logits = tensor_model_parallel_gather(logits) + logits = self.tensor_model_parallel_gather(logits) # Remove paddings in vocab (if any). 
if logits is not None: logits = logits[..., :self.org_vocab_size] @@ -134,17 +212,6 @@ class LogitsProcessor(nn.Cell): return s -def _prune_hidden_states( - hidden_states: Tensor, - sampling_metadata: SamplingMetadata, -) -> Tensor: - indices = sampling_metadata.selected_token_indices - if indices is not None and indices.numel() > 0: - return mint.index_select(hidden_states, 0, - sampling_metadata.selected_token_indices) - return hidden_states - - def _apply_logits_processors( logits: Tensor, sampling_metadata: SamplingMetadata, diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index eff9be31..da4a2238 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -379,6 +379,7 @@ class Qwen2Model(nn.Cell): continue param = params_dict[name] + loaded_weight = ms.Tensor(loaded_weight[:], dtype=param.dtype) param.set_data(loaded_weight.contiguous()) def adjust_weight(params_dict): @@ -390,7 +391,7 @@ class Qwen2Model(nn.Cell): "o_proj.weight", "gate_up_proj.weight", "down_proj.weight", - # "lm_head.weight", + "lm_head.weight", ] rank_id = get_rank() @@ -460,9 +461,6 @@ class Qwen2Model(nn.Cell): param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - # Norm type in weights may be f32 - if(loaded_weight.dtype != param.dtype): - loaded_weight = loaded_weight.to(dtype=param.dtype) weight_loader(param, loaded_weight) loaded_params.add(name) @@ -475,7 +473,7 @@ class Qwen2Model(nn.Cell): "o_proj.weight", "gate_up_proj.weight", "down_proj.weight", - # "lm_head.weight", + "lm_head.weight", ] for name, param in params_dict.items(): -- Gitee From 3414ad00c1564f6dd1b0a56dfa29795e430641d2 Mon Sep 17 00:00:00 2001 From: huangzhuo Date: Sat, 23 Aug 2025 10:00:32 +0800 Subject: [PATCH 10/14] fix conflict --- vllm_mindspore/__init__.py | 3 +- vllm_mindspore/lora/layers.py | 91 ++++++++++--------- .../lora/punica_wrapper/punica_npu.py | 13 +-- .../model_executor/model_loader/utils.py | 2 +- .../model_executor/models/model_base.py | 5 +- vllm_mindspore/v1/worker/gpu_model_runner.py | 10 +- 6 files changed, 60 insertions(+), 64 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 7073717d..1f404e28 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -125,10 +125,11 @@ vllm.utils.memory_profiling = ms_memory_profiling import vllm.lora.utils from vllm_mindspore.model_executor.layers.linear import LinearBase -from vllm_mindspore.lora.utils import _all_lora_classes +from vllm_mindspore.lora.utils import _all_lora_classes, replace_submodule vllm.lora.utils._all_lora_classes = _all_lora_classes vllm.lora.utils.LinearBase = LinearBase +vllm.lora.utils.replace_submodule = replace_submodule import vllm.lora.models from vllm_mindspore.lora.models import ( diff --git a/vllm_mindspore/lora/layers.py b/vllm_mindspore/lora/layers.py index 7d565138..f5749280 100644 --- a/vllm_mindspore/lora/layers.py +++ b/vllm_mindspore/lora/layers.py @@ -362,37 +362,11 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): lora_b = self.slice_lora_b(lora_b) if lora_bias is not None: lora_bias = self.slice_bias(lora_bias) - if self.n_slices == 3: - self.lora_a_stacked[index, :, : lora_a[0].shape[1]] = lora_a[0] - self.lora_a_stacked[index, :, lora_a[0].shape[1] : lora_a[0].shape[1] + lora_a[1].shape[1]] = lora_a[1] - self.lora_a_stacked[index, :, lora_a[0].shape[1] + lora_a[1].shape[1] :] = lora_a[2] - self.lora_b_stacked[index, 
:lora_b[0].shape[0], :lora_b[0].shape[1]] = lora_b[0] - self.lora_b_stacked[index, lora_b[0].shape[0] : lora_b[0].shape[0] + lora_b[1].shape[0], lora_b[0].shape[1] : lora_b[0].shape[1] + lora_b[1].shape[1]] = lora_b[1] - self.lora_b_stacked[index, lora_b[0].shape[0] + lora_b[1].shape[0] :, lora_b[0].shape[1] + lora_b[1].shape[1] :] = lora_b[2] - if self.lora_bias_stacked is not None: - assert len(self.lora_bias_stacked) - lora_bias = ops.concat(lora_bias, axis=0) - self.lora_bias_stacked[index] = lora_bias - elif self.n_slices == 2: - self.lora_a_stacked[index, :, : lora_a[0].shape[1]] = lora_a[0] - self.lora_a_stacked[index, :, lora_a[0].shape[1]: lora_a[0].shape[1] + lora_a[1].shape[1]] = lora_a[1] - self.lora_b_stacked[index, :lora_b[0].shape[0], :lora_b[0].shape[1]] = lora_b[0] - self.lora_b_stacked[index, lora_b[0].shape[0]: lora_b[0].shape[0] + lora_b[1].shape[0], - lora_b[0].shape[1]: lora_b[0].shape[1] + lora_b[1].shape[1]] = lora_b[1] - if self.lora_bias_stacked is not None: - assert len(self.lora_bias_stacked) - lora_bias = ops.concat(lora_bias, axis=0) - self.lora_bias_stacked[index] = lora_bias - if self.lora_bias_stacked is not None: - assert len(self.lora_bias_stacked) - lora_bias = ops.concat(lora_bias, axis=0) - self.lora_bias_stacked[index] = lora_bias - else: - self.lora_a_stacked[index] = lora_a - self.lora_b_stacked[index] = lora_b - if self.lora_bias_stacked is not None: - assert len(self.lora_bias_stacked) - self.lora_bias_stacked[index] = lora_bias + self.lora_a_stacked[index, :, :lora_a.shape[1]] = lora_a + self.lora_b_stacked[index, :lora_b.shape[0], :] = lora_b + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + self.lora_bias_stacked[index] = lora_bias def apply(self, x: ms.Tensor, @@ -594,20 +568,16 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): lora_b = self.slice_lora_b(lora_b) if lora_bias is not None: lora_bias = self.slice_bias(lora_bias) - lora_weight_list = [] - for i in range(self.n_slices): - if (lora_a_i := lora_a[i]) is not None and (lora_b_i := lora_b[i]) is not None: - lora_weight_list.append(ops.matmul(lora_a_i.transpose(1, 0), lora_b_i.transpose(1, 0))) - lora_weight = ops.cat(tuple(lora_weight_list), axis=-1) - self.lora_weight[index] = lora_weight - - lora_bias_list = [] - if lora_bias is not None: - for i in range(self.n_slices): - if (lora_bias_i := lora_bias[i]) is not None: - lora_bias_list.appendd(lora_bias_i) - lora_bias = ops.cat(tuple(lora_weight_list), axis=0) - self.lora_bias_stacked[index] = lora_bias + self.lora_a_stacked[index, :, : lora_a[0].shape[1]] = lora_a[0] + self.lora_a_stacked[index, :, self.lora_config.max_lora_rank: self.lora_config.max_lora_rank + lora_a[1].shape[1]] = lora_a[1] + self.lora_b_stacked[index, :lora_b[0].shape[0], :lora_b[0].shape[1]] = lora_b[0] + self.lora_b_stacked[index, self.lora_config.max_lora_rank: self.lora_config.max_lora_rank + lora_b[1].shape[0], + lora_b[0].shape[1]: lora_b[0].shape[1] + lora_b[1].shape[1]] = lora_b[1] + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + lora_bias = ops.concat(lora_bias, axis=0) + self.lora_bias_stacked[index] = lora_bias + @classmethod @_not_fully_sharded_can_replace @@ -726,6 +696,37 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA): self.kv_shard_id, ) + def set_lora( + self, + index: int, + lora_a: ms.Tensor, + lora_b: ms.Tensor, + embeddings_tensor: Optional[ms.Tensor], + lora_bias: Optional[ms.Tensor] = None, + ): + self.reset_lora(index) + + if 
self.tp_size > 1: + lora_a = self.slice_lora_a(lora_a) + lora_b = self.slice_lora_b(lora_b) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) + self.lora_a_stacked[index, :, : lora_a[0].shape[1]] = lora_a[0] + self.lora_a_stacked[index, :, + self.lora_config.max_lora_rank: self.lora_config.max_lora_rank + lora_a[1].shape[1]] = lora_a[1] + self.lora_a_stacked[index, :, + self.lora_config.max_lora_rank * 2: self.lora_config.max_lora_rank * 2 + lora_a[2].shape[1]] = lora_a[2] + self.lora_b_stacked[index, :lora_b[0].shape[0], :lora_b[0].shape[1]] = lora_b[0] + self.lora_b_stacked[index, self.lora_config.max_lora_rank: self.lora_config.max_lora_rank + lora_b[1].shape[0], + lora_b[0].shape[1]: lora_b[0].shape[1] + lora_b[1].shape[1]] = lora_b[1] + self.lora_b_stacked[index, + self.lora_config.max_lora_rank * 2: self.lora_config.max_lora_rank * 2 + lora_b[2].shape[0], + lora_b[0].shape[1] + lora_b[1].shape[1]:] = lora_b[2] + if self.lora_bias_stacked is not None: + assert len(self.lora_bias_stacked) + lora_bias = ops.concat(lora_bias, axis=0) + self.lora_bias_stacked[index] = lora_bias + @classmethod @_not_fully_sharded_can_replace def can_replace_layer( diff --git a/vllm_mindspore/lora/punica_wrapper/punica_npu.py b/vllm_mindspore/lora/punica_wrapper/punica_npu.py index d9295f66..d5a1394c 100644 --- a/vllm_mindspore/lora/punica_wrapper/punica_npu.py +++ b/vllm_mindspore/lora/punica_wrapper/punica_npu.py @@ -70,8 +70,7 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): scaling=1.0): selected_loras = lora_a_weights[lora_indices_tensor] inputs = inputs.astype(lora_a_weights[0].dtype) - selected_loras = selected_loras.squeeze(1) - outputs = einsum_ms(inputs, selected_loras) + outputs = ops.matmul(inputs.unsqueeze(1), selected_loras).squeeze(1) return scaling * outputs def sgmv_expand_slice(self, @@ -92,8 +91,7 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): lora_indices_tensor): selected_loras = lora_b_weights[lora_indices_tensor] inputs = inputs.astype(lora_b_weights[0].dtype) - selected_loras = selected_loras.squeeze(1) - outputs = einsum_ms(inputs, selected_loras) + outputs = ops.matmul(inputs.unsqueeze(1), selected_loras).squeeze(1) return outputs def update_metadata( @@ -210,13 +208,12 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): if lora_bias_stacked is not None: selected_loras_bias = lora_bias_stacked[self.token_lora_indices] y = ops.add(y, selected_loras_bias) - _, seq_len, lora_indices, _, _, _ = self.prefill_metadata if get_model_context("is_prefill"): outputs = self.sgmv_shrink(x, lora_a_stacked, self.group_list, scale) - outputs = self.sgmv_expand_slice(outputs, lora_a_stacked, lora_indices) + outputs = self.sgmv_expand_slice(outputs, lora_b_stacked, self.group_list) else: - outputs = self.bgmv_shrink(x, lora_b_stacked, self.group_list, scale) - outputs = self.bgmv_expand_slice(outputs, lora_a_stacked, lora_indices) + outputs = self.bgmv_shrink(x, lora_a_stacked, self.token_lora_indices, scale) + outputs = self.bgmv_expand_slice(outputs, lora_b_stacked, self.token_lora_indices) outputs = ops.add(y, outputs) outputs = outputs.reshape(orign_shape) return outputs diff --git a/vllm_mindspore/model_executor/model_loader/utils.py b/vllm_mindspore/model_executor/model_loader/utils.py index 955e3f62..0ace6cea 100644 --- a/vllm_mindspore/model_executor/model_loader/utils.py +++ b/vllm_mindspore/model_executor/model_loader/utils.py @@ -193,7 +193,7 @@ def get_ms_model_architecture( return model_cls, arch -def process_weights_after_loading(model, model_config) 
-> None: +def process_weights_after_loading(model, model_config, target_device) -> None: for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) if isinstance(quant_method, QuantizeMethodBase): diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index dd28a8c6..78f24d62 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -160,6 +160,7 @@ class MsModelBase: config = vllm_config.model_config.hf_config lora_config = vllm_config.lora_config + self.vllm_config = vllm_config self.config = config self.model_config = vllm_config.model_config self.lora_config = lora_config @@ -412,9 +413,9 @@ class NativeModel(MsModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__(vllm_config=vllm_config, prefix=prefix) self.quant_config = vllm_config.quant_config - if vllm_config.lora_config is not None: + #if vllm_config.lora_config is not None: # native model lora only support pynative mode now - vllm_config.model_config.enforce_eager = True + # vllm_config.model_config.enforce_eager = True self.is_eager_mode = vllm_config.model_config.enforce_eager self.prefill_graph = None self.decode_graph = None diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index 56a8a120..e5ffef5e 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -44,7 +44,6 @@ from vllm_mindspore.utils import (create_kv_cache, get_dtype_size, get_valid_dtype, is_310p, FORMAT_TYPE) from vllm_mindspore.v1.kv_cache_interface import MLAQuantFullAttentionSpec - logger = init_logger(__name__) @@ -402,11 +401,11 @@ def _reshape_kv_cache_tensors( Reshape the KV cache tensors to the desired shape and dtype. Args: - kv_cache_config: The KV cache config - kv_cache_raw_tensors: The KV cache buffer of each layer, with + kv_cache_config: The KV cache config + kv_cache_raw_tensors: The KV cache buffer of each layer, with correct size but uninitialized shape. Returns: - Dict[str, Tensor]: A map between layer names to their + Dict[str, Tensor]: A map between layer names to their corresponding memory buffer for KV cache. """ # Determine whether deepseek use mla op @@ -444,9 +443,6 @@ def _reshape_kv_cache_tensors( kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) - if is_310p(): - *dims, second_last, last = kv_cache_shape - kv_cache_shape = (*dims, second_last * last) try: kv_cache_stride_order = self.attn_backends[ i].get_kv_cache_stride_order() -- Gitee From 5d72f0328d44b617c9a2074f91bf45307a2a086a Mon Sep 17 00:00:00 2001 From: zhanzhan1 Date: Tue, 5 Aug 2025 19:40:26 +0800 Subject: [PATCH 11/14] lm_head support jit --- .../distributed/communication_op.py | 4 +- .../model_executor/layers/logits_processor.py | 47 ++++++++----------- 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/vllm_mindspore/distributed/communication_op.py b/vllm_mindspore/distributed/communication_op.py index 604703dd..86a0d797 100644 --- a/vllm_mindspore/distributed/communication_op.py +++ b/vllm_mindspore/distributed/communication_op.py @@ -23,7 +23,6 @@ Implement a unified communication interface for both graph and pynative mode. 
from typing import Any, Dict, Optional, Union import torch - from mindspore import Tensor, mint, nn, ops from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -98,4 +97,5 @@ class GatherFromModelParallelRegion(nn.Cell): group=self.tp_group)(mint.transpose(input_, 0, dim)) if self.tp_rank != dst: return None - return mint.transpose(output, 0, dim) \ No newline at end of file + return mint.transpose(output, 0, dim) + diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index 6910804a..1b1770cc 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -75,7 +75,7 @@ class LogitsProcessor(nn.Cell): self.soft_cap = soft_cap # Whether to use gather or all-gather to gather the logits. self.use_all_gather = current_platform.use_all_gather() - + if self.use_all_gather: self.tensor_model_parallel_all_gather = AllGatherFromModelParallelRegion() else: @@ -85,36 +85,28 @@ class LogitsProcessor(nn.Cell): self.cached_input_info = {} def set_dynamic_inputs(self): - dyn_hidden_states = Tensor(shape=[None, None], - dtype=self.vllm_config.model_config.dtype) - - if self.cached_input_info["indices"] is None: - dyn_indices = None - else: - dyn_indices_shape = [ - None for _ in range(self.cached_input_info["indices"]["ndim"]) - ] - dyn_indices_dtype = self.cached_input_info["indices"]["dtype"] - dyn_indices = Tensor(shape=dyn_indices_shape, - dtype=dyn_indices_dtype) - - if self.cached_input_info["bias"] is None: - dyn_bias = None - else: - dyn_bias_shape = [ - None for _ in range(self.cached_input_info["bias"]["ndim"]) - ] - dyn_bias_dtype = self.cached_input_info["bias"]["dtype"] - dyn_bias = Tensor(shape=dyn_bias_shape, dtype=dyn_bias_dtype) + dyn_hidden_states = Tensor( + shape=[None, None], dtype=self.vllm_config.model_config.dtype) + + dyn_indices_shape = [None for _ in range( + self.cached_input_info["indices"]["ndim"])] + dyn_indices_dtype = self.cached_input_info["indices"]["dtype"] + dyn_indices = None if self.cached_input_info["indices"] is None else \ + Tensor(shape=dyn_indices_shape, dtype=dyn_indices_dtype) + + dyn_bias_shape = [None for _ in range( + self.cached_input_info["bias"]["ndim"])] + dyn_bias_dtype = self.cached_input_info["bias"]["dtype"] + dyn_bias = None if self.cached_input_info["bias"] is None else \ + Tensor(shape=dyn_bias_shape, dtype=dyn_bias_dtype) self.set_inputs(dyn_hidden_states, dyn_indices, dyn_bias) def __call__( - self, - lm_head: VocabParallelEmbedding, - hidden_states: Tensor, - sampling_metadata: Optional[SamplingMetadata] = None, - embedding_bias: Optional[Tensor] = None, + self, + hidden_states: Tensor, + selected_token_indices: Optional[Tensor] = None, + embedding_bias: Optional[Tensor] = None, ) -> Optional[Tensor]: if self.lm_head is None: self.lm_head = lm_head @@ -181,7 +173,6 @@ class LogitsProcessor(nn.Cell): logits *= self.scale # Apply logits processors (if any). 
- return logits def _get_logits( -- Gitee From ff2500ef32eb3ae10b4605c0f42b1ef420342618 Mon Sep 17 00:00:00 2001 From: zhanzhan1 Date: Tue, 5 Aug 2025 20:14:33 +0800 Subject: [PATCH 12/14] bugfix --- .../model_executor/layers/logits_processor.py | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index 1b1770cc..27c19106 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -85,20 +85,27 @@ class LogitsProcessor(nn.Cell): self.cached_input_info = {} def set_dynamic_inputs(self): - dyn_hidden_states = Tensor( - shape=[None, None], dtype=self.vllm_config.model_config.dtype) - - dyn_indices_shape = [None for _ in range( - self.cached_input_info["indices"]["ndim"])] - dyn_indices_dtype = self.cached_input_info["indices"]["dtype"] - dyn_indices = None if self.cached_input_info["indices"] is None else \ - Tensor(shape=dyn_indices_shape, dtype=dyn_indices_dtype) + dyn_hidden_states = Tensor(shape=[None, None], + dtype=self.vllm_config.model_config.dtype) + + if self.cached_input_info["indices"] is None: + dyn_indices = None + else: + dyn_indices_shape = [ + None for _ in range(self.cached_input_info["indices"]["ndim"]) + ] + dyn_indices_dtype = self.cached_input_info["indices"]["dtype"] + dyn_indices = Tensor(shape=dyn_indices_shape, + dtype=dyn_indices_dtype) - dyn_bias_shape = [None for _ in range( - self.cached_input_info["bias"]["ndim"])] - dyn_bias_dtype = self.cached_input_info["bias"]["dtype"] - dyn_bias = None if self.cached_input_info["bias"] is None else \ - Tensor(shape=dyn_bias_shape, dtype=dyn_bias_dtype) + if self.cached_input_info["bias"] is None: + dyn_bias = None + else: + dyn_bias_shape = [ + None for _ in range(self.cached_input_info["bias"]["ndim"]) + ] + dyn_bias_dtype = self.cached_input_info["bias"]["dtype"] + dyn_bias = Tensor(shape=dyn_bias_shape, dtype=dyn_bias_dtype) self.set_inputs(dyn_hidden_states, dyn_indices, dyn_bias) -- Gitee From d4d6e4ae54426122160e0d8796759815ba2b6b74 Mon Sep 17 00:00:00 2001 From: huangzhuo Date: Mon, 25 Aug 2025 11:00:32 +0800 Subject: [PATCH 13/14] qwen2.5 enable V0 --- vllm_mindspore/__init__.py | 3 -- vllm_mindspore/config.py | 4 -- .../lora/punica_wrapper/punica_npu.py | 54 +++++++++---------- .../model_executor/layers/logits_processor.py | 39 +++++++------- .../quantization/smooth_quant_modelslim.py | 7 +-- .../model_executor/models/model_base.py | 4 +- 6 files changed, 53 insertions(+), 58 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 1f404e28..3dac7aca 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -300,7 +300,6 @@ from .config import ( _verify_quantization, _verify_args, vllm_config_post_init, - vllm_config_get_quantization_config, model_post_init, _get_and_verify_dtype, stateless_init_dp_group, @@ -309,8 +308,6 @@ from .config import ( vllm.config.ModelConfig._verify_quantization = _verify_quantization vllm.config.VllmConfig.__post_init__ = vllm_config_post_init -vllm.config.VllmConfig._get_quantization_config = staticmethod( - vllm_config_get_quantization_config) vllm.config.SchedulerConfig._verify_args = _verify_args vllm.config.CompilationConfig.model_post_init = model_post_init vllm.config._get_and_verify_dtype = _get_and_verify_dtype diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 5f7da8c5..c9c16690 100644 
--- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -46,10 +46,6 @@ def _verify_quantization(self) -> None: return -def vllm_config_get_quantization_config(model_config, load_config): - return None - - def vllm_config_post_init(self): """Verify configs are valid & consistent with each other.""" if self.model_config is not None: diff --git a/vllm_mindspore/lora/punica_wrapper/punica_npu.py b/vllm_mindspore/lora/punica_wrapper/punica_npu.py index d5a1394c..76e46a29 100644 --- a/vllm_mindspore/lora/punica_wrapper/punica_npu.py +++ b/vllm_mindspore/lora/punica_wrapper/punica_npu.py @@ -24,12 +24,12 @@ from typing import Callable, Optional from mindspore import mint, nn, Parameter, ops, dtype from mindspore.common import dtype as mstype from mindspore.common.initializer import initializer -from mindspore.ops.auto_generate import grouped_matmul_v4 +from mindspore.ops.auto_generate import grouped_matmul_v4, GroupedMatmul from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase - +import vllm.envs as envs from vllm_mindspore.lora.ops.torch_ops.lora_ops import ( bgmv_expand, bgmv_expand_slice, bgmv_shrink, sgmv_expand, - sgmv_expand_slice, sgmv_shrink, sort_lora_by_token_count, einsum_ms) + sgmv_expand_slice, sgmv_shrink, einsum_ms) from vllm_mindspore.model_executor.utils import get_model_context @@ -47,21 +47,22 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device) self.max_loras = kwargs["max_loras"] + self.gmm = GroupedMatmul(split_item=3, group_type=0) self.group_list = Parameter(initializer("ones", self.max_loras, dtype.int64), name="group_list") self.lora_indices = Parameter(initializer("ones", self.max_loras, dtype.int64), name="lora_indices") def sgmv_shrink(self, - inputs, - lora_a_weights, - group_list, - scaling, - ): + inputs, + lora_a_weights, + group_list, + scaling, + ): outputs = grouped_matmul_v4([inputs], [lora_a_weights], group_list=group_list, split_item=3, group_type=0, group_list_type=1)[0] - return outputs * scaling + return outputs def bgmv_shrink(self, inputs, @@ -71,7 +72,7 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): selected_loras = lora_a_weights[lora_indices_tensor] inputs = inputs.astype(lora_a_weights[0].dtype) outputs = ops.matmul(inputs.unsqueeze(1), selected_loras).squeeze(1) - return scaling * outputs + return outputs def sgmv_expand_slice(self, inputs, @@ -106,21 +107,16 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size, long_lora_context) + self._update_prefill_metadata(self.token_lora_indices) if mapping.is_prefill: # Update metadata required for prefill-related operators. 
- self._update_prefill_metadata(self.token_lora_indices) self.is_prefill = True else: self.is_prefill = False _, seq_len, lora_indices, _, _, _ = self.prefill_metadata - sorted_ids, sorted_counts = sort_lora_by_token_count( - lora_indices, seq_len) - group_list = sorted_counts - if len(group_list) < self.max_loras: - new_tensor = mint.zeros(self.max_loras, dtype=group_list.dtype) - new_tensor[:group_list.size(0)] = group_list - group_list = new_tensor - self.group_list.set_data(group_list.astype(dtype.int64)) + new_tensor = ops.zeros(self.max_loras, dtype=self.group_list.dtype) + new_tensor[lora_indices] = seq_len + self.group_list.set_data(new_tensor.astype(dtype.int64)) def add_lora_embedding(self, y, @@ -182,7 +178,7 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): if buffer is None: # We set the buffer to be float32 by default, consistent with the # triton op - buffer = mint.zeros((x.shape[0], r), dtype=mstype.float32) + buffer = ops.zeros((x.shape[0], r), dtype=mstype.float32) # LogitsProcessorWithLoRA always using bgmv. bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale) bgmv_expand(buffer, @@ -201,19 +197,23 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): scale, **kwargs): if self.no_lora: - return + return y x = x.reshape(-1, x.shape[-1]) orign_shape = y.shape y = y.reshape(-1, y.shape[-1]) if lora_bias_stacked is not None: selected_loras_bias = lora_bias_stacked[self.token_lora_indices] y = ops.add(y, selected_loras_bias) - if get_model_context("is_prefill"): - outputs = self.sgmv_shrink(x, lora_a_stacked, self.group_list, scale) - outputs = self.sgmv_expand_slice(outputs, lora_b_stacked, self.group_list) + if not envs.VLLM_USE_V1: + shrink_outputs = self.sgmv_shrink(x, lora_a_stacked, self.group_list, scale) + expand_outputs = self.sgmv_expand_slice(shrink_outputs, lora_b_stacked, self.group_list) else: - outputs = self.bgmv_shrink(x, lora_a_stacked, self.token_lora_indices, scale) - outputs = self.bgmv_expand_slice(outputs, lora_b_stacked, self.token_lora_indices) - outputs = ops.add(y, outputs) + if get_model_context("is_prefill"): + shrink_outputs = self.sgmv_shrink(x, lora_a_stacked, self.group_list, scale) + expand_outputs = self.sgmv_expand_slice(shrink_outputs, lora_b_stacked, self.group_list) + else: + shrink_outputs = self.bgmv_shrink(x, lora_a_stacked, self.token_lora_indices, scale) + expand_outputs = self.bgmv_expand_slice(shrink_outputs, lora_b_stacked, self.token_lora_indices) + outputs = ops.add(y, expand_outputs) outputs = outputs.reshape(orign_shape) return outputs diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index 27c19106..6f829079 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -76,10 +76,10 @@ class LogitsProcessor(nn.Cell): # Whether to use gather or all-gather to gather the logits. 
self.use_all_gather = current_platform.use_all_gather() - if self.use_all_gather: - self.tensor_model_parallel_all_gather = AllGatherFromModelParallelRegion() - else: - self.tensor_model_parallel_gather = GatherFromModelParallelRegion() + # if self.use_all_gather: + self.tensor_model_parallel_all_gather = AllGatherFromModelParallelRegion() + # else: + # self.tensor_model_parallel_gather = GatherFromModelParallelRegion() self.lm_head = None self.run_model = None self.cached_input_info = {} @@ -111,8 +111,9 @@ class LogitsProcessor(nn.Cell): def __call__( self, + lm_head: VocabParallelEmbedding, hidden_states: Tensor, - selected_token_indices: Optional[Tensor] = None, + sampling_metadata: Optional[SamplingMetadata] = None, embedding_bias: Optional[Tensor] = None, ) -> Optional[Tensor]: if self.lm_head is None: @@ -139,12 +140,14 @@ class LogitsProcessor(nn.Cell): "bias": dyn_bias_info, } self.set_dynamic_inputs() - - logits = self.run_model( - hidden_states, - selected_token_indices, - embedding_bias - ) + if selected_token_indices is not None and selected_token_indices.numel() <= 0: + logits = mint.zeros((0, self.vocab_size), dtype=hidden_states.dtype) + else: + logits = self.run_model( + hidden_states, + selected_token_indices, + embedding_bias + ) if sampling_metadata is not None and \ sampling_metadata.seq_groups is not None: @@ -157,13 +160,11 @@ class LogitsProcessor(nn.Cell): hidden_states: Tensor, selected_token_indices: Optional[Tensor] = None, embedding_bias: Optional[Tensor] = None, - ) -> Optional[Tensor]: + ) -> Optional[Tensor]: if self.logits_as_input: logits = hidden_states else: if selected_token_indices is not None: - if selected_token_indices.numel() <= 0: - return mint.zeros((0, self.vocab_size), dtype=hidden_states.dtype) hidden_states = mint.index_select( hidden_states, 0, selected_token_indices) @@ -192,12 +193,12 @@ class LogitsProcessor(nn.Cell): logits = lm_head.quant_method.apply(lm_head, hidden_states, bias=embedding_bias) - if self.use_all_gather: + # if self.use_all_gather: # Gather is not supported for some devices such as NPUs. - logits = self.tensor_model_parallel_all_gather(logits) - else: - # None may be returned for rank > 0 - logits = self.tensor_model_parallel_gather(logits) + logits = self.tensor_model_parallel_all_gather(logits) + # else: + # # None may be returned for rank > 0 + # logits = self.tensor_model_parallel_gather(logits) # Remove paddings in vocab (if any). 
if logits is not None: logits = logits[..., :self.org_vocab_size] diff --git a/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py b/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py index cad2f322..6003a667 100644 --- a/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py +++ b/vllm_mindspore/model_executor/layers/quantization/smooth_quant_modelslim.py @@ -18,6 +18,7 @@ from typing import Any, Optional import mindspore import numpy as np +import regex as re from mindspore import Parameter, Tensor, ops from mindspore.common.initializer import initializer from mindspore.ops.auto_generate import (DynamicQuantExt, GroupedMatmul, @@ -157,6 +158,7 @@ class A8W8LinearMethod(LinearMethodBase): def __init__(self, quant_config: SmoothQuantModelSlimConfig): self.quant_config = quant_config self.quant = QuantV2() + self.bias_add = ops.Add() def create_weights(self, layer: mindspore.nn.Cell, @@ -374,8 +376,6 @@ class A8W8LinearMethod(LinearMethodBase): else: qx = self.matmul(qx, weight, deq_scale, None, layer.quant_bias, None) - if bias is not None: - qx = self.bias_add(qx, bias) qx = qx.reshape(output_shape) return qx @@ -386,6 +386,7 @@ class A8W8DYNLinearMethod(LinearMethodBase): def __init__(self, quant_config: SmoothQuantModelSlimConfig): self.quant_config = quant_config self.quant = DynamicQuantExt() + self.bias_add = ops.Add() def create_weights(self, layer: mindspore.nn.Cell, @@ -508,6 +509,6 @@ class A8W8DYNLinearMethod(LinearMethodBase): else: qx = self.matmul(qx, weight, weight_scale, None, None, qx_scale) if bias is not None: - qx = mint.add(qx, bias) + qx = ops.add(qx, bias) qx = qx.reshape(output_shape) return qx diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 78f24d62..b149e94b 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -397,8 +397,8 @@ class MsModelBase: model_inputs = {} model_inputs["input_ids"] = input_ids model_inputs["batch_valid_length"] = ms.from_numpy(seq_lens_np) - model_inputs["block_tables"] = attn_metadata.block_tables - model_inputs["slot_mapping"] = attn_metadata.slot_mapping + model_inputs["block_tables"] = attn_metadata.block_tables + 0 + model_inputs["slot_mapping"] = attn_metadata.slot_mapping + 0 model_inputs["position_ids"] = position_ids model_inputs["q_seq_lens"] = q_seq_lens model_inputs["attention_mask"] = attention_mask -- Gitee From 0e7e072fd1f285b0be2fb823c6681163ca5dd0fd Mon Sep 17 00:00:00 2001 From: huangzhuo Date: Wed, 3 Sep 2025 15:11:20 +0800 Subject: [PATCH 14/14] v0 add new graph for lora --- vllm_mindspore/__init__.py | 2 + .../lora/punica_wrapper/punica_npu.py | 15 +- .../models/mf_models/mf_model_base.py | 217 +++++++++++ .../models/mf_models/weight_processor.py | 342 ------------------ .../model_executor/models/model_base.py | 82 ++++- vllm_mindspore/model_executor/utils.py | 3 +- vllm_mindspore/worker/model_runner.py | 14 +- vllm_mindspore/worker/worker.py | 32 +- 8 files changed, 339 insertions(+), 368 deletions(-) create mode 100644 vllm_mindspore/model_executor/models/mf_models/mf_model_base.py delete mode 100644 vllm_mindspore/model_executor/models/mf_models/weight_processor.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 3dac7aca..e23fe457 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -232,12 +232,14 @@ V0Worker.init_device = 
wrapper_worker_init_device(V0Worker.init_device) from vllm_mindspore.worker.model_runner import ( _get_cuda_graph_pad_size, _dummy_run, + profile_run, _get_supported_attention_backends, ) vllm.worker.model_runner.ModelInputForGPUBuilder._get_cuda_graph_pad_size = ( _get_cuda_graph_pad_size) vllm.worker.model_runner.GPUModelRunnerBase._dummy_run = _dummy_run +vllm.worker.model_runner.GPUModelRunnerBase.profile_run = profile_run import vllm.worker.multi_step_model_runner diff --git a/vllm_mindspore/lora/punica_wrapper/punica_npu.py b/vllm_mindspore/lora/punica_wrapper/punica_npu.py index 76e46a29..f0ad5642 100644 --- a/vllm_mindspore/lora/punica_wrapper/punica_npu.py +++ b/vllm_mindspore/lora/punica_wrapper/punica_npu.py @@ -30,7 +30,7 @@ import vllm.envs as envs from vllm_mindspore.lora.ops.torch_ops.lora_ops import ( bgmv_expand, bgmv_expand_slice, bgmv_shrink, sgmv_expand, sgmv_expand_slice, sgmv_shrink, einsum_ms) -from vllm_mindspore.model_executor.utils import get_model_context +from vllm_mindspore.model_executor.utils import get_model_context, set_model_context # The platforms that are compatible with the PyTorch-native implementation can @@ -52,11 +52,11 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): self.lora_indices = Parameter(initializer("ones", self.max_loras, dtype.int64), name="lora_indices") def sgmv_shrink(self, - inputs, - lora_a_weights, - group_list, - scaling, - ): + inputs, + lora_a_weights, + group_list, + scaling, + ): outputs = grouped_matmul_v4([inputs], [lora_a_weights], group_list=group_list, split_item=3, @@ -117,6 +117,7 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): new_tensor = ops.zeros(self.max_loras, dtype=self.group_list.dtype) new_tensor[lora_indices] = seq_len self.group_list.set_data(new_tensor.astype(dtype.int64)) + set_model_context("no_lora", self.no_lora) def add_lora_embedding(self, y, @@ -196,7 +197,7 @@ class PunicaWrapperNPU(PunicaWrapperBase, nn.Cell): lora_bias_stacked, scale, **kwargs): - if self.no_lora: + if get_model_context("no_lora"): return y x = x.reshape(-1, x.shape[-1]) orign_shape = y.shape diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py new file mode 100644 index 00000000..20969730 --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -0,0 +1,217 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Copyright 2025 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from abc import abstractmethod +from collections.abc import Iterable +from typing import Optional, Union + +import mindspore as ms +from mindformers.core.context import build_mf_context +from mindformers.core.parallel_config import build_parallel_config +from mindformers.tools.register.config import MindFormerConfig +from mindformers.tools.utils import is_pynative +from mindspore import Tensor, nn +from mindspore.common.api import _pynative_executor +from mindspore.communication import get_rank +from vllm.config import VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import get_dp_group +from vllm.forward_context import get_forward_context +from vllm.logger import init_logger +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from vllm_mindspore.model_executor.models.attention_mask import ( + LowerTriangularMask) +from vllm_mindspore.model_executor.models.model_base import MsModelBase +from vllm_mindspore.model_executor.models.utils import is_use_ringmla + +try: + # Need to apply dllm pd patch on vllm to use pd disagg related functions + from vllm.attention.layer import (maybe_save_kv_layer_to_connector, + wait_for_kv_layer_from_connector) + from vllm.distributed.kv_transfer import is_v1_kv_transfer_group + kv_transfer_supported = True +except: # noqa: E722 + kv_transfer_supported = False + +logger = init_logger(__name__) + + +class MfModelBase(MsModelBase): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__(vllm_config=vllm_config, prefix=prefix) + + model_config_path = os.getenv("MINDFORMERS_MODEL_CONFIG") + if model_config_path is None: + raise RuntimeError('For "MindFormers" model backend, environments ' + 'MINDFORMERS_MODEL_CONFIG should be set!') + + self.mf_config = MindFormerConfig(model_config_path) + self.rank_id = get_rank() + self.dp_size = get_dp_group() + + self.kv_transfer_config = vllm_config.kv_transfer_config + build_mf_context(self.mf_config) + build_parallel_config(self.mf_config) + self.mf_config.model.model_config.parallel_config = ( + self.mf_config.parallel_config) + self.mf_config.model.model_config.parallel_config.model_parallel = ( + get_tensor_model_parallel_world_size()) + self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 + self.use_ringmla = is_use_ringmla(vllm_config, self.mf_config) + self.is_chunked = False + self._generate_model_config() + if not hasattr(self, 'mf_model_config'): + raise RuntimeError('mf_model_config not initialized') + self.casual_mask = LowerTriangularMask( + dtype=self.mf_model_config.compute_dtype, + max_model_len=self.model_config.max_model_len) + self.network, self.lm_head = self._create_network() + + affinity_config = self.mf_config.get('context', + {}).get('affinity_cpu_list', {}) + if isinstance(affinity_config, dict): + ms.runtime.set_cpu_affinity(True, affinity_config) + + self._set_dynamic_inputs() + + @property + def ready_lm_head(self) -> nn.Cell: + if self.lm_head is None: + raise RuntimeError("lm_head not initialized") + return self.lm_head + + @abstractmethod + def _generate_model_config(self): + raise NotImplementedError( + "Function _generate_model_config should be Implemented!") + + @abstractmethod + def _create_network(self): + raise NotImplementedError( + "Function _create_network should be Implemented!") + + # DLLM + def is_decoder_task(self) -> bool: + if self.kv_transfer_config is None: + return False + + 
return self.kv_transfer_config.is_kv_consumer + + # DLLM + def is_prefill_task(self) -> bool: + if self.kv_transfer_config is None: + return False + + return self.kv_transfer_config.is_kv_producer + + def _set_dynamic_inputs(self): + self.network.set_dynamic_inputs() + if not hasattr(self, 'mf_model_config'): + raise RuntimeError('mf_model_config not initialized') + dynamic_hidden_states = Tensor( + shape=[None, None], dtype=self.mf_model_config.compute_dtype) + self.ready_lm_head.set_inputs(dynamic_hidden_states) + + def prepare_inputs(self, input_ids, positions): + return self.prepare_base_inputs(input_ids, positions) + + def update_model_inputs(self, model_inputs, **kwargs): + return model_inputs + + # DLLM + def connector_send_kvcache(self): + logger.debug("reached connector_send_kvcache") + _pynative_executor.sync() + forward_context = get_forward_context() + if not hasattr(self, 'mf_model_config'): + raise RuntimeError('mf_model_config not initialized') + for i in range(self.mf_model_config.num_layers): + kv_cache = self.kv_caches[i] + k_cache = kv_cache.kv_cache[forward_context.virtual_engine][0] + v_cache = kv_cache.kv_cache[forward_context.virtual_engine][1] + maybe_save_kv_layer_to_connector(str(i), (k_cache, v_cache)) + + # DLLM + def connector_wait_for_kv_layer(self): + logger.debug("reached connector_wait_for_kv_layer") + if not hasattr(self, 'mf_model_config'): + raise RuntimeError('mf_model_config not initialized') + for i in range(self.mf_model_config.num_layers): + wait_for_kv_layer_from_connector("key." + str(i)) + + def forward(self, + input_ids: Tensor, + positions: Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[Tensor] = None, + **kwargs) -> Union[Tensor, IntermediateTensors]: + model_inputs, is_prefill = self.prepare_inputs(input_ids, positions) + model_inputs = self.update_model_inputs(model_inputs, **kwargs) + + if is_prefill: + self.network.phase = "prefill" + if not self.set_flags or is_pynative(): + self.network.add_flags_custom(is_first_iteration=True) + hidden_states = self.network(**model_inputs) + self.network.phase = "increment" + if not self.set_flags or is_pynative(): + self.network.add_flags_custom(is_first_iteration=False) + self.set_flags = True + if kv_transfer_supported and is_v1_kv_transfer_group(): + self.connector_send_kvcache() + # DLLM + else: + if kv_transfer_supported: + if is_v1_kv_transfer_group() and self.is_prefill_task(): + self.connector_send_kvcache() + + if is_v1_kv_transfer_group() and self.is_decoder_task(): + self.connector_wait_for_kv_layer() + logger.debug("connector_wait_for_kv_layer success") + hidden_states = self.network(**model_inputs) + + return hidden_states + + def compute_logits( + self, + hidden_states: Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[Tensor]: + if sampling_metadata is not None: + selected_token_indices = sampling_metadata.selected_token_indices + if (selected_token_indices is not None + and selected_token_indices.numel() <= 0): + if not hasattr(self, 'mf_model_config'): + raise RuntimeError('mf_model_config not initialized') + logits = ms.mint.zeros( + (0, self.mf_model_config.vocab_size), + dtype=self.mf_model_config.compute_dtype) + else: + hidden_states = hidden_states.index_select( + 0, selected_token_indices) + logits = self.ready_lm_head(hidden_states) + logits = logits.view(-1, logits.shape[-1]) + else: + logits = self.ready_lm_head(hidden_states) + logits = logits.view(-1, logits.shape[-1]) + return logits + + def load_weights(self, 
                      weights: Iterable[tuple[str, Tensor]]) -> set[str]:
+        raise NotImplementedError("load_weight not implemented.")
diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py
deleted file mode 100644
index 5036323c..00000000
--- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py
+++ /dev/null
@@ -1,342 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# Copyright 2025 Huawei Technologies Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-transform huggingface safetensor.
-"""
-
-import os
-from enum import Enum
-
-from mindformers.parallel_core.inference import parallel_state as ps
-from mindspore.communication.management import get_group_size, get_rank
-from safetensors import safe_open
-from vllm_mindspore.utils import is_310p
-
-class EPMethod(Enum):
-    """
-    EP method enums
-    """
-    DEFAULT = 'default'
-    ALLTOALL = 'alltoall'
-    ALLGATHER = 'allgather'
-
-
-class BaseWeightProcessor:
-    r"""
-    Provide model weight load and shards.
-    Args:
-        config (MF Config): The config of Infer model.
-        network (InferenceModelForCausalLM): The network of infer model.
-
-    """
-
-    def __init__(self, config, network, is_quant, vllm_config):
-        self.vllm_config = vllm_config
-        self.is_310p = is_310p()
-        self.config = config
-        self.network = network
-        self.is_quant = is_quant
-        self.global_rank_id = get_rank()
-        self.global_group_size = get_group_size()
-        self.tp_group_size = ps.get_tensor_model_parallel_world_size()
-        self.dp_group_size = ps.get_data_parallel_world_size()
-        self.num_router_experts = self.config.moe_config.expert_num if \
-            self.config.moe_config.expert_num else 1
-        self.moe_ep_size = ps.get_moe_expert_parallel_world_size()
-        self.moe_tp_size = ps.get_moe_tensor_parallel_world_size()
-        self.tp_dp_size = ps.get_tensor_and_data_parallel_world_size()
-        self.ep_method = EPMethod.DEFAULT
-        if self.dp_group_size > 1\
-                and self.moe_ep_size == self.global_group_size:
-            self.ep_method = EPMethod.ALLTOALL
-        elif self.dp_group_size > 1:
-            self.ep_method = EPMethod.ALLGATHER
-        self.tp_rank_id = ps.get_tensor_model_parallel_rank()
-        self.tp_dp_rank_id = ps.get_tensor_and_data_parallel_rank()
-
-        self.ep_group_nums = self.num_router_experts // self.moe_ep_size
-        self.moe_ep_rank_id = ps.get_moe_expert_parallel_rank()
-        self.moe_tp_rank_id = ps.get_moe_tensor_parallel_rank()
-        self.ep_start = self.moe_ep_rank_id * self.ep_group_nums
-        self.ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums
-
-        self.parameter_dict = {}
-        self.file_handles = {}
-
-    def get_file_handles(self, filename):
-        if filename not in self.file_handles:
-            fp = safe_open(filename, framework="np")
-            self.file_handles[filename] = fp
-        return self.file_handles[filename]
-
-    def release_file_handles(self):
-        del self.file_handles
-
-    def get_safetensor_from_file(self, hf_param_name, src_hf_dir,
-                                 hf_weight_map):
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-
-        np_data = sf_file.get_tensor(hf_param_name)
-        return np_data, qint4
-
-    def get_safetensor_from_file_split_tp_group(self,
-                                                hf_param_name,
-                                                src_hf_dir,
-                                                hf_weight_map,
-                                                split_axis=0):
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-
-        np_data = sf_file.get_slice(hf_param_name)
-        shape = np_data.get_shape()
-        if split_axis == 0:
-            split_size = shape[0] // self.tp_group_size
-            start = self.tp_rank_id * split_size
-            stop = (self.tp_rank_id + 1) * split_size
-            split_data = np_data[start:stop]
-        elif split_axis == 1:
-            split_size = shape[1] // self.tp_group_size
-            start = self.tp_rank_id * split_size
-            stop = (self.tp_rank_id + 1) * split_size
-            split_data = np_data[:, start:stop]
-        elif split_axis == 2:
-            split_size = shape[2] // self.tp_group_size
-            start = self.tp_rank_id * split_size
-            stop = (self.tp_rank_id + 1) * split_size
-            split_data = np_data[:, :, start:stop]
-        else:
-            raise ValueError(
-                "split_axis:{} is not supported.".format(split_axis))
-        return split_data, qint4
-
-    def get_safetensor_from_file_split_tpdp_group(self,
-                                                  hf_param_name,
-                                                  src_hf_dir,
-                                                  hf_weight_map,
-                                                  split_axis=0):
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-
-        np_data = sf_file.get_slice(hf_param_name)
-        shape = np_data.get_shape()
-        if split_axis == 0:
-            split_size = shape[0] // self.tp_dp_size
-            start = self.tp_dp_rank_id * split_size
-            stop = (self.tp_dp_rank_id + 1) * split_size
-            split_data = np_data[start:stop]
-        elif split_axis == 1:
-            split_size = shape[1] // self.tp_dp_size
-            start = self.tp_dp_rank_id * split_size
-            stop = (self.tp_dp_rank_id + 1) * split_size
-            split_data = np_data[:, start:stop]
-        elif split_axis == 2:
-            split_size = shape[2] // self.tp_dp_size
-            start = self.tp_dp_rank_id * split_size
-            stop = (self.tp_dp_rank_id + 1) * split_size
-            split_data = np_data[:, :, start:stop]
-        else:
-            raise ValueError(
-                "split_axis:{} is not supported.".format(split_axis))
-        return split_data, qint4
-
-    def get_safetensor_from_file_split_global_group(self,
-                                                    hf_param_name,
-                                                    src_hf_dir,
-                                                    hf_weight_map,
-                                                    split_axis=0):
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-
-        np_data = sf_file.get_slice(hf_param_name)
-        shape = np_data.get_shape()
-        if split_axis == 0:
-            split_size = shape[0] // self.global_group_size
-            start = self.global_rank_id * split_size
-            stop = (self.global_rank_id + 1) * split_size
-            split_data = np_data[start:stop]
-        elif split_axis == 1:
-            split_size = shape[1] // self.global_group_size
-            start = self.global_rank_id * split_size
-            stop = (self.global_rank_id + 1) * split_size
-            split_data = np_data[:, start:stop]
-        elif split_axis == 2:
-            split_size = shape[2] // self.global_group_size
-            start = self.global_rank_id * split_size
-            stop = (self.global_rank_id + 1) * split_size
-            split_data = np_data[:, :, start:stop]
-        else:
-            raise ValueError(
-                "split_axis:{} is not supported.".format(split_axis))
-
-        return split_data, qint4
-
-    def get_safetensor_from_file_split_moe_tp_group(self,
-                                                    hf_param_name,
-                                                    src_hf_dir,
-                                                    hf_weight_map,
-                                                    split_axis=0):
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-
-        np_data = sf_file.get_slice(hf_param_name)
-        shape = np_data.get_shape()
-        if split_axis == 0:
-            split_size = shape[0] // self.moe_tp_size
-            start = self.moe_tp_rank_id * split_size
-            stop = (self.moe_tp_rank_id + 1) * split_size
-            split_data = np_data[start:stop]
-        elif split_axis == 1:
-            split_size = shape[1] // self.moe_tp_size
-            start = self.moe_tp_rank_id * split_size
-            stop = (self.moe_tp_rank_id + 1) * split_size
-            split_data = np_data[:, start:stop]
-        else:
-            raise ValueError(
-                "split_axis:{} is not supported.".format(split_axis))
-
-        return split_data, qint4
-
-    def get_routed_safetensor_3_dim(self,
-                                    hf_param_name,
-                                    src_hf_dir,
-                                    hf_weight_map,
-                                    split_ep=False,
-                                    split_tp=False,
-                                    tp_axis=-1):
-        '''get_routed_safetensor_3_dim'''
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-        if not split_tp and not split_ep:
-            np_data = sf_file.get_tensor(hf_param_name)
-            return np_data, qint4
-
-        np_data = sf_file.get_slice(hf_param_name)
-        if not split_tp and split_ep:
-            split_data = np_data[self.ep_start:self.ep_stop, :, :]
-            return split_data, qint4
-
-        shape = np_data.get_shape()
-        if tp_axis == 1:
-            split_size = shape[1] // self.moe_tp_size
-            start = self.moe_tp_rank_id * split_size
-            stop = (self.moe_tp_rank_id + 1) * split_size
-            split_data = np_data[
-                self.ep_start:self.ep_stop,
-                start:stop, :] if split_ep else np_data[:, start:stop, :]
-        elif tp_axis == 2:
-            split_size = shape[2] // self.moe_tp_size
-            start = self.moe_tp_rank_id * split_size
-            stop = (self.moe_tp_rank_id + 1) * split_size
-            split_data = np_data[
-                self.ep_start:self.ep_stop, :,
-                start:stop] if split_ep else np_data[:, :, start:stop]
-        else:
-            raise ValueError("tp_axis:{} is not supported.".format(tp_axis))
-        return split_data, qint4
-
-    def get_routed_safetensor_2_dim(self,
-                                    hf_param_name,
-                                    src_hf_dir,
-                                    hf_weight_map,
-                                    split_ep=False,
-                                    split_tp=False,
-                                    tp_axis=-1):
-        '''get_moe_routed_safetensor_2_dim'''
-        safetensor_file = hf_weight_map[hf_param_name]
-        filename = os.path.join(src_hf_dir, safetensor_file)
-        sf_file = self.get_file_handles(filename)
-        qint4 = False
-        if sf_file.metadata(
-        ) is not None and hf_param_name in sf_file.metadata():
-            qint4 = True
-        if not split_tp and not split_ep:
-            np_data = sf_file.get_tensor(hf_param_name)
-            return np_data, qint4
-
-        np_data = sf_file.get_slice(hf_param_name)
-        if not split_tp and split_ep:
-            split_data = np_data[self.ep_start:self.ep_stop, :]
-            return split_data, qint4
-
-        shape = np_data.get_shape()
-        if tp_axis == 1:
-            split_size = shape[1] // self.moe_tp_size
-            start = self.moe_tp_rank_id * split_size
-            stop = (self.moe_tp_rank_id + 1) * split_size
-            split_data = np_data[
-                self.ep_start:self.ep_stop,
-                start:stop] if split_ep else np_data[:, start:stop]
-        else:
-            raise ValueError(
-                "split_tp is True but tp_axis:{} is not supported.".format(
-                    tp_axis))
-        return split_data, qint4
-
-    def split_weight_by_rank(self, weight, split_axis=0):
-        if self.tp_group_size == 1:
-            return weight
-
-        shape = weight.shape
-        if split_axis == 0:
-            split_size = shape[0] // self.tp_group_size
-            start = self.tp_rank_id * split_size
-            stop = (self.tp_rank_id + 1) * split_size
-            split_data = weight[start:stop]
-        elif split_axis == 1:
-            split_size = shape[1] // self.tp_group_size
-            start = self.tp_rank_id * split_size
-            stop = (self.tp_rank_id + 1) * split_size
-            split_data = weight[:, start:stop]
-        else:
-            raise ValueError(
-                "split_axis:{} is not supported.".format(split_axis))
-        return split_data
-
-    def load_safetensors_shard(self, src_hf_dir):
-        """ load safetensors and shards """
-        raise NotImplementedError(
-            "load_safetensors_shard method is not implemented.")
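
For reference, every sharding helper in the file deleted above computes its slice the same way: divide the split axis evenly by the size of the parallel group and keep the contiguous block that belongs to the local rank. A minimal standalone sketch of that arithmetic follows; it is an illustration only, with NumPy standing in for the safetensors slice object and a hypothetical function name, not code carried by this patch.

import numpy as np

def shard(weight: np.ndarray, rank: int, group_size: int,
          split_axis: int = 0) -> np.ndarray:
    # Keep the contiguous block of `weight` owned by `rank` along `split_axis`.
    if group_size == 1:
        return weight
    dim = weight.shape[split_axis]
    if dim % group_size != 0:
        raise ValueError("axis {} of size {} is not divisible by group size {}"
                         .format(split_axis, dim, group_size))
    split_size = dim // group_size
    index = [slice(None)] * weight.ndim
    index[split_axis] = slice(rank * split_size, (rank + 1) * split_size)
    return weight[tuple(index)]

# Example: rank 1 of a 4-way tensor-parallel group takes rows 256:512.
assert shard(np.zeros((1024, 768)), rank=1, group_size=4).shape == (256, 768)
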
self.model._set_jit_graph_name("prefill_lora") + self.set_model_inputs(input_ids, positions, + intermediate_tensors, inputs_embeds) + model_output = self.lora_prefill_graph(**model_inputs) else: - self.model.phase = "increment" - if self.decode_graph is None: + # self.model.phase = "increment" + # if self.decode_graph is None: + # set_model_context("is_prefill", False) + # self.model._set_jit_graph_name("decode") + # self.set_model_inputs(input_ids, positions, + # intermediate_tensors, inputs_embeds) + # self.decode_graph = ms.jit(function=self.model, jit_level="O0") + # model_output = self.decode_graph(**model_inputs) + if self.decode_graph is None or self.lora_decode_graph is None: set_model_context("is_prefill", False) + if get_model_context("no_lora"): + self.model.phase = "increment" + self.model._set_jit_graph_name("decode") + self.set_model_inputs(input_ids, positions, + intermediate_tensors, inputs_embeds) + self.decode_graph = ms.jit(function=self.model, jit_level="O0") + else: + self.model.phase = "increment_lora" + self.model._set_jit_graph_name("decode_lora") + self.set_model_inputs(input_ids, positions, + intermediate_tensors, inputs_embeds) + self.lora_decode_graph = ms.jit(function=self.model, + jit_level="O0") + if get_model_context("no_lora"): + self.model.phase = "increment" self.model._set_jit_graph_name("decode") self.set_model_inputs(input_ids, positions, - intermediate_tensors, inputs_embeds) - self.decode_graph = ms.jit(function=self.model, jit_level="O0") - model_output = self.decode_graph(**model_inputs) + intermediate_tensors, inputs_embeds) + model_output = self.decode_graph(**model_inputs) + else: + self.model.phase = "increment_lora" + self.model._set_jit_graph_name("decode_lora") + self.set_model_inputs(input_ids, positions, + intermediate_tensors, inputs_embeds) + model_output = self.lora_decode_graph(**model_inputs) + return model_output diff --git a/vllm_mindspore/model_executor/utils.py b/vllm_mindspore/model_executor/utils.py index b7187afe..3752ed69 100644 --- a/vllm_mindspore/model_executor/utils.py +++ b/vllm_mindspore/model_executor/utils.py @@ -33,7 +33,8 @@ def set_weight_attrs( setattr(weight, key, value) -_native_model_context = {"is_prefill": True} +_native_model_context = {"is_prefill": True, + "no_lora": True} def set_model_context(key, value): diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index 1c37be98..ca3199ba 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -42,10 +42,18 @@ def _get_cuda_graph_pad_size(self, # No need to use cuda graph for mindspore. return -1 +def profile_run(self) -> None: + max_num_batched_tokens = \ + self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + self._dummy_run(max_num_batched_tokens, max_num_seqs) + if self.lora_config: + self._dummy_run(max_num_batched_tokens, max_num_seqs, True) def _dummy_run(self, max_num_batched_tokens: int, - max_num_seqs: int = 1) -> None: + max_num_seqs: int = 1, + use_lora: bool = False) -> None: with self.set_in_profile_run(): # Enable top-k sampling to reflect the accurate memory usage. sampling_params = \ @@ -57,7 +65,7 @@ def _dummy_run(self, # passed in, which contains a lora from the lora warmup path. 
diff --git a/vllm_mindspore/model_executor/utils.py b/vllm_mindspore/model_executor/utils.py
index b7187afe..3752ed69 100644
--- a/vllm_mindspore/model_executor/utils.py
+++ b/vllm_mindspore/model_executor/utils.py
@@ -33,7 +33,8 @@ def set_weight_attrs(
         setattr(weight, key, value)
 
 
-_native_model_context = {"is_prefill": True}
+_native_model_context = {"is_prefill": True,
+                         "no_lora": True}
 
 
 def set_model_context(key, value):
diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py
index 1c37be98..ca3199ba 100644
--- a/vllm_mindspore/worker/model_runner.py
+++ b/vllm_mindspore/worker/model_runner.py
@@ -42,10 +42,18 @@ def _get_cuda_graph_pad_size(self,
     # No need to use cuda graph for mindspore.
     return -1
 
+def profile_run(self) -> None:
+    max_num_batched_tokens = \
+        self.scheduler_config.max_num_batched_tokens
+    max_num_seqs = self.scheduler_config.max_num_seqs
+    self._dummy_run(max_num_batched_tokens, max_num_seqs)
+    if self.lora_config:
+        self._dummy_run(max_num_batched_tokens, max_num_seqs, True)
 
 def _dummy_run(self,
                max_num_batched_tokens: int,
-               max_num_seqs: int = 1) -> None:
+               max_num_seqs: int = 1,
+               use_lora: bool = False) -> None:
     with self.set_in_profile_run():
         # Enable top-k sampling to reflect the accurate memory usage.
         sampling_params = \
@@ -57,7 +65,7 @@ def _dummy_run(self,
         # passed in, which contains a lora from the lora warmup path.
         dummy_lora_requests: List[LoRARequest] = []
         dummy_lora_requests_per_seq: List[LoRARequest] = []
-        if self.lora_config:
+        if use_lora:
             assert self.lora_manager is not None
             with self.lora_manager.dummy_lora_cache():
                 for idx in range(self.lora_config.max_loras):
@@ -172,7 +180,7 @@ def _dummy_run(self,
             self.execute_model(model_input, kv_caches, intermediate_tensors)
             torch.cuda.synchronize()
 
-        if self.lora_config:
+        if use_lora:
             # Remove dummy loras.
             assert self.lora_manager is not None
             self.remove_all_loras()
diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py
index 81db9bc4..f1d32c94 100644
--- a/vllm_mindspore/worker/worker.py
+++ b/vllm_mindspore/worker/worker.py
@@ -15,14 +15,15 @@
 # limitations under the License.
 """Adapted functions for mindspore in Worker."""
 
+from typing import List
 import math
 import subprocess
 import os
-import subprocess
 import psutil
 import torch
 
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SequenceGroupMetadata
@@ -172,7 +173,8 @@ def _prepare_input_for_warmup(model_config,
                               model_runner,
                               cache_engine,
                               is_prefill,
-                              is_mtp_model=False):
+                              is_mtp_model=False,
+                              use_lora=False):
     bs = 1
     seq_len = model_runner.scheduler_config.max_num_batched_tokens \
         if is_prefill else 1
@@ -182,6 +184,20 @@
         i for i in range(math.ceil(seq_len / cache_engine.block_size))
     ]
 
+    dummy_lora_requests: List[LoRARequest] = []
+    if use_lora:
+        assert model_runner.lora_manager is not None
+        LORA_WARMUP_RANK = 8
+        with model_runner.lora_manager.dummy_lora_cache():
+            dummy_lora_request = LoRARequest(
+                lora_name="warmup_for_decode",
+                lora_int_id=1,
+                lora_path="/not/a/real/path",
+            )
+            model_runner.lora_manager.add_dummy_lora(dummy_lora_request,
+                                                     rank=LORA_WARMUP_RANK)
+        dummy_lora_requests = dummy_lora_request
+
     # adapter multi modal warm up
     seq_data = dummy_data.seq_data
     if seq_len == 1:
@@ -194,7 +210,7 @@
             seq_data={idx: seq_data},
             sampling_params=SamplingParams(),
             block_tables={idx: block_tables_num},
-            lora_request=None,
+            lora_request=dummy_lora_requests if use_lora else None,
             multi_modal_data=None,
             multi_modal_placeholders=None,
         ) for idx in range(bs)
@@ -243,6 +259,16 @@ def _warm_up_model(self) -> None:
 
     torch.cuda.synchronize()
 
+    if self.vllm_config.lora_config is not None:
+        # warmup for lora prefill
+        #model_input, _ = _prepare_input_for_warmup(self.model_config, self.model_runner, self.cache_engine[0], True, False, True)
+        #self.model_runner.execute_model(model_input, kv_cache, None)
+        #torch.cuda.synchronize()
+        # warmup for lora decode
+        model_input, _ = _prepare_input_for_warmup(self.model_config, self.model_runner, self.cache_engine[0], False, False, True)
+        self.model_runner.execute_model(model_input, kv_cache, None)
+        torch.cuda.synchronize()
+
     # Reset the seed to ensure that the random state is not affected by
     # the model initialization and profiling.
     set_random_seed(self.model_config.seed)
-- 
Gitee