From b0b6e44099fac4f8d7eebb96c009d4ec8e9508b4 Mon Sep 17 00:00:00 2001
From: one_east
Date: Mon, 27 Jan 2025 12:31:17 +0800
Subject: [PATCH] load checkpoint from huggingface

---
 vllm_mindspore/__init__.py                      | 12 ++----
 .../model_executor/model_loader/__init__.py     | 32 ----------------
 vllm_mindspore/utils.py                         |  4 --
 vllm_mindspore/worker/cache_engine.py           |  7 +---
 vllm_mindspore/worker/worker.py                 | 38 -------------------
 5 files changed, 5 insertions(+), 88 deletions(-)

diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py
index 0d3627e81..88d87e9be 100644
--- a/vllm_mindspore/__init__.py
+++ b/vllm_mindspore/__init__.py
@@ -51,14 +51,12 @@ from vllm_mindspore.utils import (
     memory_profiling,
     make_tensor_with_pad,
     async_tensor_h2d,
-    get_dtype_size,
 )
 
 vllm.utils.direct_register_custom_op = direct_register_custom_op
 vllm.utils.memory_profiling = memory_profiling
 vllm.utils.make_tensor_with_pad = make_tensor_with_pad
 vllm.utils.async_tensor_h2d = async_tensor_h2d
-vllm.utils.get_dtype_size = get_dtype_size
 
 from vllm_mindspore.model_executor.models.registry import (
     MindSporeModelRegistry,
@@ -80,11 +78,6 @@ vllm.model_executor.model_loader.loader.get_model_architecture = (
 )
 vllm.model_executor.models.registry._run_in_subprocess = _run_in_subprocess
 
-from vllm_mindspore.model_executor.model_loader import get_ms_model_loader, get_ms_model
-
-vllm.model_executor.model_loader.get_model_loader = get_ms_model_loader
-vllm.model_executor.model_loader.get_model = get_ms_model
-
 from vllm_mindspore.model_executor.sampling_metadata import (
     SequenceGroupToSample,
     SamplingMetadataCache,
@@ -155,16 +148,17 @@ vllm.distributed.ensure_kv_transfer_initialized = ensure_kv_transfer_initialized
 vllm.distributed.model_parallel_is_initialized = model_parallel_is_initialized
 vllm.distributed.ensure_model_parallel_initialized = ensure_model_parallel_initialized
 
+from vllm_mindspore.model_executor.model_loader import safetensors_weights_iterator
+vllm.model_executor.model_loader.loader.safetensors_weights_iterator = safetensors_weights_iterator
+
 from vllm_mindspore.worker.worker import (
     _warm_up_model,
     determine_num_available_blocks,
-    prepare_worker_input,
 )
 from vllm.worker.worker import Worker
 
 Worker._warm_up_model = _warm_up_model
 Worker.determine_num_available_blocks = determine_num_available_blocks
-Worker.prepare_worker_input = prepare_worker_input
 vllm.worker.worker_base.get_pp_group = get_pp_group
 
 from vllm_mindspore.worker.model_runner import _get_cuda_graph_pad_size
diff --git a/vllm_mindspore/model_executor/model_loader/__init__.py b/vllm_mindspore/model_executor/model_loader/__init__.py
index 328f68b23..8e303d706 100644
--- a/vllm_mindspore/model_executor/model_loader/__init__.py
+++ b/vllm_mindspore/model_executor/model_loader/__init__.py
@@ -177,35 +177,3 @@ class MsModelLoader(BaseModelLoader):
                 with device_loading_context(module, target_device):
                     quant_method.process_weights_after_loading(module)
         return model.eval()
-
-
-def get_ms_model_loader(load_config: LoadConfig) -> BaseModelLoader:
-    """Get a model loader based on the load format."""
-
-    if isinstance(load_config.load_format, type):
-        return load_config.load_format(load_config)
-
-    if load_config.load_format == LoadFormat.DUMMY:
-        raise RuntimeError("Donot support for mindspore model now!")
-
-    if load_config.load_format == LoadFormat.TENSORIZER:
-        raise RuntimeError("Donot support for mindspore model now!")
-
-    if load_config.load_format == LoadFormat.SHARDED_STATE:
-        raise RuntimeError("Donot support for mindspore model now!")
-
-    if load_config.load_format == LoadFormat.BITSANDBYTES:
-        raise RuntimeError("Donot support for mindspore model now!")
-
-    if load_config.load_format == LoadFormat.GGUF:
-        raise RuntimeError("Donot support for mindspore model now!")
-
-    if load_config.load_format == LoadFormat.RUNAI_STREAMER:
-        raise RuntimeError("Donot support for mindspore model now!")
-
-    return MsModelLoader(load_config)
-
-
-def get_ms_model(*, vllm_config: VllmConfig) -> nn.Module:
-    loader = get_ms_model_loader(vllm_config.load_config)
-    return loader.load_model(vllm_config=vllm_config)
diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py
index 0530d826d..94793cedb 100644
--- a/vllm_mindspore/utils.py
+++ b/vllm_mindspore/utils.py
@@ -176,7 +176,3 @@ def async_tensor_h2d(
     t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="CPU")
 
     return t
-
-def get_dtype_size(dtype: torch.dtype) -> int:
-    """Get the size of the data type in bytes."""
-    return torch.tensor([1], dtype=dtype).itemsize
diff --git a/vllm_mindspore/worker/cache_engine.py b/vllm_mindspore/worker/cache_engine.py
index 469c3daa2..a3b7c77f6 100644
--- a/vllm_mindspore/worker/cache_engine.py
+++ b/vllm_mindspore/worker/cache_engine.py
@@ -39,11 +39,8 @@ STR_DTYPE_TO_MS_DTYPE = {
 
 
 def create_block(shape, dtype, name=None, device=None):
-    # from mindspore.ops.function.array_func import empty as empty_tensor
-    # blocks = empty_tensor(shape, dtype=dtype, device=device)
-    blocks = ms.Parameter(
-        ms.Tensor(shape=shape, dtype=dtype, init=Zero()), name=name, requires_grad=False
-    )
+    from mindspore.ops.function.array_func import empty as empty_tensor
+    blocks = empty_tensor(shape, dtype=dtype, device=device)
     return blocks
 
 
diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py
index 36ac4c337..1a8655de1 100644
--- a/vllm_mindspore/worker/worker.py
+++ b/vllm_mindspore/worker/worker.py
@@ -45,41 +45,3 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
 
     # TODO(tronzhang): use env latter...
     return 256, 512
-
-def prepare_worker_input(self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
-    virtual_engine = execute_model_req.virtual_engine
-    num_steps = execute_model_req.num_steps
-    num_seq_groups = len(execute_model_req.seq_group_metadata_list)
-    # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors.
-    # they contain parameters to launch cudamemcpyasync.
-    if execute_model_req.blocks_to_swap_in:
-        blocks_to_swap_in = torch.tensor(
-            execute_model_req.blocks_to_swap_in, dtype=torch.int64
-        ).view(-1, 2)
-    else:
-        blocks_to_swap_in = _create_empty_tensor(torch.int64)
-
-    if execute_model_req.blocks_to_swap_out:
-        blocks_to_swap_out = torch.tensor(
-            execute_model_req.blocks_to_swap_out, dtype=torch.int64
-        ).view(-1, 2)
-    else:
-        blocks_to_swap_out = _create_empty_tensor(torch.int64)
-    # `blocks_to_copy` is a gpu tensor. The src and tgt of
-    # blocks to copy are in the same device, and `blocks_to_copy`
-    # can be used directly within cuda kernels.
-    if execute_model_req.blocks_to_copy:
-        blocks_to_copy = torch.tensor(
-            execute_model_req.blocks_to_copy, dtype=torch.int64
-        ).view(-1, 2)
-    else:
-        blocks_to_copy = _create_empty_tensor(torch.int64)
-
-    return WorkerInput(
-        num_seq_groups=num_seq_groups,
-        blocks_to_swap_in=blocks_to_swap_in,
-        blocks_to_swap_out=blocks_to_swap_out,
-        blocks_to_copy=blocks_to_copy,
-        virtual_engine=virtual_engine,
-        num_steps=num_steps,
-    )
--
Gitee
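
Note on the new loading path: the hunk in vllm_mindspore/__init__.py redirects
vLLM's loader to a MindSpore-backed safetensors_weights_iterator exported from
vllm_mindspore.model_executor.model_loader, so HuggingFace checkpoints are read
directly from .safetensors files. The sketch below only illustrates the shape
such an iterator could take; it is a hypothetical example (numpy-based reading
via the safetensors package, wrapping buffers into ms.Tensor), not the
implementation contained in this patch.

    from typing import Generator, List, Tuple

    import mindspore as ms
    from safetensors import safe_open

    def safetensors_weights_iterator(
        hf_weights_files: List[str],
    ) -> Generator[Tuple[str, ms.Tensor], None, None]:
        """Yield (name, tensor) pairs from HuggingFace .safetensors files."""
        for st_file in hf_weights_files:
            # framework="np" reads tensors as numpy arrays, which can be
            # wrapped into MindSpore tensors without a torch dependency.
            with safe_open(st_file, framework="np") as f:
                for name in f.keys():
                    yield name, ms.Tensor(f.get_tensor(name))

Swapping the module-level function leaves the rest of vLLM's loader untouched,
the same monkey-patching pattern this file already applies to
Worker._warm_up_model and Worker.determine_num_available_blocks.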