From 3b0c3806e4a0122e960dd46823cd91d460c8a35b Mon Sep 17 00:00:00 2001
From: one_east
Date: Tue, 30 Sep 2025 17:19:08 +0800
Subject: [PATCH] add exception handling to prevent the main process from crashing

---
Reviewer notes (below the "---" marker, so not part of the commit message):

* execute_command: now uses subprocess.run with a 60 s timeout; a missing
  binary, a timeout or any other error is logged as a warning instead of
  being raised. Fixed the misspelled keyword "streer" -> "stderr"; the typo
  made every call fail with TypeError, which the new "except Exception"
  then swallowed.
* get_numa_map: cpu_num_per_npu is computed with integer floor division.
* wrapper_worker_bind_cpu: an exception from bind_cpu is caught, the last
  three traceback frames are logged as warnings, and the wrapped function
  is still called.
* Leading whitespace of context/removed lines was reconstructed from a
  whitespace-collapsed copy of this patch; verify against the target file
  (or apply with --ignore-whitespace). The post-image blob id on the
  "index" line predates the "stderr" fix.

 vllm_mindspore/worker/worker.py | 44 +++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py
index 66afa92c..ebbb5086 100644
--- a/vllm_mindspore/worker/worker.py
+++ b/vllm_mindspore/worker/worker.py
@@ -18,6 +18,8 @@
 import math
 import os
 import subprocess
+import sys
+import traceback
 
 import psutil
 import torch
@@ -33,16 +35,20 @@ logger = init_logger(__name__)
 
 def execute_command(cmd_list):
     try:
-        with subprocess.Popen(cmd_list,
-                              shell=False,
-                              stdout=subprocess.PIPE,
-                              stderr=subprocess.PIPE) as p:
-            out, _ = p.communicate(timeout=1000)
-        res = out.decode()
-        return res
-    except FileNotFoundError as e:
-        message = f"Failed to execute command, because {e}."
-        raise RuntimeError(message) from e
+        result = subprocess.run(cmd_list,
+                                shell=False,
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE,
+                                timeout=60,
+                                check=False)
+        return result.stdout.decode()
+    except FileNotFoundError:
+        cmd = ' '.join(cmd_list)
+        logger.warning("Bind CPU command not found: %s", cmd)
+    except subprocess.TimeoutExpired as e:
+        logger.warning("Bind CPU command execution timed out: %s", e)
+    except Exception as e:
+        logger.warning("Bind CPU command execution failed: %s", e)
 
 
 def get_numa_map():
@@ -122,7 +128,7 @@ def get_numa_map():
         else:
             cpu_start = cpu_start + cpu_count
         shared_npu_count = len(val)
-        cpu_num_per_npu = int(cpu_count / shared_npu_count)
+        cpu_num_per_npu = cpu_count // shared_npu_count
         for npu in val:
             npu_to_core_map[npu] = [
                 cpu_start, cpu_start + cpu_num_per_npu - cpu_reserved_for_cann
@@ -154,8 +160,20 @@ def wrapper_worker_bind_cpu(fun):
     def new_fun(*arg, **kwargs):
         if not is_310p():
             # Bind CPU with wrapper when workers are initializing.
-            local_rank = kwargs.get("local_rank")
-            bind_cpu(local_rank)
+            try:
+                local_rank = kwargs.get("local_rank")
+                bind_cpu(local_rank)
+            except Exception:
+                exc_type, exc_value, exc_traceback = sys.exc_info()
+                tb_list = traceback.extract_tb(exc_traceback)
+                for frame in tb_list[-3:]:
+                    logger.warning("File \"%s\", line %s, in %s",
+                                   frame.filename, frame.lineno, frame.name)
+                    if frame.line:
+                        logger.warning(" %s", frame.line.strip())
+                logger.warning("%s: %s", exc_type.__name__, exc_value)
+                logger.warning("Bind CPU to workers failed, please check the "
+                               "stack trace above for the root cause")
         fun(*arg, **kwargs)
 
     return new_fun
-- 
Gitee