From 091e70bf7e1c9dce636020a4df6af22f613d2400 Mon Sep 17 00:00:00 2001
From: lijiaming <1228575330@qq.com>
Date: Thu, 20 Mar 2025 20:03:44 +0800
Subject: [PATCH 1/3] add bind core script

---
 ...50\347\275\262\346\214\207\345\215\227.md" |  12 +-
 .../prepare/files/lib/fine-grainded-bind.py   | 352 ++++++++++++++++++
 .../roles/prepare/files/lib/set_env.sh        |   9 +-
 .../roles/prepare/files/lib/start_docker.sh   |   4 +-
 .../workspace/roles/prepare/files/prepare.sh  |  13 +-
 5 files changed, 376 insertions(+), 14 deletions(-)
 create mode 100644 script/mindspore-deepseek/workspace/roles/prepare/files/lib/fine-grainded-bind.py

diff --git "a/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md" "b/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md"
index e70be46..256540d 100644
--- "a/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md"
+++ "b/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md"
@@ -99,14 +99,14 @@ sh mindspore-deepseek/workspace/roles/prepare/files/lib/ascend_prepare.sh
 **Step1：下载oedeploy工具（下载到控制节点）**
 
 ```shell
-# 下载插件包并解压
-wget https://repo.oepkgs.net/openEuler/rpm/openEuler-24.03-LTS/contrib/oedp/plugins/mindspore-deepseek.tar.gz
-
-tar zxvf mindspore-deepseek.tar.gz
 # 下载安装oedp工具，例如:
 wget https://repo.oepkgs.net/openEuler/rpm/openEuler-24.03-LTS/contrib/oedp/aarch64/Packages/oedp-1.0.0-2.oe2503.aarch64.rpm
 
 yum localinstall oedp-1.0.0-2.oe2503.aarch64.rpm
+# 下载插件包
+git clone https://gitee.com/openeuler/llm_solution.git
+
+cd llm_solution/script/mindspore-deepseek
 ```
 
 **Step2：调整oedeploy配置文件**
@@ -353,11 +353,11 @@ npu-smi set -t reset -i $id -c $chip_id
 
 该步骤在宿主机执行，需在所有节点执行
 
-**Step1:** 可使用even-iso.py绑核脚本，进行细粒度绑核提升性能
+**Step1:** 可使用fine-grainded-bind.py绑核脚本，进行细粒度绑核提升性能
 
 ```shell
 # 所有节点执行
-python ./lib/even-iso.py
+python ./lib/fine-grainded-bind.py
 ```
 
 
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/fine-grainded-bind.py b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/fine-grainded-bind.py
new file mode 100644
index 0000000..ef27be7
--- /dev/null
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/fine-grainded-bind.py
@@ -0,0 +1,352 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""
+Utils for cann workqueue cores
+"""
+
+import os
+import psutil
+import subprocess
+
+def int_to_binary_list(value: int, align_length: int = 4) -> list:
+    """
+    convert a non-negative int value to binary list, most significant bit first
+    e.g. 13 => [1, 1, 0, 1]
+    values wider than align_length keep every bit, the list is never truncated
+
+    Args:
+        value (`int`):
+            The non-negative int value to convert to binary list.
+        align_length (`int`, *optional*, defaults to `4`):
+            The minimum length of the list, 0 is prepended for small value.
+
+    Returns:
+        The binary list with the value.
+
+    Raises:
+        ValueError: If value is negative.
+    """
+    if value < 0:
+        # A negative value has no fixed-width bit pattern to report here.
+        raise ValueError(f"value must be non-negative, got {value}")
+
+    # Integer formatting is exact for any magnitude, whereas halving through
+    # float division starts to round once the value exceeds 2**53.
+    bin_list = [int(digit) for digit in format(value, "b")]
+
+    # Left-pad with 0 up to align_length; a negative count yields no padding.
+    padding = [0] * (align_length - len(bin_list))
+    bin_list = padding + bin_list
+    return bin_list
+
+
+def binary_list_to_int(bin_list: list) -> int:
+    """
+    convert binary list (most significant bit first) to int value
+    e.g. [1, 1, 0, 1] => 13
+    the input list is left unmodified
+
+    Args:
+        bin_list (`list`):
+            The binary list represent to int value.
+
+    Returns:
+        The int value.
+    """
+    value = 0
+    multiplier = 1
+    # reversed() walks from the least significant bit without mutating the caller's list.
+    for v in reversed(bin_list):
+        value = value + v * multiplier
+        multiplier *= 2
+    return value
+
+
+def string_to_bit_list(array_string: str) -> list:
+    """
+    convert hex string to binary list, least significant bit first,
+    so that list index i holds bit i of the number the string encodes
+    e.g. "ff" => [1, 1, 1, 1, 1, 1, 1, 1]
+        "0a" => [0, 1, 0, 1, 0, 0, 0, 0]
+
+    Args:
+        array_string (`str`):
+            The hex digits to expand, four bits per character.
+
+    Returns:
+        The binary list for the string.
+    """
+    msb_first = []
+    for hex_digit in array_string:
+        nibble = int(hex_digit, 16)
+        msb_first.extend(int_to_binary_list(nibble))
+    return msb_first[::-1]
+
+
+class BitArray:
+    """
+    The bit array class to solve core mask string, index i is bit i of the mask.
+
+    Args:
+        length(`int`, *optional*, defaults to `0`):
+            The max bit length of the array.
+    """
+
+    def __init__(self, length: int = 0):
+        self.bits = [0] * length
+
+    def load_from_str(self, array_string: str):
+        """
+        load bit array from hex string, replacing the current bits
+
+        Args:
+            array_string (`str`):
+                The hex string of the mask, without any separator.
+
+        Returns:
+            NA.
+        """
+        self.bits = string_to_bit_list(array_string)
+
+    def get_marked_index(self) -> list:
+        """
+        get the index list with value 1
+
+        Args:
+            NA.
+
+        Returns:
+            The index list in ascending order.
+        """
+        return [
+            idx
+            for idx, item in enumerate(self.bits)
+            if item == 1
+        ]
+
+    def to_bytes_array(self) -> list:
+        """
+        convert the bit array to byte array which is 8-bit elements
+
+        Args:
+            NA.
+
+        Returns:
+            The array values with bytes, most significant byte first.
+        """
+        bytes_array = []
+        # Byte k always covers bits 8k..8k+7, so a length that is not a
+        # multiple of 8 gives a partial top byte instead of dropping the
+        # lowest bits and shifting every byte boundary.
+        top_start = (len(self.bits) - 1) // 8 * 8
+        # Walk downwards so the most significant byte comes first.
+        for start in range(top_start, -1, -8):
+            chunk = self.bits[start:start + 8]
+            value = sum(bit << offset for offset, bit in enumerate(chunk))
+            bytes_array.append(value)
+        return bytes_array
+
+    def __setitem__(self, index: int, value: int):
+        """
+        set the bit value with index
+
+        Args:
+            index (`int`):
+                The index to set value.
+            value (`int`):
+                The value to set.
+
+        Returns:
+            NA.
+        """
+        self.bits[index] = value
+
+    def __getitem__(self, index: int) -> int:
+        """
+        get the bit value with index
+
+        Args:
+            index (`int`):
+                The index to get value.
+
+        Returns:
+            The value to get.
+        """
+        return self.bits[index]
+
+
+
+def get_cann_workqueue_cores(device_id: int) -> list:
+    """
+    get cann workqueue binding cores list
+    for most system, the config is set on path:
+    /sys/devices/virtual/workqueue/dev0_sq_send_wq/cpumask
+
+    Args:
+        device_id (`int`):
+            The device_id for the workqueue, most time is related to rank id.
+
+    Returns:
+        The marked core index list, or [] when the cpumask file does not exist.
+    """
+    cann_workqueue_config_path = f"/sys/devices/virtual/workqueue/dev{device_id}_sq_send_wq/cpumask"
+    if not os.path.exists(cann_workqueue_config_path):
+        # no this config, return [] to disable cann binding
+        return []
+
+    with open(cann_workqueue_config_path) as f:
+        cann_config = f.read()
+    # Drop the group separators and the newline, leaving only hex digits.
+    cann_config = cann_config.replace(",", "").replace("\n", "")
+    mask_array = BitArray()
+    mask_array.load_from_str(cann_config)
+    return mask_array.get_marked_index()
+
+
+def mask_to_str(mask: BitArray) -> str:
+    """
+    convert BitArray mask to string format with workqueue config
+
+    Args:
+        mask (`BitArray`):
+            The BitArray mask to convert to string.
+
+    Returns:
+        The hex string in comma separated 32-bit groups, highest group first.
+    """
+    mask_bytes = mask.to_bytes_array()
+    hex_digits = "".join('{:02x}'.format(mask_value) for mask_value in mask_bytes)
+    group_width = 8  # 4 bytes == 32 bits per comma-separated group
+    # Split from the right so groups stay aligned to bit 0; only the leading
+    # group may be short. Slicing off a trailing "," unconditionally would
+    # eat a hex digit whenever the byte count is not a multiple of 4.
+    groups = [
+        hex_digits[max(end - group_width, 0):end]
+        for end in range(len(hex_digits), 0, -group_width)
+    ]
+    return ",".join(reversed(groups))
+
+
+def execute_cmd(cmd: str, fake: bool = False):
+    """
+    execute shell command
+
+    Args:
+        cmd (`str`):
+            The command need to execute.
+        fake (`bool`, *optional*, defaults to `False`):
+            If fake execute is True, then print command instead to execute.
+
+    Raises:
+        SystemError: If the command exits with a non-zero status.
+    """
+    if fake:
+        print(cmd)
+        return
+    # shell=True is required: callers pass redirections such as `echo ... > file`.
+    ret = subprocess.call(cmd, shell=True)
+    if ret != 0:
+        raise SystemError(f"Execute cmd({cmd}) failed!")
+
+def execute_command(cmd_list):
+    """Run cmd_list without a shell and return its stdout decoded as text."""
+    try:
+        with subprocess.Popen(cmd_list, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
+            out, _ = p.communicate(timeout=1000)
+        return out.decode()
+    except FileNotFoundError as e:
+        raise RuntimeError(f"Failed to execute command, because {e}.")
+
+def get_numa_map():
+    numa_topo_out = execute_command(["npu-smi", "info", "-t", "topo"]).strip().split("\n")
+
+    line_no = 0
+    npu_no = 0
+    numa_to_npu_map = {}
+    
+    for val in numa_topo_out:
+        line_no += 1
+        line = ''.join(val.split())
+        if line.startswith("NPU") and line_no > 1:
+            cpu_range = line[33:]
+            if numa_to_npu_map.get(cpu_range, None) is None:
+                numa_to_npu_map[cpu_range] = list()
+            numa_to_npu_map[cpu_range].append(npu_no)
+            npu_no += 1
+
+    npu_to_core_map = {}
+    for key, val in numa_to_npu_map.items():
+        cpu_range = key.split("-")
+        cpu_start = int(cpu_range[0])
+        cpu_end = int(cpu_range[1])
+        #total_core_num = cpu_end - cpu_start + 1
+        #shared_npu_num = len(val)
+        #core_num_per_npu = int(total_core_num / shared_npu_num)
+        core_num_per_npu = cpu_end - cpu_start + 1
+        core_start = cpu_start
+        for npu in val:
+            npu_to_core_map[npu] = core_start + core_num_per_npu - 1
+            core_start += core_num_per_npu
+
+    return npu_to_core_map
+
+
+def binding_cann_workqueue(device_num: int, core_num_per_workqueue: int, separate_device_cores: bool):
+    """
+    binding cann workqueue cores
+
+    Args:
+        device_num (`int`):
+            The total device number on the server.
+        core_num_per_workqueue (`int`):
+            The core number for each workqueue, the core index will alloc from end core index for each device.
+        separate_device_cores (`bool`):
+            If separate device cores, each device workqueue binding itself cores,
+            otherwise, all device workqueue binding to same cores.
+
+    Returns:
+        NA.
+    """
+    print("the cann workqueue config command list in the follow, please execute the cmd by root user!")
+    total_core_num = psutil.cpu_count(logical=True)
+    core_num_per_device = int(total_core_num / device_num)
+
+    device_core_mask = BitArray(total_core_num)
+    end_core_map = get_numa_map()
+    for i in range(device_num):
+        cann_workqueue_config_path = f"/sys/devices/virtual/workqueue/dev{i}_sq_send_wq/cpumask"
+        mask = BitArray(total_core_num)
+        # Mark the workqueue cores downwards from the end core index of this device.
+        end_core_num = end_core_map[i]
+        for j in range(core_num_per_workqueue):
+            core_index = end_core_num - j
+            mask[core_index] = 1
+            device_core_mask[core_index] = 1
+
+        if separate_device_cores:
+            mask_str = mask_to_str(mask)
+            bind_cann_core_cmd = f"echo \"{mask_str}\" > {cann_workqueue_config_path}"
+            execute_cmd(bind_cann_core_cmd, False)
+
+    if not separate_device_cores:
+        device_core_mask_str = mask_to_str(device_core_mask)
+        for i in range(device_num):
+            cann_workqueue_config_path = f"/sys/devices/virtual/workqueue/dev{i}_sq_send_wq/cpumask"
+            bind_cann_core_cmd = f"echo \"{device_core_mask_str}\" > {cann_workqueue_config_path}"
+            execute_cmd(bind_cann_core_cmd, False)
+
+if __name__ == "__main__":
+    binding_cann_workqueue(8, 4, True)
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
index e43b877..85fca8b 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
@@ -20,6 +20,7 @@ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 export ASCEND_TOTAL_MEMORY_GB=64
 export HCCL_CONNECT_TIMEOUT=7200
 export MS_COMPILER_CACHE_ENABLE=1
+export CPU_AFFINITY=0
 '
 
 NET_ENV="
@@ -29,17 +30,15 @@ export HCCL_SOCKET_IFNAME=$RAY_DEVICE
 "
 
 if [ $NODE_NUM -eq 2 ]; then
-	YAML_FILE='/root/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml'
+	YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml'
 elif [ $NODE_NUM -eq 4 ]; then
-	YAML_FILE='/root/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml'
+	YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml'
 fi
 
 
 # 修改权重类型
 sed -e 's/^load_ckpt_format.*/load_ckpt_format: "'$MODEL_TYPE'"/' -i $YAML_FILE
-if [ "$MODEL_TYPE" = "ckpt" ]; then
-    sed -e 's/^auto_trans_ckpt.*/auto_trans_ckpt: False/' -i $YAML_FILE
-fi
+sed -e 's/^auto_trans_ckpt.*/auto_trans_ckpt: False/' -i $YAML_FILE
 
 YAML_ENV="export MINDFORMERS_MODEL_CONFIG=$YAML_FILE"
 
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh
index 5e97f7f..0c7c508 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh
@@ -25,7 +25,7 @@ if [ $IS_STOP_OTHER_CONTAINER -ne 0 ]; then
 fi
 
 # 如果存在名称相同的容器，则直接使用
-docker ps -a | grep $IMAGE_NAME:$IMAGE_TAG | grep $CONTAINER_NAME
+docker ps -a | grep $IMAGE_NAME:$IMAGE_TAG | grep -w $CONTAINER_NAME
 if [ $? -eq 0 ]; then
     echo "发现容器 $CONTAINER_NAME 已存在，直接使用"
     docker start $CONTAINER_NAME
@@ -33,7 +33,7 @@ if [ $? -eq 0 ]; then
 fi
 
 # 如果存在名称相同，但镜像不同容器，则报错
-docker ps -a | grep $CONTAINER_NAME
+docker ps -a | grep -w $CONTAINER_NAME
 if [ $? -eq 0 ]; then
     echo "发现容器名称 $CONTAINER_NAME 已被使用，请排查"
     exit 1
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
index c000112..2504c2b 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
@@ -43,7 +43,7 @@ main() {
     fi
 
     # 检测需要部署的节点ip数量
-    if [ [ $NODE_NUM -ne 2 ] && [ $NODE_NUM -ne 4 ] ]; then
+    if [ $NODE_NUM -ne 2 ] && [ $NODE_NUM -ne 4 ]; then
         echo "当前仅支持两/四节点部署,当前数量是$NODE_NUM"
         exit 1
     fi
@@ -59,6 +59,17 @@ main() {
     # 3. 设置容器内环境变量
     docker exec -it $CONTAINER_NAME /workspace/lib/set_env.sh
 
+    # 4. 进行绑核
+    pip list | grep psutil
+    if [ $? -ne 0 ]; then
+        pip install psutil
+    fi
+    python $current_path/lib/fine-grainded-bind.py
+    if [ $? -ne 0 ]; then
+        echo "细粒度线程绑核失败，请确保驱动版本>=24.1.0"
+        exit 1
+    fi
+
 }
 
 # 执行主函数
-- 
Gitee


From 8334ac2eea177f0bef0c6635eeebef75e45fb025 Mon Sep 17 00:00:00 2001
From: lijiaming <1228575330@qq.com>
Date: Wed, 26 Mar 2025 21:42:04 +0800
Subject: [PATCH 2/3] support int4

---
 .../roles/prepare/files/lib/ray_start.sh      |  4 ++
 .../roles/prepare/files/lib/set_env.sh        | 21 +++++---
 .../roles/prepare/files/lib/start_ds.sh       | 51 +++++++++++--------
 .../workspace/roles/prepare/files/prepare.sh  | 40 ++-------------
 4 files changed, 53 insertions(+), 63 deletions(-)

diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
index 826eb12..c4d243a 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
@@ -8,6 +8,10 @@ source $current_path/config.cfg
 source $ENV_FILE
 ray_start() {
     ps -ef | grep "python" | grep -v grep | awk '{print $2}' | xargs kill
+    if [ $NODE_NUM -eq 1 ]; then
+        echo "单机部署无需启动ray"
+        return
+    fi
     ray stop
 
     if [ "$1" ]; then
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
index 85fca8b..076198a 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
@@ -29,13 +29,21 @@ export TP_SOCKET_IFNAME=$RAY_DEVICE
 export HCCL_SOCKET_IFNAME=$RAY_DEVICE
 "
 
-if [ $NODE_NUM -eq 2 ]; then
-	YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml'
+if [ $NODE_NUM -eq 1 ]; then
+    YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml'
+    cat $YAML_FILE | grep gptq-pergroup
+    if [ $? -ne 0 ]; then
+        sed -e 's/model_parallel:.*/model_parallel: 8/' -i $YAML_FILE
+        sed -e "s/quant_method:.*/quant_method: 'gptq-pergroup'/" -i $YAML_FILE
+        sed -e 's/weight_dtype/#weight_dtype/' -i $YAML_FILE
+        sed -e 's/activation_dtype/#activation_dtype/' -i $YAML_FILE
+    fi
+elif [ $NODE_NUM -eq 2 ]; then
+    YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml'
 elif [ $NODE_NUM -eq 4 ]; then
-	YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml'
+    YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml'
 fi
 
-
 # 修改权重类型
 sed -e 's/^load_ckpt_format.*/load_ckpt_format: "'$MODEL_TYPE'"/' -i $YAML_FILE
 sed -e 's/^auto_trans_ckpt.*/auto_trans_ckpt: False/' -i $YAML_FILE
@@ -50,7 +58,8 @@ if grep -q "openeuler_deepseek_env_config" /root/.bashrc; then
 fi
 
 echo "$ENV_ARG" >> $ENV_FILE
-echo "$NET_ENV" >> $ENV_FILE
 echo "$YAML_ENV" >> $ENV_FILE
+if [ $NODE_NUM -ne 1 ]; then
+    echo "$NET_ENV" >> $ENV_FILE
+fi
 source $ENV_FILE
-
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
index 0a637ae..ca7089a 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
@@ -8,33 +8,40 @@ source $current_path/config.cfg
 source $ENV_FILE
 # 仅主节点运行
 
-if [ $NODE_NUM -eq 2 ]; then
-	NPU_NUM=16.0
-    PARALLEL=16
-elif [ $NODE_NUM -eq 4 ]; then
-	NPU_NUM=32.0
-    PARALLEL=32
-fi
-
-ray_status=0
-for i in {1..10}; do
-    ray status | grep "$NPU_NUM NPU"
-    if [ $? -eq 0 ]; then
-        echo "ray集群已全部拉起"
-        ray_status=1
-        break
+if [ $NODE_NUM -ne 1 ]; then
+    if [ $NODE_NUM -eq 2 ]; then
+        NPU_NUM=16.0
+        PARALLEL=16
+    elif [ $NODE_NUM -eq 4 ]; then
+        NPU_NUM=32.0
+        PARALLEL=32
     fi
-    sleep 3
-done
 
-if [ $ray_status -eq 0 ]; then
-    echo "ray集群超时"
-    exit 1
+    ray_status=0
+    for i in {1..10}; do
+        ray status | grep "$NPU_NUM NPU"
+        if [ $? -eq 0 ]; then
+            echo "ray集群已全部拉起"
+            ray_status=1
+            break
+        fi
+        sleep 3
+    done
+
+    if [ $ray_status -eq 0 ]; then
+        echo "ray集群超时"
+        exit 1
+    fi
 fi
 
 #拉起服务
 rm -rf ds.log
-nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --distributed-executor-backend=ray &> ds.log &
+if [ $NODE_NUM -ne 1 ]; then
+    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --distributed-executor-backend=ray &> ds.log &
+else
+    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=8 --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 &> ds.log &
+fi
+
 #检测推理服务是否拉起
 llm_status=0
 for i in {1..7200}; do
@@ -50,4 +57,4 @@ done
 if [ $llm_status -eq 0 ]; then
     echo "推理服务拉起超时，请手动确认"
     exit 1
-fi
\ No newline at end of file
+fi
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
index 2504c2b..8551ec6 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
@@ -16,55 +16,25 @@ main() {
     systemctl stop firewalld
     systemctl stop iptables
 
-    # 检查防火墙是否启动，如果启动则检查端口是否在防火墙白名单中，如果不存在则添加到白名单中
-    status=$(systemctl status firewalld | grep -E "Active" | awk -F":" '{print $2}' | awk -F" " '{print $1}')
-    if [[ "${status}" == "active" ]]; then
-        # ray 端口防火墙检查
-        port_ray=$(firewall-cmd --query-port=$RAY_PORT/tcp)
-        if [[ "${port_ray}" == "no" ]]; then
-            port_ray=$(firewall-cmd --zone=public --add-port=$RAY_PORT/tcp --permanent)
-            firewall-cmd --reload
-        fi
-        port_ray=$(firewall-cmd --query-port=$RAY_PORT/tcp)
-        if [[ "${port_ray}" != "yes" ]]; then
-            echo -e "防火墙开启 $RAY_PORT端口失败"
-            exit 1
-        fi
-        port_llm=$(firewall-cmd --query-port=$LLM_PORT/tcp)
-        if [[ "${port_llm}" == "no" ]]; then
-            port_llm=$(firewall-cmd --zone=public --add-port=$LLM_PORT/tcp --permanent)
-            firewall-cmd --reload
-        fi
-        port_llm=$(firewall-cmd --query-port=$LLM_PORT/tcp)
-        if [[ "${port_llm}" != "yes" ]]; then
-            echo -e "防火墙开启 $LLM_PORT端口失败"
-            exit 1
-        fi
-    fi
-
-    # 检测需要部署的节点ip数量
-    if [ $NODE_NUM -ne 2 ] && [ $NODE_NUM -ne 4 ]; then
-        echo "当前仅支持两/四节点部署,当前数量是$NODE_NUM"
-        exit 1
-    fi
-
     # 1. 启动Docker容器并复制文件
     $current_path/lib/start_docker.sh
     cp_into_container
 
     # 2. 执行组网检查
-    $current_path/lib/net_check.sh
+    if [ $NODE_NUM -ne 1 ]; then
+        $current_path/lib/net_check.sh
+    fi
 
     #进入容器执行
     # 3. 设置容器内环境变量
     docker exec -it $CONTAINER_NAME /workspace/lib/set_env.sh
 
     # 4. 进行绑核
-    pip list | grep psutil
+    pip show psutil
     if [ $? -ne 0 ]; then
         pip install psutil
     fi
-    python $current_path/lib/fine-grainded-bind.py
+    python3 $current_path/lib/fine-grainded-bind.py
     if [ $? -ne 0 ]; then
         echo "细粒度线程绑核失败，请确保驱动版本>=24.1.0"
         exit 1
-- 
Gitee


From ea9bf2ef4edd8482a776cada27dd0fe06f20e4b1 Mon Sep 17 00:00:00 2001
From: lijiaming <1228575330@qq.com>
Date: Sat, 29 Mar 2025 19:19:25 +0800
Subject: [PATCH 3/3] add ray bind-core script

---
 ...50\347\275\262\346\214\207\345\215\227.md" |  4 +-
 ...ded-bind.py => fine-grainded-bind-cann.py} | 44 +++++++++++++++----
 .../roles/prepare/files/lib/start_docker.sh   |  1 +
 .../roles/prepare/files/lib/start_ds.sh       |  4 +-
 .../workspace/roles/prepare/files/prepare.sh  |  8 ++--
 5 files changed, 43 insertions(+), 18 deletions(-)
 rename script/mindspore-deepseek/workspace/roles/prepare/files/lib/{fine-grainded-bind.py => fine-grainded-bind-cann.py} (85%)

diff --git "a/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md" "b/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md"
index 256540d..e18c466 100644
--- "a/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md"
+++ "b/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md"
@@ -353,11 +353,11 @@ npu-smi set -t reset -i $id -c $chip_id
 
 该步骤在宿主机执行，需在所有节点执行
 
-**Step1:** 可使用fine-grainded-bind.py绑核脚本，进行细粒度绑核提升性能
+**Step1:** 可用绑核脚本，进行细粒度绑核提升性能
 
 ```shell
 # 所有节点执行
-python ./lib/fine-grainded-bind.py
+python ./lib/fine-grainded-bind-cann.py
 ```
 
 
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/fine-grainded-bind.py b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/fine-grainded-bind-cann.py
similarity index 85%
rename from script/mindspore-deepseek/workspace/roles/prepare/files/lib/fine-grainded-bind.py
rename to script/mindspore-deepseek/workspace/roles/prepare/files/lib/fine-grainded-bind-cann.py
index ef27be7..1986b6a 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/fine-grainded-bind.py
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/fine-grainded-bind-cann.py
@@ -270,40 +270,66 @@ def execute_command(cmd_list):
     except FileNotFoundError as e:
         raise RuntimeError(f"Failed to execute command, because {e}.")
 
-def get_numa_map():
+def get_numa_map(affinity: bool, core_num_per_workqueue: int):
     numa_topo_out = execute_command(["npu-smi", "info", "-t", "topo"]).strip().split("\n")
 
     line_no = 0
     npu_no = 0
     numa_to_npu_map = {}
-    
+    numa_number = 0
+    max_cpu = 0
+
+    # Strip each line: lscpu may indent entries under a bare section header such as "NUMA:".
+    numa_node = execute_command(["lscpu"]).strip().split("\n")
+    for val in (raw.strip() for raw in numa_node):
+        if val.startswith("CPU(s):"):
+            max_cpu = int(val.split(" ")[-1]) - 1
+        if val.startswith("NUMA node(s):"):
+            numa_number = int(val.split(" ")[-1])
+            break
+
+    npu_max_cpu = False
+    npu_max_cpu_no = 0
     for val in numa_topo_out:
         line_no += 1
         line = ''.join(val.split())
         if line.startswith("NPU") and line_no > 1:
             cpu_range = line[33:]
+            npu_max_cpu_no = max(npu_max_cpu_no, int(cpu_range.split("-")[1]))
             if numa_to_npu_map.get(cpu_range, None) is None:
                 numa_to_npu_map[cpu_range] = list()
             numa_to_npu_map[cpu_range].append(npu_no)
             npu_no += 1
 
+    npu_max_cpu = True if npu_max_cpu_no==max_cpu else False
+    print(len(numa_to_npu_map), npu_no, numa_number, max_cpu, npu_max_cpu_no, npu_max_cpu)
+    shared_mode = False
+    if npu_no > numa_number:
+        shared_mode = True
+        print("Shared mode")
+
     npu_to_core_map = {}
     for key, val in numa_to_npu_map.items():
         cpu_range = key.split("-")
         cpu_start = int(cpu_range[0])
         cpu_end = int(cpu_range[1])
-        #total_core_num = cpu_end - cpu_start + 1
-        #shared_npu_num = len(val)
-        #core_num_per_npu = int(total_core_num / shared_npu_num)
-        core_num_per_npu = cpu_end - cpu_start + 1
+        if shared_mode:
+            total_core_num = cpu_end - cpu_start + 1
+            shared_npu_num = len(val)
+            core_num_per_npu = int(total_core_num / shared_npu_num)
+        else:
+            core_num_per_npu = cpu_end - cpu_start + 1 if npu_max_cpu==False else -(cpu_end - cpu_start + 1)
         core_start = cpu_start
         for npu in val:
             npu_to_core_map[npu] = core_start + core_num_per_npu - 1
-            core_start += core_num_per_npu
+            if affinity == False:
+                core_start += core_num_per_npu
+            else:
+                core_start -= core_num_per_workqueue
+                #core_start -= core_num_per_npu//2
 
     return npu_to_core_map
 
-
 def binding_cann_workqueue(device_num: int, core_num_per_workqueue: int, separate_device_cores: bool):
     """
     binding cann workqueue cores
@@ -325,7 +351,7 @@ def binding_cann_workqueue(device_num: int, core_num_per_workqueue: int, separat
     core_num_per_device = int(total_core_num / device_num)
 
     device_core_mask = BitArray(total_core_num)
-    end_core_map = get_numa_map()
+    end_core_map = get_numa_map(True, core_num_per_workqueue)
     for i in range(device_num):
         cann_workqueue_config_path = f"/sys/devices/virtual/workqueue/dev{i}_sq_send_wq/cpumask"
         mask = BitArray(total_core_num)
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh
index 0c7c508..cd1421c 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh
@@ -8,6 +8,7 @@ current_path=$(
 source $current_path/config.cfg
 # 安装docker
 yum install docker -y
+systemctl restart docker
 
 # 检测镜像是否已被拉取
 docker images | grep $IMAGE_NAME | grep $IMAGE_TAG
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
index ca7089a..49a7bd2 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
@@ -37,9 +37,9 @@ fi
 #拉起服务
 rm -rf ds.log
 if [ $NODE_NUM -ne 1 ]; then
-    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --distributed-executor-backend=ray &> ds.log &
+    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=128 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --disable-async-output-proc --distributed-executor-backend=ray &> ds.log &
 else
-    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=8 --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 &> ds.log &
+    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=8 --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=128 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --disable-async-output-proc &> ds.log &
 fi
 
 #检测推理服务是否拉起
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
index 8551ec6..bf01352 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
@@ -30,11 +30,9 @@ main() {
     docker exec -it $CONTAINER_NAME /workspace/lib/set_env.sh
 
     # 4. 进行绑核
-    pip show psutil
-    if [ $? -ne 0 ]; then
-        pip install psutil
-    fi
-    python3 $current_path/lib/fine-grainded-bind.py
+    echo 3 > /proc/sys/vm/drop_caches
+    pip install psutil
+    python3 $current_path/lib/fine-grainded-bind-cann.py
     if [ $? -ne 0 ]; then
         echo "细粒度线程绑核失败，请确保驱动版本>=24.1.0"
         exit 1
-- 
Gitee