diff --git a/omni/adaptors/vllm/patches/pangu_patch.py b/omni/adaptors/vllm/patches/pangu_patch.py index 4a9cbac0b245a3f119d0948c289e4b3c7fcdfd82..802785ef5fb588704be69bfc53e7d2604ffbe50e 100644 --- a/omni/adaptors/vllm/patches/pangu_patch.py +++ b/omni/adaptors/vllm/patches/pangu_patch.py @@ -1,5 +1,7 @@ from omni.adaptors.vllm.utils import get_attr_by_names +from vllm.logger import init_logger +logger = init_logger(__name__) # The following patches are corresponding to vllm-0.9.0 def patch_pangu(): from vllm.config import ModelConfig @@ -212,7 +214,7 @@ def patch_pangu(): "deepseek_mtp"): self.method = "deepseek_mtp" if self.num_speculative_tokens > 1: - print( + logger.info( "All Deepseek MTP models only have " \ "one layer. Might need some code changes " \ "to support multiple layers." @@ -221,7 +223,7 @@ def patch_pangu(): "qwen3_mtp"): self.method = "qwen3_mtp" if self.num_speculative_tokens > 1: - print( + logger.info( "All Qwen3 MTP models only have " \ "one layer. Might need some code changes " \ "to support multiple layers." @@ -230,7 +232,7 @@ def patch_pangu(): "pangu_ultra_moe_mtp"): self.method = "pangu_ultra_moe_mtp" if self.num_speculative_tokens > 1: - print( + logger.info( "All Pangu Ultra MoE MTP models only have " \ "one layer. Might need some code changes " \ "to support multiple layers." @@ -317,4 +319,4 @@ def patch_pangu(): from omni.adaptors.vllm.entrypoints.openai.tool_parsers import register_tool register_reasoning() register_tool() - print("++++++++++++++++++++++patch_pangu++++++++++++++++++++++++++++") \ No newline at end of file + logger.info("++++++++++++++++++++++patch_pangu++++++++++++++++++++++++++++") \ No newline at end of file diff --git a/omni/models/configs/qwen3_235b_w8a8c16_a3_2p1d_d.json b/omni/models/configs/qwen3_235b_w8a8c16_a3_2p1d_d.json index 90b3e145aad5de8de2531a51b501a67627ad2a3c..6801ad2ebd1c595c6a1b6b0cd7aa3718d2f39b45 100644 --- a/omni/models/configs/qwen3_235b_w8a8c16_a3_2p1d_d.json +++ b/omni/models/configs/qwen3_235b_w8a8c16_a3_2p1d_d.json @@ -20,6 +20,7 @@ "expert_gate_up_prefetch": 24, "expert_down_prefetch": 12, "attn_prefetch": 8, - "enable_scale_parallel": true + "enable_scale_parallel": true, + "use_tnd_pa": true } } \ No newline at end of file diff --git a/omni/models/configs/qwen3_235b_w8a8c16_a3_2p1d_p.json b/omni/models/configs/qwen3_235b_w8a8c16_a3_2p1d_p.json index 7c3d0811d72704c6c039f189a9abbd0e527d165e..bdbfd2a29bc74dab159addab516e9ba33cf89fda 100644 --- a/omni/models/configs/qwen3_235b_w8a8c16_a3_2p1d_p.json +++ b/omni/models/configs/qwen3_235b_w8a8c16_a3_2p1d_p.json @@ -12,6 +12,7 @@ "control_accept_rate": -1, "enable_prefill_micro_batch": true, "experts_pruning": false, - "enable_scale_parallel": true + "enable_scale_parallel": true, + "use_tnd_pa": true } } \ No newline at end of file diff --git a/tools/omni_cli/template/cli_qwen3_235b_w8a8c16_a3_2p1d.yaml b/tools/omni_cli/template/cli_qwen3_235b_w8a8c16_a3_2p1d.yaml index a7a57716753553e6ed76c53e4549e0a5d0b84850..7c56a3d80a98fe45443a7e07a0224be6c9a7e67f 100644 --- a/tools/omni_cli/template/cli_qwen3_235b_w8a8c16_a3_2p1d.yaml +++ b/tools/omni_cli/template/cli_qwen3_235b_w8a8c16_a3_2p1d.yaml @@ -37,7 +37,7 @@ all: enable-expert-parallel: '' max-num-seqs: 32 no-enable-prefix-caching: '' - max-num-batched-tokens: 40960 + max-num-batched-tokens: 66560 gpu-util: '0.92' kv-transfer-config: engine_id: 2 @@ -118,7 +118,7 @@ all: LOCAL_DECODE_SERVER_IP_LIST: ${SERVER_IP_LIST} LOG_PATH: /tmp/log_path MASTER_PORT: 8000 - MODEL_LEN_MAX_DECODE: 40960 + MODEL_LEN_MAX_DECODE: 66560 MODEL_PATH: /data/models/Qwen3_235B_w8a8 OMNI_USE_DSV3: 1 PREFILL_POD_NUM: 2 @@ -159,7 +159,7 @@ all: enable-expert-parallel: '' max-num-seqs: 32 no-enable-prefix-caching: '' - max-num-batched-tokens: 40960 + max-num-batched-tokens: 66560 gpu-util: '0.92' kv-transfer-config: engine_id: 2 @@ -240,7 +240,7 @@ all: LOCAL_DECODE_SERVER_IP_LIST: ${SERVER_IP_LIST} LOG_PATH: /tmp/log_path MASTER_PORT: 8000 - MODEL_LEN_MAX_DECODE: 40960 + MODEL_LEN_MAX_DECODE: 66560 MODEL_PATH: /data/models/Qwen3_235B_w8a8 OMNI_USE_DSV3: 1 PREFILL_POD_NUM: 2 @@ -281,7 +281,7 @@ all: enable-expert-parallel: '' max-num-seqs: 32 no-enable-prefix-caching: '' - max-num-batched-tokens: 40960 + max-num-batched-tokens: 66560 gpu-util: '0.92' kv-transfer-config: engine_id: 2 @@ -362,7 +362,7 @@ all: LOCAL_DECODE_SERVER_IP_LIST: ${SERVER_IP_LIST} LOG_PATH: /tmp/log_path MASTER_PORT: 8000 - MODEL_LEN_MAX_DECODE: 40960 + MODEL_LEN_MAX_DECODE: 66560 MODEL_PATH: /data/models/Qwen3_235B_w8a8 OMNI_USE_DSV3: 1 PREFILL_POD_NUM: 2 @@ -403,7 +403,7 @@ all: enable-expert-parallel: '' max-num-seqs: 32 no-enable-prefix-caching: '' - max-num-batched-tokens: 40960 + max-num-batched-tokens: 66560 gpu-util: '0.92' kv-transfer-config: engine_id: 2 @@ -484,7 +484,7 @@ all: LOCAL_DECODE_SERVER_IP_LIST: ${SERVER_IP_LIST} LOG_PATH: /tmp/log_path MASTER_PORT: 8000 - MODEL_LEN_MAX_DECODE: 40960 + MODEL_LEN_MAX_DECODE: 66560 MODEL_PATH: /data/models/Qwen3_235B_w8a8 OMNI_USE_DSV3: 1 PREFILL_POD_NUM: 2 @@ -522,10 +522,10 @@ all: disable-log-requests: '' enable-expert-parallel: '' enforce-eager: '' - max-num-batched-tokens: 40960 + max-num-batched-tokens: 66560 max-num-seqs: 32 no-enable-prefix-caching: '' - gpu-util: '0.85' + gpu-util: '0.8' kv-transfer-config: engine_id: 0 kv_buffer_device: npu @@ -605,7 +605,7 @@ all: LOCAL_DECODE_SERVER_IP_LIST: '' LOG_PATH: /tmp/log_path MASTER_PORT: 8000 - MODEL_LEN_MAX_PREFILL: 40960 + MODEL_LEN_MAX_PREFILL: 66560 MODEL_PATH: /data/models/Qwen3_235B_w8a8 OMNI_USE_DSV3: 1 PREFILL_POD_NUM: 2 @@ -644,10 +644,10 @@ all: disable-log-requests: '' enable-expert-parallel: '' enforce-eager: '' - max-num-batched-tokens: 40960 + max-num-batched-tokens: 66560 max-num-seqs: 32 no-enable-prefix-caching: '' - gpu-util: '0.85' + gpu-util: '0.8' kv-transfer-config: engine_id: 1 kv_buffer_device: npu @@ -727,7 +727,7 @@ all: LOCAL_DECODE_SERVER_IP_LIST: '' LOG_PATH: /tmp/log_path MASTER_PORT: 8000 - MODEL_LEN_MAX_PREFILL: 40960 + MODEL_LEN_MAX_PREFILL: 66560 MODEL_PATH: /data/models/Qwen3_235B_w8a8 OMNI_USE_DSV3: 1 PREFILL_POD_NUM: 2