diff --git a/configs/dapo_qwen3_30b_a3b_A3.yaml b/configs/dapo_qwen3_30b_a3b_A3.yaml
index f60b4ef56ed5e20abf916bab0f70d0a7bb653586..8f00ebd0b281ed6e77cf206b4a36a3a3fe55409d 100644
--- a/configs/dapo_qwen3_30b_a3b_A3.yaml
+++ b/configs/dapo_qwen3_30b_a3b_A3.yaml
@@ -41,7 +41,7 @@ actor_config:
   moe_tp_extend_ep: true
   gemm_gradient_accumulation_fusion: true
   tensor_model_parallel_size: 4
-  pipeline_model_parallel_size: 1
+  pipeline_model_parallel_size: 2
   expert_model_parallel_size: 2
   lr: 1e-6
   lr_decay_style: constant
@@ -108,7 +108,8 @@ rl_config:
   num_npus: 16
 
 generate_config:
-  enforce_eager: true
+  enforce_eager: false
+  torchair_graph: true
   enable_expert_parallel: true
   trust_remote_code: true
   offload_train_optimizer: true
@@ -118,10 +119,10 @@ generate_config:
   # 推理时的并行配置
   infer_tensor_parallel_size: 1
   infer_pipeline_parallel_size: 1
-  infer_expert_parallel_size: 8 # 同步修改runtime_env.yaml中VLLM_DP_SIZE的值,建议保持一致
+  infer_expert_parallel_size: 16 # 同步修改runtime_env.yaml中VLLM_DP_SIZE的值,建议保持一致
 
   # vllm 模型相关设置
-  max_num_seqs: 1024
+  max_num_seqs: 16
   max_model_len: 4096
   max_num_batched_tokens: 8192
   dtype: "bfloat16"
diff --git a/configs/envs/runtime_env.yaml b/configs/envs/runtime_env.yaml
index b0e750774e3311adc1c88e309dd6ba0aaae51e08..d3199167b0ee1ea06067123dc60b01a70d499966 100644
--- a/configs/envs/runtime_env.yaml
+++ b/configs/envs/runtime_env.yaml
@@ -18,11 +18,11 @@ env_vars:
   #设置 HYDRA 是否输出完整错误日志
   HYDRA_FULL_ERROR: '1'
   # vLLM数据并行度(Data Parallelism)大小,控制数据分片数量,稠密模型需要设置为1,MOE模型要求必须和EP一致
-  VLLM_DP_SIZE: '1'
+  VLLM_DP_SIZE: '16'
   # 指定不使用 LCCL 通信
   USING_LCCL_COM: '0'
   # HCCL通信层单次传输的最大缓冲区大小(单位MB),影响跨设备通信效率
-  HCCL_BUFFSIZE: '256'
+  HCCL_BUFFSIZE: '1280'
   # 使用vLLM的V1 engine API(v1接口),兼容性选项
   VLLM_USE_V1: '1'
   # 指定使用的vLLM版本号
@@ -37,6 +37,8 @@ env_vars:
   CPU_AFFINITY_CONF: '1'
   # COC指定环境变量
   LCAL_COMM_ID: '127.0.0.1:27001'
+  # HCCL通信模式
+  HCCL_OP_EXPANSION_MODE: 'AIV'
   #指定 GLOO 框架通信网卡
   # GLOO_SOCKET_IFNAME: "Your SOCKET IFNAME"
   #指定 TP 相关通信网卡