diff --git a/cli/convert_ckpt.py b/cli/convert_ckpt.py index 728eaab729131c3836865e98e4c3b94936b231fc..65dec11dddf37760e9a2cfba7468a1d62eb7597e 100644 --- a/cli/convert_ckpt.py +++ b/cli/convert_ckpt.py @@ -169,6 +169,10 @@ def main(): help='use bitsandbytes nf4 to quantize model.') parser.add_argument('--orm', action="store_true", default=False, help='Specify the ORM ckpt conversion, convert additional rm_head layer in ORM.') + parser.add_argument('--save-lora-to-hf', action='store_true', default=False, + help='Enable only save lora-checkpoint to hf') + parser.add_argument('--load-checkpoint-loosely', action='store_true', default=False, + help='Enable loading checkpoint not strictly.') known_args, _ = parser.parse_known_args() diff --git a/docs/features/grpo_yaml.md b/docs/features/grpo_yaml.md index e4bca60977d10cae0c5a8c0b5ca5fdb16fee0159..6387b501df4797cc893df9f171e811ff8969d83c 100644 --- a/docs/features/grpo_yaml.md +++ b/docs/features/grpo_yaml.md @@ -102,4 +102,24 @@ vllm 模型参数 可以参照 [vllm官网参数介绍](https://docs.vllm.ai/en/ * `top_k`:vllm 会先选出概率最高的 top_k 个 token,然后在这 top_k 个 token 范围内进行采样; * `min_p`:vllm 过滤掉概率低于 min_p 的词元,不参与后续的采样过程; * `temperature`:采样时的随机性参数; -* `detokenize`:是否将输出token重新转为文本; \ No newline at end of file +* `detokenize`:是否将输出token重新转为文本; + +### runtime_env 环境变量 +**( 注:位于 configs/envs/runtime_env.yaml 中 )** +* `RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES`:是否禁用 Ray 对 ASCEND_RT_VISIBLE_DEVICES 的自动设置,'true'为禁用 +* `TOKENIZERS_PARALLELISM`:设置tokenizers是否支持并行,'true'为支持 +* `NCCL_DEBUG`:NCCL Debug日志级别,VERSION、WARN、INFO、TRACE +* `PYTORCH_NPU_ALLOC_CONF`:设置缓存分配器行为 +* `HCCL_CONNECT_TIMEOUT`:HCCL 连接超时时间 +* `HCCL_EXEC_TIMEOUT`:HCCL 执行超时时间 +* `HCCL_IF_BASE_PORT`:HCCL 通信端口 +* `CUDA_DEVICE_MAX_CONNECTIONS`:设备最大连接数 +* `HYDRA_FULL_ERROR`:设置 HYDRA 是否输出完整错误日志 +* `VLLM_DP_SIZE`:vLLM数据并行度(Data Parallelism)大小,控制数据分片数量,MOE模型建议和EP一致,稠密模型设置为1 +* `HCCL_BUFFSIZE`:HCCL通信层单次传输的最大缓冲区大小(单位MB),影响跨设备通信效率 +* `VLLM_USE_V1`:使用vLLM的V1 engine API(v1接口),兼容性选项 +* 
`VLLM_VERSION`:指定使用的vLLM版本号 +* `VLLM_ENABLE_GRAPH_MODE`:启用昇腾torchair图模式优化(1=启用),提升执行效率 +* `VLLM_ENABLE_MC2`:是否启用vLLM的通算融合算子调度策略 +* `HCCL_OP_EXPANSION_MODE`:HCCL算子扩展模式(AIV=AI向量模式),启用高级通信优化 +* `VLLM_ENABLE_TOPK_OPTIMZE`:使能vLLM TOPK性能优化 diff --git a/docs/install_guide.md b/docs/install_guide.md index 7f2b363dcf1b645081be88ef89a6c5cae2d7ea3e..bdf80f2c1052deb718b3960e838fd9240d751a19 100644 --- a/docs/install_guide.md +++ b/docs/install_guide.md @@ -59,7 +59,7 @@ MindSpeed RL支持Atlas 800T A2等昇腾训练硬件形态。软件版本配套 vllm - 0.7.3 + main @@ -99,8 +99,8 @@ pip install apex-0.1.dev*.whl git clone https://github.com/vllm-project/vllm.git cd vllm git checkout 5bc1ad6cee754405464a9957e86cf3a9302e4986 -pip install -r requirements-build.txt VLLM_TARGET_DEVICE=empty pip install . +cd .. ``` ### vllm_ascend安装 diff --git a/tests/st/resharding/test_module_entry_resharding.sh b/tests/st/resharding/test_module_entry_resharding.sh deleted file mode 100644 index 100e60627f61a1b5abf0dea44924fa2ba15276a9..0000000000000000000000000000000000000000 --- a/tests/st/resharding/test_module_entry_resharding.sh +++ /dev/null @@ -1,54 +0,0 @@ -# #!/bin/bash - -# export CUDA_DEVICE_MAX_CONNECTIONS=1 -# SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) -# export PYTHONPATH=$SCRIPT_DIR/../../..:$PYTHONPATH -# export VLLM_DP_SIZE=1 -# export HCCL_BUFFSIZE=256 -# export VLLM_USE_V1=1 -# export VLLM_VERSION=0.9.0 -# export VLLM_ENABLE_GRAPH_MODE=0 -# export VLLM_ENABLE_MC2=0 -# export HCCL_OP_EXPANSION_MODE="AIV" -# export VLLM_ENABLE_TOPK_OPTIMZE=1 - -# GPUS_PER_NODE=8 -# MASTER_ADDR=localhost -# MASTER_PORT=6555 -# NNODES=1 -# NODE_RANK=0 -# WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) - -# DISTRIBUTED_ARGS=" -# --nproc_per_node $GPUS_PER_NODE \ -# --nnodes $NNODES \ -# --node_rank $NODE_RANK \ -# --master_addr $MASTER_ADDR \ -# --master_port $MASTER_PORT -# " -# PYTHON_ARGS=" -# --model-path "/data/for_dt/weights/Qwen2.5-7B-mg" \ -# --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \ -# --train-tp 4 \ -# --train-pp 2 \ -# 
--train-ep 1 \ -# --infer-tp 2 \ -# --infer-pp 1 \ -# --infer-ep 1 -# " -# PYTHON_ARGS_new=" -# --model-path "/data/for_dt/weights/Qwen2.5-7B-tp2pp2" \ -# --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \ -# --train-tp 2 \ -# --train-pp 2 \ -# --train-ep 1 \ -# --infer-tp 4 \ -# --infer-pp 1 \ -# --infer-ep 1 -# " - -# echo "start test_resharding st" - -# torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS - -# torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS_new \ No newline at end of file diff --git a/tests/st/resharding/test_module_entry_resharding_tp_expand.sh b/tests/st/resharding/test_module_entry_resharding_tp_expand.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f56a5b8f8a39cf424658e894245c51b725e235c --- /dev/null +++ b/tests/st/resharding/test_module_entry_resharding_tp_expand.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) +export PYTHONPATH=$SCRIPT_DIR/../../..:$PYTHONPATH +export VLLM_DP_SIZE=1 +export HCCL_BUFFSIZE=256 +export VLLM_USE_V1=1 +export VLLM_VERSION=0.9.0 +export VLLM_ENABLE_GRAPH_MODE=0 +export VLLM_ENABLE_MC2=0 +export HCCL_OP_EXPANSION_MODE="AIV" +export VLLM_ENABLE_TOPK_OPTIMZE=1 + +GPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6555 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +PYTHON_ARGS=" + --model-path "/data/for_dt/weights/Qwen2.5-7B-tp2pp2" \ + --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \ + --train-tp 2 \ + --train-pp 2 \ + --train-ep 1 \ + --infer-tp 4 \ + --infer-pp 1 \ + --infer-ep 1 +" + +echo "start test_resharding st: tp expand" + +torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS + diff --git a/tests/st/resharding/test_module_entry_resharding_tp_reduce.sh 
b/tests/st/resharding/test_module_entry_resharding_tp_reduce.sh
new file mode 100644
index 0000000000000000000000000000000000000000..dd2c3c99c9cc4f17f85fa83c2eb49d8f2b92862b
--- /dev/null
+++ b/tests/st/resharding/test_module_entry_resharding_tp_reduce.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
+export PYTHONPATH=$SCRIPT_DIR/../../..:$PYTHONPATH
+export VLLM_DP_SIZE=1
+export HCCL_BUFFSIZE=256
+export VLLM_USE_V1=1
+export VLLM_VERSION=0.9.0
+export VLLM_ENABLE_GRAPH_MODE=0
+export VLLM_ENABLE_MC2=0
+export HCCL_OP_EXPANSION_MODE="AIV"
+export VLLM_ENABLE_TOPK_OPTIMZE=1
+
+GPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6555
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+PYTHON_ARGS="
+    --model-path "/data/for_dt/weights/Qwen2.5-7B-mg" \
+    --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \
+    --train-tp 4 \
+    --train-pp 2 \
+    --train-ep 1 \
+    --infer-tp 2 \
+    --infer-pp 1 \
+    --infer-ep 1
+"
+
+echo "start test_resharding st: tp reduce"
+
+torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS
\ No newline at end of file
diff --git a/tests/st/st_run.sh b/tests/st/st_run.sh
index 3a876373e870e31bf308d2741b6ac759520f569b..8d8589cae662d481c71112b51437a589aa6c27ea 100644
--- a/tests/st/st_run.sh
+++ b/tests/st/st_run.sh
@@ -13,4 +13,10 @@ for script in $test_scripts; do
        echo "脚本执行失败: $script"
        exit 1
    fi
+
+    echo "任务执行完成: $script"
+    ray stop
+    ps -ef | grep torchrun | grep -v grep | awk '{print $2}' | xargs -r kill -9
+    echo "计算资源清理完成"
+
done
\ No newline at end of file