diff --git a/cli/convert_ckpt.py b/cli/convert_ckpt.py
index 728eaab729131c3836865e98e4c3b94936b231fc..65dec11dddf37760e9a2cfba7468a1d62eb7597e 100644
--- a/cli/convert_ckpt.py
+++ b/cli/convert_ckpt.py
@@ -169,6 +169,10 @@ def main():
help='use bitsandbytes nf4 to quantize model.')
parser.add_argument('--orm', action="store_true", default=False,
help='Specify the ORM ckpt conversion, convert additional rm_head layer in ORM.')
+ parser.add_argument('--save-lora-to-hf', action='store_true', default=False,
+                        help='Only save the LoRA checkpoint in HuggingFace format.')
+ parser.add_argument('--load-checkpoint-loosely', action='store_true', default=False,
+                        help='Enable non-strict (loose) checkpoint loading.')
known_args, _ = parser.parse_known_args()
diff --git a/docs/features/grpo_yaml.md b/docs/features/grpo_yaml.md
index e4bca60977d10cae0c5a8c0b5ca5fdb16fee0159..6387b501df4797cc893df9f171e811ff8969d83c 100644
--- a/docs/features/grpo_yaml.md
+++ b/docs/features/grpo_yaml.md
@@ -102,4 +102,24 @@ vllm 模型参数 可以参照 [vllm官网参数介绍](https://docs.vllm.ai/en/
* `top_k`:vllm 会先选出概率最高的 top_k 个 token,然后在这 top_k 个 token 范围内进行采样;
* `min_p`:vllm 过滤掉概率低于 min_p 的词元,不参与后续的采样过程;
* `temperature`:采样时的随机性参数;
-* `detokenize`:是否将输出token重新转为文本;
\ No newline at end of file
+* `detokenize`:是否将输出token重新转为文本;
+
+### runtime_env 环境变量
+**( 注:位于 configs/envs/runtime_env.yaml 中 )**
+* `RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES`:是否禁用 Ray 对 ASCEND_RT_VISIBLE_DEVICES 的自动设置,'true'为禁用
+* `TOKENIZERS_PARALLELISM`:设置tokenizers是否支持并行,'true'为支持
+* `NCCL_DEBUG`:NCCL Debug日志级别,VERSION、WARN、INFO、TRACE
+* `PYTORCH_NPU_ALLOC_CONF`:设置缓存分配器行为
+* `HCCL_CONNECT_TIMEOUT`:HCCL 连接超时时间
+* `HCCL_EXEC_TIMEOUT`:HCCL 执行超时时间
+* `HCCL_IF_BASE_PORT`:HCCL 通信端口
+* `CUDA_DEVICE_MAX_CONNECTIONS`:设备最大连接数
+* `HYDRA_FULL_ERROR`:设置 HYDRA 是否输出完整错误日志
+* `VLLM_DP_SIZE`:vLLM数据并行度(Data Parallelism)大小,控制数据分片数量,MOE模型建议和EP一致,稠密模型设置为1
+* `HCCL_BUFFSIZE`:HCCL通信层单次传输的最大缓冲区大小(单位MB),影响跨设备通信效率
+* `VLLM_USE_V1`:使用vLLM的V1 engine API(v1接口),兼容性选项
+* `VLLM_VERSION`:指定使用的vLLM版本号
+* `VLLM_ENABLE_GRAPH_MODE`:启用昇腾torchair图模式优化(1=启用),提升执行效率
+* `VLLM_ENABLE_MC2`:是否启用vLLM的通算融合算子调度策略
+* `HCCL_OP_EXPANSION_MODE`:HCCL算子扩展模式(AIV=AI向量模式),启用高级通信优化
+* `VLLM_ENABLE_TOPK_OPTIMZE`:使能vLLM TOPK性能优化
diff --git a/docs/install_guide.md b/docs/install_guide.md
index 7f2b363dcf1b645081be88ef89a6c5cae2d7ea3e..bdf80f2c1052deb718b3960e838fd9240d751a19 100644
--- a/docs/install_guide.md
+++ b/docs/install_guide.md
@@ -59,7 +59,7 @@ MindSpeed RL支持Atlas 800T A2等昇腾训练硬件形态。软件版本配套
vllm |
- 0.7.3 |
+ main |
@@ -99,8 +99,8 @@ pip install apex-0.1.dev*.whl
git clone https://github.com/vllm-project/vllm.git
cd vllm
git checkout 5bc1ad6cee754405464a9957e86cf3a9302e4986
-pip install -r requirements-build.txt
VLLM_TARGET_DEVICE=empty pip install .
+cd ..
```
### vllm_ascend安装
diff --git a/tests/st/resharding/test_module_entry_resharding.sh b/tests/st/resharding/test_module_entry_resharding.sh
deleted file mode 100644
index 100e60627f61a1b5abf0dea44924fa2ba15276a9..0000000000000000000000000000000000000000
--- a/tests/st/resharding/test_module_entry_resharding.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-# #!/bin/bash
-
-# export CUDA_DEVICE_MAX_CONNECTIONS=1
-# SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
-# export PYTHONPATH=$SCRIPT_DIR/../../..:$PYTHONPATH
-# export VLLM_DP_SIZE=1
-# export HCCL_BUFFSIZE=256
-# export VLLM_USE_V1=1
-# export VLLM_VERSION=0.9.0
-# export VLLM_ENABLE_GRAPH_MODE=0
-# export VLLM_ENABLE_MC2=0
-# export HCCL_OP_EXPANSION_MODE="AIV"
-# export VLLM_ENABLE_TOPK_OPTIMZE=1
-
-# GPUS_PER_NODE=8
-# MASTER_ADDR=localhost
-# MASTER_PORT=6555
-# NNODES=1
-# NODE_RANK=0
-# WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-
-# DISTRIBUTED_ARGS="
-# --nproc_per_node $GPUS_PER_NODE \
-# --nnodes $NNODES \
-# --node_rank $NODE_RANK \
-# --master_addr $MASTER_ADDR \
-# --master_port $MASTER_PORT
-# "
-# PYTHON_ARGS="
-# --model-path "/data/for_dt/weights/Qwen2.5-7B-mg" \
-# --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \
-# --train-tp 4 \
-# --train-pp 2 \
-# --train-ep 1 \
-# --infer-tp 2 \
-# --infer-pp 1 \
-# --infer-ep 1
-# "
-# PYTHON_ARGS_new="
-# --model-path "/data/for_dt/weights/Qwen2.5-7B-tp2pp2" \
-# --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \
-# --train-tp 2 \
-# --train-pp 2 \
-# --train-ep 1 \
-# --infer-tp 4 \
-# --infer-pp 1 \
-# --infer-ep 1
-# "
-
-# echo "start test_resharding st"
-
-# torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS
-
-# torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS_new
\ No newline at end of file
diff --git a/tests/st/resharding/test_module_entry_resharding_tp_expand.sh b/tests/st/resharding/test_module_entry_resharding_tp_expand.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7f56a5b8f8a39cf424658e894245c51b725e235c
--- /dev/null
+++ b/tests/st/resharding/test_module_entry_resharding_tp_expand.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
+export PYTHONPATH=$SCRIPT_DIR/../../..:$PYTHONPATH
+export VLLM_DP_SIZE=1
+export HCCL_BUFFSIZE=256
+export VLLM_USE_V1=1
+export VLLM_VERSION=0.9.0
+export VLLM_ENABLE_GRAPH_MODE=0
+export VLLM_ENABLE_MC2=0
+export HCCL_OP_EXPANSION_MODE="AIV"
+export VLLM_ENABLE_TOPK_OPTIMZE=1
+
+GPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6555
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+DISTRIBUTED_ARGS="
+ --nproc_per_node $GPUS_PER_NODE \
+ --nnodes $NNODES \
+ --node_rank $NODE_RANK \
+ --master_addr $MASTER_ADDR \
+ --master_port $MASTER_PORT
+"
+
+PYTHON_ARGS="
+ --model-path "/data/for_dt/weights/Qwen2.5-7B-tp2pp2" \
+ --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \
+ --train-tp 2 \
+ --train-pp 2 \
+ --train-ep 1 \
+ --infer-tp 4 \
+ --infer-pp 1 \
+ --infer-ep 1
+"
+
+echo "start test_resharding st: tp expand"
+
+torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS
+
diff --git a/tests/st/resharding/test_module_entry_resharding_tp_reduce.sh b/tests/st/resharding/test_module_entry_resharding_tp_reduce.sh
new file mode 100644
index 0000000000000000000000000000000000000000..dd2c3c99c9cc4f17f85fa83c2eb49d8f2b92862b
--- /dev/null
+++ b/tests/st/resharding/test_module_entry_resharding_tp_reduce.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
+export PYTHONPATH=$SCRIPT_DIR/../../..:$PYTHONPATH
+export VLLM_DP_SIZE=1
+export HCCL_BUFFSIZE=256
+export VLLM_USE_V1=1
+export VLLM_VERSION=0.9.0
+export VLLM_ENABLE_GRAPH_MODE=0
+export VLLM_ENABLE_MC2=0
+export HCCL_OP_EXPANSION_MODE="AIV"
+export VLLM_ENABLE_TOPK_OPTIMZE=1
+
+GPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6555
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+DISTRIBUTED_ARGS="
+ --nproc_per_node $GPUS_PER_NODE \
+ --nnodes $NNODES \
+ --node_rank $NODE_RANK \
+ --master_addr $MASTER_ADDR \
+ --master_port $MASTER_PORT
+"
+PYTHON_ARGS="
+ --model-path "/data/for_dt/weights/Qwen2.5-7B-mg" \
+ --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \
+ --train-tp 4 \
+ --train-pp 2 \
+ --train-ep 1 \
+ --infer-tp 2 \
+ --infer-pp 1 \
+ --infer-ep 1
+"
+
+echo "start test_resharding st: tp reduce"
+
+torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS
\ No newline at end of file
diff --git a/tests/st/st_run.sh b/tests/st/st_run.sh
index 3a876373e870e31bf308d2741b6ac759520f569b..8d8589cae662d481c71112b51437a589aa6c27ea 100644
--- a/tests/st/st_run.sh
+++ b/tests/st/st_run.sh
@@ -13,4 +13,10 @@ for script in $test_scripts; do
echo "脚本执行失败: $script"
exit 1
fi
+
+  echo "任务执行完成: $script"
+ ray stop
+ ps -ef | grep torchrun | grep -v grep | awk '{print $2}' | xargs -r kill -9
+ echo "计算资源清理完成"
+
done
\ No newline at end of file