diff --git a/frameworks/Ray/2.54.0/Dockerfile b/frameworks/Ray/2.54.0/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..e639344dd34268fc14ac926bcdf73a53f1cab5c5 --- /dev/null +++ b/frameworks/Ray/2.54.0/Dockerfile @@ -0,0 +1,103 @@ +FROM opencloudos/opencloudos9-cuda-devel:12.8 + +LABEL maintainer="stronking 363133710@qq.com" +LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" +LABEL org.opencontainers.image.description="Ray all components + Torch GPU on OpenCloudOS 9 CUDA 12.8" + +# ========================= +# 版本参数 +# ========================= +ARG RAY_VERSION=2.54.0 +ARG TORCH_VERSION=2.11.0 +ARG TORCHVISION_VERSION=0.26.0 +ARG TORCHAUDIO_VERSION=2.11.0 +ARG PYTORCH_INDEX_URL=https://download.pytorch.org/whl/cu128 + +# ========================= +# 基础环境变量 +# ========================= +ENV PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,utility \ + RAY_DISABLE_USAGE_STATS=1 + +# ========================= +# Ray 集群默认环境变量 +# 这些变量会被 start-ray.sh 使用 +# ========================= +ENV RAY_NODE_TYPE=single \ + RAY_HEAD_ADDRESS="" \ + RAY_NODE_IP_ADDRESS="" \ + RAY_HEAD_PORT=6379 \ + RAY_DASHBOARD_HOST=0.0.0.0 \ + RAY_DASHBOARD_PORT=8265 \ + RAY_CLIENT_SERVER_PORT=10001 \ + RAY_SERVE_HTTP_PORT=8000 \ + RAY_NODE_MANAGER_PORT=8076 \ + RAY_OBJECT_MANAGER_PORT=8077 \ + RAY_RUNTIME_ENV_AGENT_PORT=8078 \ + RAY_DASHBOARD_AGENT_GRPC_PORT=8079 \ + RAY_DASHBOARD_AGENT_LISTEN_PORT=8080 \ + RAY_METRICS_EXPORT_PORT=8081 \ + RAY_MIN_WORKER_PORT=10002 \ + RAY_MAX_WORKER_PORT=10100 \ + RAY_TEMP_DIR=/tmp/ray \ + RAY_NUM_CPUS="" \ + RAY_NUM_GPUS="" \ + RAY_OBJECT_STORE_MEMORY="" \ + RAY_RESOURCES="" + +WORKDIR /home + +# ========================= +# 安装 Ray 全组件 +# ========================= +RUN python3 -m pip install \ + "ray[all,client,serve-grpc]==${RAY_VERSION}" + +# ========================= +# 安装 PyTorch GPU 版本 +# 注意:如果 torch==2.11.0 / torchvision==0.26.0 当前源里不存在,构建会失败。 +# 可以通过 docker build --build-arg 修改版本。 +# ========================= +RUN python3 -m pip install \ + torch==${TORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url ${PYTORCH_INDEX_URL} + +# ========================= +# 可选工具:进度条、GPU 监控、排错工具 +# ========================= +RUN python3 -m pip install \ + tqdm \ + gputil \ + psutil \ + requests \ + fastapi \ + uvicorn + +# ========================= +# 拷贝测试脚本和启动脚本 +# ========================= +COPY ./test_ray.py /home/test_ray.py +COPY ./start-ray.sh /usr/local/bin/start-ray.sh + +RUN chmod +x /usr/local/bin/start-ray.sh + + +# ========================= +# Ray 常用端口 +# 6379 : Ray Head / GCS +# 8265 : Ray Dashboard +# 10001 : Ray Client +# 8000 : Ray Serve HTTP +# 8076-8081 : 固定 Ray 内部组件端口 +# 10002-10100 : Ray Worker 端口范围 +# ========================= +EXPOSE 6379 8265 10001 8000 8076 8077 8078 8079 8080 8081 10002-10100 + +#ENTRYPOINT ["/usr/local/bin/start-ray.sh"] 在正式merge后,需要将这行注释打开 +CMD ["/usr/local/bin/start-ray.sh"] \ No newline at end of file diff --git a/frameworks/Ray/2.54.0/README.md b/frameworks/Ray/2.54.0/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d552a79e9463f52aefd315a16d373387b55f9f02 --- /dev/null +++ b/frameworks/Ray/2.54.0/README.md @@ -0,0 +1,409 @@ +# Ray 2.55.1 + Torch 2.11.0 on OpenCloudOS 9 + +## 基本信息 + +- **Ray 版本**:v2.54.0 +- **Torch 版本**:v2.11.0 +- **TorchVision 版本**:v0.26.0 +- **TorchAudio 版本**:v2.11.0 +- **基础镜像**:opencloudos/opencloudos9-cuda-devel:12.8 +- **Python 版本**:3.11 +- **CUDA 版本**:12.8 +- **GPU 支持**:NVIDIA GPU / CUDA +- **Ray 安装组件**:ray[all,client,serve-grpc] + +## 适用场景 + +- Ray 单机任务调度 +- Ray 多机集群任务调度 +- Ray Data 数据处理 +- Ray Train 分布式训练 +- Ray Tune 参数调优 +- Ray Serve 模型服务 +- Ray RLlib 强化学习 +- PyTorch GPU 训练 / 推理 + +--- + +## 构建 + +```bash +docker build -t oc9-ray:2.54.0 . +``` + +也可以通过构建参数指定 Ray 和 Torch 版本: + +```bash +docker build \ + --build-arg RAY_VERSION=2.54.0 \ + --build-arg TORCH_VERSION=2.11.0 \ + --build-arg TORCHVISION_VERSION=0.26.0 \ + --build-arg TORCHAUDIO_VERSION=2.11.0 \ + -t oc9-ray:2.54.0 . +``` + +--- + +## 镜像启动命令 + +### 单机模式启动 + +单机模式会在当前容器内启动一个 Ray Head 节点,适合本地开发、单机测试、单机 GPU 任务。 + +```bash +docker run -d \ + --gpus all \ + --shm-size=8g \ + --name oc9-ray-single \ + -e RAY_NODE_TYPE=single \ + -p 6379:6379 \ + -p 8265:8265 \ + -p 10001:10001 \ + -p 8000:8000 \ + oc9-ray:2.54.0 +``` + +查看 Ray Dashboard: + +```text +http://宿主机IP:8265 +``` + +进入容器: + +```bash +docker exec -it oc9-ray-single bash +``` + +查看 Ray 集群状态: + +```bash +ray status +``` + +--- + +### 多机集群模式启动 + +Ray 多机集群由一个 Head 节点和多个 Worker 节点组成。 + +推荐使用 `--network=host`,避免 Docker bridge 网络导致 Ray 节点之间无法互相访问。 + +假设机器 IP 如下: + +| 角色 | IP | +|---|---| +| Head 节点 | 192.168.1.10 | +| Worker 节点 1 | 192.168.1.11 | +| Worker 节点 2 | 192.168.1.12 | + +--- + +#### 启动 Head 节点 + +在 `192.168.1.10` 上执行: + +```bash +docker run -d \ + --gpus all \ + --network=host \ + --shm-size=8g \ + --name oc9-ray-head \ + -e RAY_NODE_TYPE=head \ + -e RAY_NODE_IP_ADDRESS=192.168.1.10 \ + -e RAY_NUM_GPUS=1 \ + oc9-ray:2.55.1 +``` + +查看 Head 节点日志: + +```bash +docker logs -f oc9-ray-head +``` + +查看 Dashboard: + +```text +http://192.168.1.10:8265 +``` + +--- + +#### 启动 Worker 节点 + +在 `192.168.1.11` 上执行: + +```bash +docker run -d \ + --gpus all \ + --network=host \ + --shm-size=8g \ + --name oc9-ray-worker-1 \ + -e RAY_NODE_TYPE=worker \ + -e RAY_HEAD_ADDRESS=192.168.1.10:6379 \ + -e RAY_NODE_IP_ADDRESS=192.168.1.11 \ + -e RAY_NUM_GPUS=1 \ + oc9-ray:2.54.0 +``` + +在 `192.168.1.12` 上执行: + +```bash +docker run -d \ + --gpus all \ + --network=host \ + --shm-size=8g \ + --name oc9-ray-worker-2 \ + -e RAY_NODE_TYPE=worker \ + -e RAY_HEAD_ADDRESS=192.168.1.10:6379 \ + -e RAY_NODE_IP_ADDRESS=192.168.1.12 \ + -e RAY_NUM_GPUS=1 \ + oc9-ray:2.54.0 +``` + +查看 Worker 节点日志: + +```bash +docker logs -f oc9-ray-worker-1 +docker logs -f oc9-ray-worker-2 +``` + +--- + +## 镜像测试命令 + +### 单机完整测试 + +```bash +docker run --rm \ + --gpus all \ + --shm-size=8g \ + -e RAY_NODE_TYPE=test \ + oc9-ray:2.54.0 \ + python3 test_ray.py --full --require-gpu +``` + +### 在已有 Head 节点中测试 + +```bash +docker exec -it oc9-ray-head bash +python3 /home/test_ray.py --address auto --full --require-gpu +``` + +### 通过 Ray Client 连接测试 + +```bash +python3 test_ray.py --address ray://192.168.1.10:10001 --full --require-gpu +``` + +--- + +## 常用端口 + +| 端口 | 说明 | +|---:|---| +| 6379 | Ray Head / GCS 端口 | +| 8265 | Ray Dashboard 端口 | +| 10001 | Ray Client 端口 | +| 8000 | Ray Serve HTTP 端口 | +| 8076 | Ray Node Manager 端口 | +| 8077 | Ray Object Manager 端口 | +| 8078 | Ray Runtime Env Agent 端口 | +| 8079 | Ray Dashboard Agent gRPC 端口 | +| 8080 | Ray Dashboard Agent HTTP 端口 | +| 8081 | Ray Metrics Export 端口 | +| 10002-10100 | Ray Worker 进程端口范围 | + +--- + +## 环境变量说明 + +| 环境变量 | 默认值 | 示例 | 说明 | +|---|---|---|---| +| `RAY_NODE_TYPE` | `single` | `head` / `worker` / `single` / `test` | 容器启动模式。`head` 表示 Head 节点,`worker` 表示 Worker 节点,`single` 表示单机模式,`test` 表示执行测试脚本 | +| `RAY_HEAD_ADDRESS` | 空 | `192.168.1.10:6379` | Worker 节点连接 Head 节点的地址。`RAY_NODE_TYPE=worker` 时必填 | +| `RAY_NODE_IP_ADDRESS` | 自动获取 | `192.168.1.11` | 当前节点对其他 Ray 节点可访问的 IP。多机部署时建议显式指定 | +| `RAY_HEAD_PORT` | `6379` | `6379` | Ray Head / GCS 监听端口 | +| `RAY_DASHBOARD_HOST` | `0.0.0.0` | `0.0.0.0` | Ray Dashboard 监听地址。容器中建议设置为 `0.0.0.0` | +| `RAY_DASHBOARD_PORT` | `8265` | `8265` | Ray Dashboard 端口 | +| `RAY_CLIENT_SERVER_PORT` | `10001` | `10001` | Ray Client 连接端口 | +| `RAY_SERVE_HTTP_PORT` | `8000` | `8000` | Ray Serve HTTP 服务端口 | +| `RAY_NODE_MANAGER_PORT` | `8076` | `8076` | Ray Node Manager 固定端口 | +| `RAY_OBJECT_MANAGER_PORT` | `8077` | `8077` | Ray Object Manager 固定端口 | +| `RAY_RUNTIME_ENV_AGENT_PORT` | `8078` | `8078` | Ray Runtime Env Agent 固定端口 | +| `RAY_DASHBOARD_AGENT_GRPC_PORT` | `8079` | `8079` | Ray Dashboard Agent gRPC 端口 | +| `RAY_DASHBOARD_AGENT_LISTEN_PORT` | `8080` | `8080` | Ray Dashboard Agent HTTP 端口 | +| `RAY_METRICS_EXPORT_PORT` | `8081` | `8081` | Ray Metrics 指标暴露端口 | +| `RAY_MIN_WORKER_PORT` | `10002` | `10002` | Ray Worker 进程端口范围下限 | +| `RAY_MAX_WORKER_PORT` | `10100` | `10100` | Ray Worker 进程端口范围上限 | +| `RAY_NUM_CPUS` | 自动检测 | `20` | 手动指定当前节点可用 CPU 数量 | +| `RAY_NUM_GPUS` | 自动检测 | `1` | 手动指定当前节点可用 GPU 数量 | +| `RAY_OBJECT_STORE_MEMORY` | 自动计算 | `8589934592` | Ray Object Store 内存大小,单位为 bytes | +| `RAY_RESOURCES` | 空 | `'{"worker": 1}'` | 自定义 Ray 资源标签 | +| `RAY_TEMP_DIR` | `/tmp/ray` | `/tmp/ray` | Ray 临时文件目录 | +| `RAY_DISABLE_USAGE_STATS` | `1` | `1` | 禁用 Ray 使用统计上报 | +| `NVIDIA_VISIBLE_DEVICES` | `all` | `all` / `0` / `0,1` | 指定容器可见 GPU | +| `NVIDIA_DRIVER_CAPABILITIES` | `compute,utility` | `compute,utility` | NVIDIA 容器运行能力,GPU 计算通常需要 `compute`,`nvidia-smi` 需要 `utility` | + +--- + +## 多机协同任务测试 + +进入 Head 容器: + +```bash +docker exec -it oc9-ray-head bash +``` + +执行: + +```bash +python3 - <<'PY' +import socket +import ray + +ray.init(address="auto") + +@ray.remote +def task(i): + return { + "task": i, + "host": socket.gethostname(), + "node_id": ray.get_runtime_context().get_node_id(), + } + +refs = [task.remote(i) for i in range(50)] +results = ray.get(refs) + +for item in results[:20]: + print(item) + +print("cluster_resources:", ray.cluster_resources()) +PY +``` + +如果 Worker 节点加入成功,`cluster_resources` 中会显示多台节点的 CPU / GPU 资源。 + +--- + +## GPU 验证 + +```bash +docker run --rm \ + --gpus all \ + --shm-size=8g \ + oc9-ray:2.54.0 \ + bash -c "nvidia-smi && python3 -c 'import torch; print(torch.__version__); print(torch.cuda.is_available()); print(torch.cuda.get_device_name(0))'" +``` + +预期输出中应包含: + +```text +True +NVIDIA ... +``` + +--- + +## 版本检查 + +进入容器后执行: + +```bash +python3 - <<'PY' +import ray +import torch + +print("Ray:", ray.__version__) +print("Torch:", torch.__version__) +print("Torch CUDA:", torch.version.cuda) +print("CUDA available:", torch.cuda.is_available()) + +if torch.cuda.is_available(): + print("GPU:", torch.cuda.get_device_name(0)) +PY +``` + +--- + +## 注意事项 + +1. Docker 默认 `/dev/shm` 只有 64MB,Ray Object Store 会受到影响,建议启动容器时增加: + +```bash +--shm-size=8g +``` + +2. 多机部署建议使用: + +```bash +--network=host +``` + +3. Head 和 Worker 节点的 Ray 版本、Python 版本、CUDA 版本、PyTorch 版本应保持一致。 + +4. Worker 节点必须能访问 Head 节点的 `6379` 端口。 + +5. 多机 Docker bridge 网络模式下,Ray 可能识别到容器内网 IP,导致其他机器无法访问,因此生产部署建议显式设置: + +```bash +-e RAY_NODE_IP_ADDRESS=当前宿主机IP +``` + +6. 生产环境建议固定 Ray 内部端口和 Worker 端口范围,便于配置防火墙、安全组和网络策略。 + +7. 使用 GPU 时,宿主机必须安装 NVIDIA Driver 和 NVIDIA Container Toolkit。 + +8. 如果 Ray Dashboard 无法访问,请确认启动参数中包含: + +```bash +-e RAY_DASHBOARD_HOST=0.0.0.0 +``` + +9. 如果 Worker 节点无法加入集群,请优先检查: + - Head 节点 IP 是否正确 + - `RAY_HEAD_ADDRESS` 是否正确 + - 防火墙是否放通 `6379` + - 是否使用了 `--network=host` + - `RAY_NODE_IP_ADDRESS` 是否设置为宿主机可访问 IP + +--- + +## 常见命令 + +查看 Ray 状态: + +```bash +ray status +``` + +查看 Ray 任务: + +```bash +ray list tasks +``` + +查看 Ray 节点: + +```bash +ray list nodes +``` + +停止 Ray: + +```bash +ray stop --force +``` + +查看容器日志: + +```bash +docker logs -f oc9-ray-head +docker logs -f oc9-ray-worker-1 +``` + +删除容器: + +```bash +docker rm -f oc9-ray-head oc9-ray-worker-1 oc9-ray-worker-2 +``` \ No newline at end of file diff --git a/frameworks/Ray/2.54.0/build.conf b/frameworks/Ray/2.54.0/build.conf new file mode 100644 index 0000000000000000000000000000000000000000..2a50914b8d638df49d47d196d6c18d070d461cbb --- /dev/null +++ b/frameworks/Ray/2.54.0/build.conf @@ -0,0 +1,4 @@ +# Ray 2.54.0 + PyTorch 2.11.0 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-ray +IMAGE_TAG=2.54.0 +GPU_TEST=true \ No newline at end of file diff --git a/frameworks/Ray/2.54.0/start-ray.sh b/frameworks/Ray/2.54.0/start-ray.sh new file mode 100644 index 0000000000000000000000000000000000000000..fe9d7a1750dda9f6fd6655ad7962ea36d7d166b7 --- /dev/null +++ b/frameworks/Ray/2.54.0/start-ray.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash +set -euo pipefail + +echo "============================================================" +echo "Ray Container Startup" +echo "============================================================" +echo "RAY_NODE_TYPE=${RAY_NODE_TYPE:-single}" +echo "RAY_HEAD_ADDRESS=${RAY_HEAD_ADDRESS:-}" +echo "RAY_NODE_IP_ADDRESS=${RAY_NODE_IP_ADDRESS:-}" +echo "RAY_HEAD_PORT=${RAY_HEAD_PORT:-6379}" +echo "RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265}" +echo "RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001}" +echo "RAY_MIN_WORKER_PORT=${RAY_MIN_WORKER_PORT:-10002}" +echo "RAY_MAX_WORKER_PORT=${RAY_MAX_WORKER_PORT:-10100}" +echo "NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-}" +echo "============================================================" + +get_node_ip() { + if [[ -n "${RAY_NODE_IP_ADDRESS:-}" ]]; then + echo "${RAY_NODE_IP_ADDRESS}" + else + hostname -I | awk '{print $1}' + fi +} + +build_common_ray_args() { + local args=() + + args+=("--node-ip-address=$(get_node_ip)") + args+=("--node-manager-port=${RAY_NODE_MANAGER_PORT:-8076}") + args+=("--object-manager-port=${RAY_OBJECT_MANAGER_PORT:-8077}") + args+=("--runtime-env-agent-port=${RAY_RUNTIME_ENV_AGENT_PORT:-8078}") + args+=("--dashboard-agent-grpc-port=${RAY_DASHBOARD_AGENT_GRPC_PORT:-8079}") + args+=("--dashboard-agent-listen-port=${RAY_DASHBOARD_AGENT_LISTEN_PORT:-8080}") + args+=("--metrics-export-port=${RAY_METRICS_EXPORT_PORT:-8081}") + args+=("--min-worker-port=${RAY_MIN_WORKER_PORT:-10002}") + args+=("--max-worker-port=${RAY_MAX_WORKER_PORT:-10100}") + args+=("--temp-dir=${RAY_TEMP_DIR:-/tmp/ray}") + + if [[ -n "${RAY_NUM_CPUS:-}" ]]; then + args+=("--num-cpus=${RAY_NUM_CPUS}") + fi + + if [[ -n "${RAY_NUM_GPUS:-}" ]]; then + args+=("--num-gpus=${RAY_NUM_GPUS}") + fi + + if [[ -n "${RAY_OBJECT_STORE_MEMORY:-}" ]]; then + args+=("--object-store-memory=${RAY_OBJECT_STORE_MEMORY}") + fi + + if [[ -n "${RAY_RESOURCES:-}" ]]; then + args+=("--resources=${RAY_RESOURCES}") + fi + + printf '%s\n' "${args[@]}" +} + +start_head() { + echo "[INFO] Starting Ray HEAD node..." + + ray stop --force || true + + mapfile -t COMMON_ARGS < <(build_common_ray_args) + + exec ray start \ + --head \ + --port="${RAY_HEAD_PORT:-6379}" \ + --dashboard-host="${RAY_DASHBOARD_HOST:-0.0.0.0}" \ + --dashboard-port="${RAY_DASHBOARD_PORT:-8265}" \ + --ray-client-server-port="${RAY_CLIENT_SERVER_PORT:-10001}" \ + "${COMMON_ARGS[@]}" \ + --block +} + +start_worker() { + echo "[INFO] Starting Ray WORKER node..." + + if [[ -z "${RAY_HEAD_ADDRESS:-}" ]]; then + echo "[ERROR] RAY_HEAD_ADDRESS is required for worker node." + echo "Example: RAY_HEAD_ADDRESS=192.168.1.10:6379" + exit 1 + fi + + ray stop --force || true + + mapfile -t COMMON_ARGS < <(build_common_ray_args) + + exec ray start \ + --address="${RAY_HEAD_ADDRESS}" \ + "${COMMON_ARGS[@]}" \ + --block +} + +start_single() { + echo "[INFO] Starting Ray SINGLE node..." + + ray stop --force || true + + mapfile -t COMMON_ARGS < <(build_common_ray_args) + + exec ray start \ + --head \ + --port="${RAY_HEAD_PORT:-6379}" \ + --dashboard-host="${RAY_DASHBOARD_HOST:-0.0.0.0}" \ + --dashboard-port="${RAY_DASHBOARD_PORT:-8265}" \ + --ray-client-server-port="${RAY_CLIENT_SERVER_PORT:-10001}" \ + "${COMMON_ARGS[@]}" \ + --block +} + +run_test() { + echo "[INFO] Running Ray test script..." + python3 /home/test_ray.py "$@" +} + +case "${RAY_NODE_TYPE:-single}" in + head) + start_head + ;; + + worker) + start_worker + ;; + + single) + start_single + ;; + + test) + shift || true + run_test "$@" + ;; + + bash|shell) + exec /bin/bash + ;; + + *) + echo "[INFO] Executing custom command: $*" + exec "$@" + ;; +esac \ No newline at end of file diff --git a/frameworks/Ray/2.54.0/test.sh b/frameworks/Ray/2.54.0/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..8e5feb7e361d499082f7d1dae43a101fab85ef08 --- /dev/null +++ b/frameworks/Ray/2.54.0/test.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -e + +IMAGE="${1:-}" + +if [ -z "${IMAGE}" ]; then + echo "用法: bash test.sh <镜像名:标签>" + exit 1 +fi + +if ! command -v docker >/dev/null 2>&1; then + echo "✗ 未找到 docker" + exit 1 +fi + +echo "=== Ray 2.54.0 容器基础功能测试 ===" +echo "测试镜像: ${IMAGE}" + +docker run --rm --gpus all --entrypoint python3 "${IMAGE}" /home/test_ray.py --full --require-gpu \ No newline at end of file diff --git a/frameworks/Ray/2.54.0/test_ray.py b/frameworks/Ray/2.54.0/test_ray.py new file mode 100644 index 0000000000000000000000000000000000000000..7054a443e1332f5806d0e8025fd3cca6fd04d198 --- /dev/null +++ b/frameworks/Ray/2.54.0/test_ray.py @@ -0,0 +1,784 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +verify_ray_full_fixed.py + +Ray 全组件验证脚本,适配 Ray 2.55.x。 + +验证内容: +- Ray Core: init / remote task / actor / object store / wait / runtime_env / placement group +- GPU: Ray GPU scheduling / CUDA_VISIBLE_DEVICES / nvidia-smi / PyTorch CUDA +- Ray Data +- Ray Tune +- Ray Train TorchTrainer +- Ray Serve +- RLlib,可用 --full 开启 + +常用运行方式: + python verify_ray_full_fixed.py + + python verify_ray_full_fixed.py --full + + python verify_ray_full_fixed.py --require-gpu + + python verify_ray_full_fixed.py --address auto + + python verify_ray_full_fixed.py --address ray://127.0.0.1:10001 + +Docker GPU 示例: + docker run --rm -it \ + --gpus all \ + --shm-size=8g \ + your-ray-image \ + python /app/verify_ray_full_fixed.py --full --require-gpu +""" + +from __future__ import annotations + +import argparse +import importlib.util +import json +import os +import socket +import subprocess +import sys +import tempfile +import time +import traceback +from dataclasses import dataclass +from pathlib import Path +from typing import Callable, Any + + +# Ray Train V2 在新版本 Ray 中是默认方向。 +# 这里显式设置,方便结果稳定,也避免部分迁移提示。 +os.environ.setdefault("RAY_TRAIN_V2_ENABLED", "1") + + +class SkipTest(Exception): + """表示当前环境不满足该组件测试条件,测试跳过。""" + + +@dataclass +class TestResult: + name: str + status: str + message: str + seconds: float + + +def module_exists(name: str) -> bool: + return importlib.util.find_spec(name) is not None + + +def print_json(title: str, data: Any) -> None: + print(f"\n{title}") + print(json.dumps(data, indent=2, ensure_ascii=False, default=str)) + + +def run_test(name: str, func: Callable[[], str]) -> TestResult: + start = time.time() + + try: + msg = func() + seconds = time.time() - start + print(f"[PASS] {name} - {msg}") + return TestResult(name, "PASS", msg, seconds) + + except SkipTest as exc: + seconds = time.time() - start + print(f"[SKIP] {name} - {exc}") + return TestResult(name, "SKIP", str(exc), seconds) + + except Exception as exc: + seconds = time.time() - start + print(f"[FAIL] {name} - {exc}") + traceback.print_exc() + return TestResult(name, "FAIL", str(exc), seconds) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Verify Ray full installation and runtime.") + + parser.add_argument( + "--address", + default=os.environ.get("RAY_ADDRESS"), + help=( + "Ray 地址。留空表示本地启动;" + "auto 表示连接已有本地 Ray 集群;" + "ray://host:10001 表示 Ray Client。" + ), + ) + + parser.add_argument( + "--full", + action="store_true", + help="开启更重的测试,例如 RLlib PPO。", + ) + + parser.add_argument( + "--require-gpu", + action="store_true", + help="要求 Ray 和 PyTorch 必须检测到 GPU,否则 GPU 测试失败。", + ) + + parser.add_argument( + "--train-workers", + type=int, + default=1, + help="Ray Train 使用的 worker 数量,默认 1。", + ) + + parser.add_argument( + "--train-use-gpu", + action="store_true", + help="Ray Train 测试是否使用 GPU。需要 Ray 检测到足够 GPU。", + ) + + parser.add_argument( + "--skip-data", + action="store_true", + help="跳过 Ray Data 测试。", + ) + + parser.add_argument( + "--skip-tune", + action="store_true", + help="跳过 Ray Tune 测试。", + ) + + parser.add_argument( + "--skip-train", + action="store_true", + help="跳过 Ray Train 测试。", + ) + + parser.add_argument( + "--skip-serve", + action="store_true", + help="跳过 Ray Serve 测试。", + ) + + parser.add_argument( + "--skip-rllib", + action="store_true", + help="跳过 RLlib 测试。", + ) + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + print("=" * 80) + print("Ray Full Verification - Fixed for Ray 2.55.x") + print("=" * 80) + print(f"Python: {sys.version}") + print(f"Host: {socket.gethostname()}") + print(f"PID: {os.getpid()}") + print(f"RAY_ADDRESS: {args.address or ''}") + print(f"RAY_TRAIN_V2_ENABLED: {os.environ.get('RAY_TRAIN_V2_ENABLED')}") + + if not module_exists("ray"): + print("\n[ERROR] 未安装 Ray。可执行:") + print(' pip install -U "ray[all]"') + sys.exit(2) + + import ray + + print(f"Ray version: {ray.__version__}") + + print("\n初始化 Ray ...") + + if args.address: + ray_info = ray.init(address=args.address) + else: + ray_info = ray.init() + + print(f"Ray initialized: {ray.is_initialized()}") + + try: + dashboard_url = getattr(ray_info, "dashboard_url", None) + if dashboard_url: + print(f"Dashboard URL: {dashboard_url}") + except Exception: + pass + + print_json("Cluster resources:", ray.cluster_resources()) + print_json("Available resources:", ray.available_resources()) + + results: list[TestResult] = [] + + # ------------------------------------------------------------------------- + # Ray Core + # ------------------------------------------------------------------------- + + def test_core_task() -> str: + @ray.remote + def square(x: int) -> int: + return x * x + + refs = [square.remote(i) for i in range(10)] + values = ray.get(refs) + expected = [i * i for i in range(10)] + + assert values == expected, f"✗ 失败||结果不符合预期: {values}" + + return f"✓ 通过||remote task 正常,结果={values}" + + results.append(run_test("Ray Core - Remote Task", test_core_task)) + + def test_actor() -> str: + @ray.remote + class Counter: + def __init__(self) -> None: + self.value = 0 + + def inc(self, n: int = 1) -> int: + self.value += n + return self.value + + def get(self) -> int: + return self.value + + counter = Counter.remote() + + assert ray.get(counter.inc.remote()) == 1 + assert ray.get(counter.inc.remote(5)) == 6 + assert ray.get(counter.get.remote()) == 6 + + return "✓ 通过||actor 状态保持正常" + + results.append(run_test("Ray Core - Actor", test_actor)) + + def test_object_store() -> str: + payload = { + "message": "hello ray object store", + "numbers": list(range(1000)), + } + + ref = ray.put(payload) + got = ray.get(ref) + + assert got == payload + + return f"✓ 通过|| ray.put/ray.get 正常,numbers={len(got['numbers'])}" + + results.append(run_test("Ray Core - Object Store", test_object_store)) + + def test_wait() -> str: + @ray.remote + def slow_identity(x: str, delay: float) -> str: + import time + + time.sleep(delay) + return x + + refs = [ + slow_identity.remote("fast", 0.2), + slow_identity.remote("slow", 1.0), + ] + + ready, remaining = ray.wait(refs, num_returns=1, timeout=5) + + assert len(ready) == 1 + assert len(remaining) == 1 + + first = ray.get(ready[0]) + assert first == "fast" + + ray.get(remaining) + + return "✓ 通过|| ray.wait 正常" + + results.append(run_test("Ray Core - ray.wait", test_wait)) + + def test_runtime_env() -> str: + @ray.remote(runtime_env={"env_vars": {"RAY_VERIFY_ENV": "OK"}}) + def read_env() -> str | None: + import os + + return os.environ.get("RAY_VERIFY_ENV") + + value = ray.get(read_env.remote()) + + assert value == "OK", f"runtime_env env_vars 未生效: {value}" + + return "✓ 通过|| runtime_env env_vars 正常" + + results.append(run_test("Ray Core - runtime_env", test_runtime_env)) + + def test_placement_group() -> str: + total_cpu = float(ray.cluster_resources().get("CPU", 0)) + + if total_cpu < 1: + raise SkipTest("集群 CPU 资源小于 1,跳过 placement group 测试") + + from ray.util.placement_group import placement_group, remove_placement_group + + pg = placement_group([{"CPU": 1}], strategy="PACK") + ray.get(pg.ready(), timeout=20) + + @ray.remote(num_cpus=1) + def pg_task() -> dict[str, Any]: + import os + + return { + "pid": os.getpid(), + "ok": True, + } + + try: + ref = pg_task.options(placement_group=pg).remote() + result = ray.get(ref) + finally: + remove_placement_group(pg) + + assert result["ok"] is True + + return f"✓ 通过|| placement group 正常,task pid={result['pid']}" + + results.append(run_test("Ray Core - Placement Group", test_placement_group)) + + # ------------------------------------------------------------------------- + # GPU + # ------------------------------------------------------------------------- + + def test_ray_gpu_scheduling() -> str: + gpu_count = float(ray.cluster_resources().get("GPU", 0)) + + if gpu_count <= 0: + if args.require_gpu: + raise RuntimeError("要求 GPU,但 Ray cluster_resources() 没有检测到 GPU") + raise SkipTest("Ray 未检测到 GPU,跳过 GPU 调度测试") + + @ray.remote(num_gpus=1) + def gpu_task() -> dict[str, Any]: + import os + import subprocess + + info: dict[str, Any] = { + "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES"), + "nvidia_smi": None, + "torch_cuda_available": None, + "torch_device_count": None, + "torch_device_name": None, + } + + try: + out = subprocess.check_output( + ["nvidia-smi"], + stderr=subprocess.STDOUT, + timeout=10, + ).decode("utf-8", errors="ignore") + info["nvidia_smi"] = out.splitlines()[0] if out else "EMPTY" + except Exception as exc: + info["nvidia_smi"] = f"nvidia-smi failed: {exc}" + + try: + import torch + + info["torch_cuda_available"] = torch.cuda.is_available() + info["torch_device_count"] = torch.cuda.device_count() + + if torch.cuda.is_available(): + info["torch_device_name"] = torch.cuda.get_device_name(0) + + except Exception as exc: + info["torch_cuda_available"] = f"torch unavailable: {exc}" + + return info + + info = ray.get(gpu_task.remote()) + + if not info.get("CUDA_VISIBLE_DEVICES"): + raise RuntimeError(f"Ray 分配了 GPU,但 CUDA_VISIBLE_DEVICES 为空: {info}") + + return f"✓ 通过|| Ray GPU 调度正常: {info}" + + results.append(run_test("GPU - Ray GPU Scheduling", test_ray_gpu_scheduling)) + + def test_driver_torch_cuda() -> str: + if not module_exists("torch"): + if args.require_gpu: + raise RuntimeError("要求 GPU,但未安装 torch,无法验证 PyTorch CUDA") + raise SkipTest("未安装 torch,跳过 PyTorch CUDA 测试") + + import torch + + if not torch.cuda.is_available(): + if args.require_gpu: + raise RuntimeError("要求 GPU,但 torch.cuda.is_available() = False") + raise SkipTest("torch 已安装,但当前 driver 进程未检测到 CUDA") + + return ( + f"PyTorch CUDA 正常,device_count={torch.cuda.device_count()}, " + f"device_name={torch.cuda.get_device_name(0)}" + ) + + results.append(run_test("GPU - PyTorch CUDA", test_driver_torch_cuda)) + + # ------------------------------------------------------------------------- + # Ray Data + # ------------------------------------------------------------------------- + + if not args.skip_data: + + def test_ray_data() -> str: + if not module_exists("ray.data"): + raise SkipTest("未安装 Ray Data,请安装 ray[data] 或 ray[all]") + + import ray.data + + ds = ray.data.from_items([{"x": i} for i in range(10)]) + mapped = ds.map(lambda row: {"x": row["x"], "y": row["x"] * 2}) + rows = mapped.take_all() + + total_y = sum(int(row["y"]) for row in rows) + + assert len(rows) == 10 + assert total_y == 90 + + return f"✓ 通过|| Ray Data 正常,rows={len(rows)}, sum_y={total_y}" + + results.append(run_test("Ray Data", test_ray_data)) + + # ------------------------------------------------------------------------- + # Ray Tune + # ------------------------------------------------------------------------- + + if not args.skip_tune: + + def test_ray_tune() -> str: + if not module_exists("ray.tune"): + raise SkipTest("未安装 Ray Tune,请安装 ray[tune] 或 ray[all]") + + from ray import tune + from ray.tune import RunConfig + + temp_dir = tempfile.mkdtemp(prefix="ray_verify_tune_") + + def trainable(config: dict[str, Any]) -> None: + # 在 Ray 2.5x 中,Tune function trainable 推荐使用 ray.train.report。 + from ray import tune + + score = config["x"] * 2 + tune.report({"score": score}) + + tuner = tune.Tuner( + trainable, + param_space={ + "x": tune.grid_search([1, 2, 3]), + }, + run_config=RunConfig( + name="ray_verify_tune", + storage_path=temp_dir + ), + ) + + result_grid = tuner.fit() + + if result_grid.errors: + raise RuntimeError(f"Tune trials 出现错误: {result_grid.errors}") + + best = result_grid.get_best_result(metric="score", mode="max") + best_score = best.metrics.get("score") + + assert best_score == 6, f"best_score 不符合预期: {best_score}" + + return f"✓ 通过|| Ray Tune 正常,best_score={best_score}, path={temp_dir}" + + results.append(run_test("Ray Tune", test_ray_tune)) + + # ------------------------------------------------------------------------- + # Ray Train + # ------------------------------------------------------------------------- + + if not args.skip_train: + + def test_ray_train_torch() -> str: + if not module_exists("ray.train"): + raise SkipTest("未安装 Ray Train,请安装 ray[train] 或 ray[all]") + + if not module_exists("torch"): + raise SkipTest("未安装 torch,跳过 TorchTrainer 测试") + + import torch + from ray.train import Checkpoint, RunConfig, ScalingConfig + from ray.train.torch import TorchTrainer + + total_cpu = int(float(ray.cluster_resources().get("CPU", 1))) + num_workers = max(1, min(args.train_workers, max(1, total_cpu))) + + use_gpu = bool(args.train_use_gpu) + + if use_gpu: + gpu_count = float(ray.cluster_resources().get("GPU", 0)) + + if gpu_count < num_workers: + raise RuntimeError( + f"Ray GPU 数量不足,要求 train_workers={num_workers}, " + f"实际 GPU={gpu_count}" + ) + + temp_dir = tempfile.mkdtemp(prefix="ray_verify_train_") + + def train_loop_per_worker(config: dict[str, Any] | None = None) -> None: + import os + import tempfile + import torch + + from ray import train + from ray.train import Checkpoint + + ctx = train.get_context() + world_rank = ctx.get_world_rank() + + x = torch.tensor([[0.0], [1.0], [2.0], [3.0]]) + y = 2.0 * x + 1.0 + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + model = torch.nn.Linear(1, 1).to(device) + optimizer = torch.optim.SGD(model.parameters(), lr=0.05) + loss_fn = torch.nn.MSELoss() + + x = x.to(device) + y = y.to(device) + + last_loss = None + + for _ in range(50): + pred = model(x) + loss = loss_fn(pred, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + last_loss = float(loss.detach().cpu().item()) + + metrics = { + "loss": last_loss, + "device": str(device), + "world_rank": world_rank, + } + + # Ray Train V2 对 metrics 的持久化更依赖 checkpoint。 + # rank 0 保存 checkpoint,并把 loss 写进 checkpoint 文件,外部可稳定读取。 + if world_rank == 0: + with tempfile.TemporaryDirectory() as checkpoint_dir: + checkpoint_path = os.path.join(checkpoint_dir, "state.pt") + torch.save(metrics, checkpoint_path) + + train.report( + metrics, + checkpoint=Checkpoint.from_directory(checkpoint_dir), + ) + else: + train.report(metrics) + + trainer = TorchTrainer( + train_loop_per_worker=train_loop_per_worker, + scaling_config=ScalingConfig( + num_workers=num_workers, + use_gpu=use_gpu, + ), + run_config=RunConfig( + name="ray_verify_train", + storage_path=temp_dir, + ), + ) + + result = trainer.fit() + + metrics = getattr(result, "metrics", {}) or {} + loss = metrics.get("loss") + device = metrics.get("device") + + # 某些 Ray Train V2 场景下 result.metrics 可能为空; + # 因此从 checkpoint 中兜底读取 loss。 + if loss is None: + checkpoint = getattr(result, "checkpoint", None) + + if checkpoint is not None: + with checkpoint.as_directory() as checkpoint_dir: + checkpoint_path = Path(checkpoint_dir) / "state.pt" + + if checkpoint_path.exists(): + payload = torch.load( + checkpoint_path, + map_location="cpu", + weights_only=False, + ) + + if isinstance(payload, dict): + loss = payload.get("loss") + device = payload.get("device") + + if loss is None: + raise RuntimeError( + f"Train 结果中没有 loss,metrics={metrics}, " + f"checkpoint={getattr(result, 'checkpoint', None)}" + ) + + if float(loss) > 1.0: + raise RuntimeError(f"训练 loss 偏高,loss={loss}") + + return ( + f"✓ 通过|| Ray Train TorchTrainer 正常,workers={num_workers}, " + f"use_gpu={use_gpu}, device={device}, loss={float(loss):.6f}, " + f"path={temp_dir}" + ) + + results.append(run_test("Ray Train - TorchTrainer", test_ray_train_torch)) + + # ------------------------------------------------------------------------- + # Ray Serve + # ------------------------------------------------------------------------- + + if not args.skip_serve: + + def test_ray_serve() -> str: + if not module_exists("ray.serve"): + raise SkipTest("未安装 Ray Serve,请安装 ray[serve] 或 ray[all]") + + from ray import serve + + try: + serve.shutdown() + except Exception: + pass + + @serve.deployment(ray_actor_options={"num_cpus": 0}) + class EchoDeployment: + async def __call__(self, value: str = "hello") -> str: + return f"serve:{value}" + + handle = serve.run( + EchoDeployment.bind(), + name="ray_verify_serve_app", + route_prefix="/ray-verify", + ) + + # 新版 Ray Serve 的 handle.remote() 返回 DeploymentResponse, + # 不是普通 ObjectRef,因此不能 ray.get(handle.remote(...))。 + response = handle.remote("ok") + result = response.result(timeout_s=30) + + try: + serve.shutdown() + except Exception: + pass + + assert result == "serve:ok", f"Serve 返回不符合预期: {result}" + + return "✓ 通过|| Ray Serve 正常,DeploymentResponse.result() 调用成功" + + results.append(run_test("Ray Serve", test_ray_serve)) + + # ------------------------------------------------------------------------- + # RLlib + # ------------------------------------------------------------------------- + + if not args.skip_rllib: + + def test_rllib() -> str: + if not args.full: + raise SkipTest("RLlib 测试较重,使用 --full 开启") + + if not module_exists("ray.rllib"): + raise SkipTest("未安装 RLlib,请安装 ray[rllib] 或 ray[all]") + + if not module_exists("gymnasium"): + raise SkipTest("未安装 gymnasium,RLlib CartPole 测试跳过") + + if not module_exists("torch"): + raise SkipTest("未安装 torch,RLlib PPO torch 测试跳过") + + from ray.rllib.algorithms.ppo import PPOConfig + + config = PPOConfig() + config = config.environment("CartPole-v1") + + if hasattr(config, "framework"): + config = config.framework("torch") + + # Ray 2.55 默认使用新 API stack。 + # 新版本用 env_runners;旧版本用 rollouts。 + if hasattr(config, "env_runners"): + config = config.env_runners(num_env_runners=0) + else: + config = config.rollouts(num_rollout_workers=0) + + # 兼容新旧训练参数命名。 + try: + config = config.training( + train_batch_size_per_learner=64, + minibatch_size=32, + num_epochs=1, + lr=1e-3, + ) + except TypeError: + config = config.training( + train_batch_size=64, + sgd_minibatch_size=32, + num_sgd_iter=1, + lr=1e-3, + ) + + if hasattr(config, "build_algo"): + algo = config.build_algo() + else: + algo = config.build() + + try: + train_result = algo.train() + finally: + algo.stop() + + episode_reward_mean = train_result.get("episode_reward_mean") + + return ( + "✓ 通过|| RLlib PPO 正常完成一次训练," + f"episode_reward_mean={episode_reward_mean}" + ) + + results.append(run_test("RLlib", test_rllib)) + + # ------------------------------------------------------------------------- + # Summary + # ------------------------------------------------------------------------- + + print("\n" + "=" * 80) + print("验证结果汇总") + print("=" * 80) + + status_counts = { + "PASS": sum(1 for r in results if r.status == "PASS"), + "SKIP": sum(1 for r in results if r.status == "SKIP"), + "FAIL": sum(1 for r in results if r.status == "FAIL"), + } + + for r in results: + print(f"{r.status:4} | {r.seconds:7.2f}s | {r.name} | {r.message}") + + print_json("Status counts:", status_counts) + + try: + ray.shutdown() + except Exception: + pass + + if status_counts["FAIL"] > 0: + print("\n✗ 失败|| 最终结果:FAILED") + sys.exit(1) + + print("\n✓ 通过|| 最终结果:PASSED") + sys.exit(0) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/frameworks/Ray/2.54.0/test_result.png b/frameworks/Ray/2.54.0/test_result.png new file mode 100644 index 0000000000000000000000000000000000000000..79d62ee63e66dc5feee9dab21ce147449c48f422 Binary files /dev/null and b/frameworks/Ray/2.54.0/test_result.png differ diff --git a/frameworks/Ray/2.54.1/Dockerfile b/frameworks/Ray/2.54.1/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..d9405b0f93e7b3908cf9b45acf47f830205f50b8 --- /dev/null +++ b/frameworks/Ray/2.54.1/Dockerfile @@ -0,0 +1,92 @@ +FROM opencloudos/opencloudos9-cuda-devel:12.8 + +LABEL maintainer="stronking 363133710@qq.com" +LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" +LABEL org.opencontainers.image.description="Ray all components + Torch GPU on OpenCloudOS 9 CUDA 12.8" + +# ========================= +# 版本参数 +# ========================= +ARG RAY_VERSION=2.54.1 +ARG TORCH_VERSION=2.11.0 +ARG TORCHVISION_VERSION=0.26.0 +ARG TORCHAUDIO_VERSION=2.11.0 +ARG PYTORCH_INDEX_URL=https://download.pytorch.org/whl/cu128 + +# ========================= +# 基础环境变量 +# ========================= +ENV PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,utility \ + RAY_DISABLE_USAGE_STATS=1 + +# ========================= +# Ray 集群默认环境变量 +# 这些变量会被 start-ray.sh 使用 +# ========================= +ENV RAY_NODE_TYPE=single \ + RAY_HEAD_ADDRESS="" \ + RAY_NODE_IP_ADDRESS="" \ + RAY_HEAD_PORT=6379 \ + RAY_DASHBOARD_HOST=0.0.0.0 \ + RAY_DASHBOARD_PORT=8265 \ + RAY_CLIENT_SERVER_PORT=10001 \ + RAY_SERVE_HTTP_PORT=8000 \ + RAY_NODE_MANAGER_PORT=8076 \ + RAY_OBJECT_MANAGER_PORT=8077 \ + RAY_RUNTIME_ENV_AGENT_PORT=8078 \ + RAY_DASHBOARD_AGENT_GRPC_PORT=8079 \ + RAY_DASHBOARD_AGENT_LISTEN_PORT=8080 \ + RAY_METRICS_EXPORT_PORT=8081 \ + RAY_MIN_WORKER_PORT=10002 \ + RAY_MAX_WORKER_PORT=10100 \ + RAY_TEMP_DIR=/tmp/ray \ + RAY_NUM_CPUS="" \ + RAY_NUM_GPUS="" \ + RAY_OBJECT_STORE_MEMORY="" \ + RAY_RESOURCES="" + +WORKDIR /home + +# ========================= +# 安装 Ray 全组件 +# ========================= +RUN python3 -m pip install \ + "ray[all,client,serve-grpc]==${RAY_VERSION}" + +# ========================= +# 安装 PyTorch GPU 版本 +# 注意:如果 torch==2.11.0 / torchvision==0.26.0 当前源里不存在,构建会失败。 +# 可以通过 docker build --build-arg 修改版本。 +# ========================= +RUN python3 -m pip install \ + torch==${TORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url ${PYTORCH_INDEX_URL} + +# ========================= +# 拷贝测试脚本和启动脚本 +# ========================= +COPY ./test_ray.py /home/test_ray.py +COPY ./start-ray.sh /usr/local/bin/start-ray.sh + +RUN chmod +x /usr/local/bin/start-ray.sh + + +# ========================= +# Ray 常用端口 +# 6379 : Ray Head / GCS +# 8265 : Ray Dashboard +# 10001 : Ray Client +# 8000 : Ray Serve HTTP +# 8076-8081 : 固定 Ray 内部组件端口 +# 10002-10100 : Ray Worker 端口范围 +# ========================= +EXPOSE 6379 8265 10001 8000 8076 8077 8078 8079 8080 8081 10002-10100 + +#ENTRYPOINT ["/usr/local/bin/start-ray.sh"] 在正式merge后,需要将这行注释打开 +CMD ["/usr/local/bin/start-ray.sh"] \ No newline at end of file diff --git a/frameworks/Ray/2.54.1/README.md b/frameworks/Ray/2.54.1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7bbf644328448c9dad1894f1060e581dedf7e146 --- /dev/null +++ b/frameworks/Ray/2.54.1/README.md @@ -0,0 +1,409 @@ +# Ray 2.55.1 + Torch 2.11.0 on OpenCloudOS 9 + +## 基本信息 + +- **Ray 版本**:v2.54.1 +- **Torch 版本**:v2.11.0 +- **TorchVision 版本**:v0.26.0 +- **TorchAudio 版本**:v2.11.0 +- **基础镜像**:opencloudos/opencloudos9-cuda-devel:12.8 +- **Python 版本**:3.11 +- **CUDA 版本**:12.8 +- **GPU 支持**:NVIDIA GPU / CUDA +- **Ray 安装组件**:ray[all,client,serve-grpc] + +## 适用场景 + +- Ray 单机任务调度 +- Ray 多机集群任务调度 +- Ray Data 数据处理 +- Ray Train 分布式训练 +- Ray Tune 参数调优 +- Ray Serve 模型服务 +- Ray RLlib 强化学习 +- PyTorch GPU 训练 / 推理 + +--- + +## 构建 + +```bash +docker build -t oc9-ray:2.54.1 . +``` + +也可以通过构建参数指定 Ray 和 Torch 版本: + +```bash +docker build \ + --build-arg RAY_VERSION=2.54.1 \ + --build-arg TORCH_VERSION=2.11.0 \ + --build-arg TORCHVISION_VERSION=0.26.0 \ + --build-arg TORCHAUDIO_VERSION=2.11.0 \ + -t oc9-ray:2.54.1 . +``` + +--- + +## 镜像启动命令 + +### 单机模式启动 + +单机模式会在当前容器内启动一个 Ray Head 节点,适合本地开发、单机测试、单机 GPU 任务。 + +```bash +docker run -d \ + --gpus all \ + --shm-size=8g \ + --name oc9-ray-single \ + -e RAY_NODE_TYPE=single \ + -p 6379:6379 \ + -p 8265:8265 \ + -p 10001:10001 \ + -p 8000:8000 \ + oc9-ray:2.54.1 +``` + +查看 Ray Dashboard: + +```text +http://宿主机IP:8265 +``` + +进入容器: + +```bash +docker exec -it oc9-ray-single bash +``` + +查看 Ray 集群状态: + +```bash +ray status +``` + +--- + +### 多机集群模式启动 + +Ray 多机集群由一个 Head 节点和多个 Worker 节点组成。 + +推荐使用 `--network=host`,避免 Docker bridge 网络导致 Ray 节点之间无法互相访问。 + +假设机器 IP 如下: + +| 角色 | IP | +|---|---| +| Head 节点 | 192.168.1.10 | +| Worker 节点 1 | 192.168.1.11 | +| Worker 节点 2 | 192.168.1.12 | + +--- + +#### 启动 Head 节点 + +在 `192.168.1.10` 上执行: + +```bash +docker run -d \ + --gpus all \ + --network=host \ + --shm-size=8g \ + --name oc9-ray-head \ + -e RAY_NODE_TYPE=head \ + -e RAY_NODE_IP_ADDRESS=192.168.1.10 \ + -e RAY_NUM_GPUS=1 \ + oc9-ray:2.54.1 +``` + +查看 Head 节点日志: + +```bash +docker logs -f oc9-ray-head +``` + +查看 Dashboard: + +```text +http://192.168.1.10:8265 +``` + +--- + +#### 启动 Worker 节点 + +在 `192.168.1.11` 上执行: + +```bash +docker run -d \ + --gpus all \ + --network=host \ + --shm-size=8g \ + --name oc9-ray-worker-1 \ + -e RAY_NODE_TYPE=worker \ + -e RAY_HEAD_ADDRESS=192.168.1.10:6379 \ + -e RAY_NODE_IP_ADDRESS=192.168.1.11 \ + -e RAY_NUM_GPUS=1 \ + oc9-ray:2.54.1 +``` + +在 `192.168.1.12` 上执行: + +```bash +docker run -d \ + --gpus all \ + --network=host \ + --shm-size=8g \ + --name oc9-ray-worker-2 \ + -e RAY_NODE_TYPE=worker \ + -e RAY_HEAD_ADDRESS=192.168.1.10:6379 \ + -e RAY_NODE_IP_ADDRESS=192.168.1.12 \ + -e RAY_NUM_GPUS=1 \ + oc9-ray:2.54.1 +``` + +查看 Worker 节点日志: + +```bash +docker logs -f oc9-ray-worker-1 +docker logs -f oc9-ray-worker-2 +``` + +--- + +## 镜像测试命令 + +### 单机完整测试 + +```bash +docker run --rm \ + --gpus all \ + --shm-size=8g \ + -e RAY_NODE_TYPE=test \ + oc9-ray:2.54.1 \ + python3 test_ray.py --full --require-gpu +``` + +### 在已有 Head 节点中测试 + +```bash +docker exec -it oc9-ray-head bash +python3 /home/test_ray.py --address auto --full --require-gpu +``` + +### 通过 Ray Client 连接测试 + +```bash +python3 test_ray.py --address ray://192.168.1.10:10001 --full --require-gpu +``` + +--- + +## 常用端口 + +| 端口 | 说明 | +|---:|---| +| 6379 | Ray Head / GCS 端口 | +| 8265 | Ray Dashboard 端口 | +| 10001 | Ray Client 端口 | +| 8000 | Ray Serve HTTP 端口 | +| 8076 | Ray Node Manager 端口 | +| 8077 | Ray Object Manager 端口 | +| 8078 | Ray Runtime Env Agent 端口 | +| 8079 | Ray Dashboard Agent gRPC 端口 | +| 8080 | Ray Dashboard Agent HTTP 端口 | +| 8081 | Ray Metrics Export 端口 | +| 10002-10100 | Ray Worker 进程端口范围 | + +--- + +## 环境变量说明 + +| 环境变量 | 默认值 | 示例 | 说明 | +|---|---|---|---| +| `RAY_NODE_TYPE` | `single` | `head` / `worker` / `single` / `test` | 容器启动模式。`head` 表示 Head 节点,`worker` 表示 Worker 节点,`single` 表示单机模式,`test` 表示执行测试脚本 | +| `RAY_HEAD_ADDRESS` | 空 | `192.168.1.10:6379` | Worker 节点连接 Head 节点的地址。`RAY_NODE_TYPE=worker` 时必填 | +| `RAY_NODE_IP_ADDRESS` | 自动获取 | `192.168.1.11` | 当前节点对其他 Ray 节点可访问的 IP。多机部署时建议显式指定 | +| `RAY_HEAD_PORT` | `6379` | `6379` | Ray Head / GCS 监听端口 | +| `RAY_DASHBOARD_HOST` | `0.0.0.0` | `0.0.0.0` | Ray Dashboard 监听地址。容器中建议设置为 `0.0.0.0` | +| `RAY_DASHBOARD_PORT` | `8265` | `8265` | Ray Dashboard 端口 | +| `RAY_CLIENT_SERVER_PORT` | `10001` | `10001` | Ray Client 连接端口 | +| `RAY_SERVE_HTTP_PORT` | `8000` | `8000` | Ray Serve HTTP 服务端口 | +| `RAY_NODE_MANAGER_PORT` | `8076` | `8076` | Ray Node Manager 固定端口 | +| `RAY_OBJECT_MANAGER_PORT` | `8077` | `8077` | Ray Object Manager 固定端口 | +| `RAY_RUNTIME_ENV_AGENT_PORT` | `8078` | `8078` | Ray Runtime Env Agent 固定端口 | +| `RAY_DASHBOARD_AGENT_GRPC_PORT` | `8079` | `8079` | Ray Dashboard Agent gRPC 端口 | +| `RAY_DASHBOARD_AGENT_LISTEN_PORT` | `8080` | `8080` | Ray Dashboard Agent HTTP 端口 | +| `RAY_METRICS_EXPORT_PORT` | `8081` | `8081` | Ray Metrics 指标暴露端口 | +| `RAY_MIN_WORKER_PORT` | `10002` | `10002` | Ray Worker 进程端口范围下限 | +| `RAY_MAX_WORKER_PORT` | `10100` | `10100` | Ray Worker 进程端口范围上限 | +| `RAY_NUM_CPUS` | 自动检测 | `20` | 手动指定当前节点可用 CPU 数量 | +| `RAY_NUM_GPUS` | 自动检测 | `1` | 手动指定当前节点可用 GPU 数量 | +| `RAY_OBJECT_STORE_MEMORY` | 自动计算 | `8589934592` | Ray Object Store 内存大小,单位为 bytes | +| `RAY_RESOURCES` | 空 | `'{"worker": 1}'` | 自定义 Ray 资源标签 | +| `RAY_TEMP_DIR` | `/tmp/ray` | `/tmp/ray` | Ray 临时文件目录 | +| `RAY_DISABLE_USAGE_STATS` | `1` | `1` | 禁用 Ray 使用统计上报 | +| `NVIDIA_VISIBLE_DEVICES` | `all` | `all` / `0` / `0,1` | 指定容器可见 GPU | +| `NVIDIA_DRIVER_CAPABILITIES` | `compute,utility` | `compute,utility` | NVIDIA 容器运行能力,GPU 计算通常需要 `compute`,`nvidia-smi` 需要 `utility` | + +--- + +## 多机协同任务测试 + +进入 Head 容器: + +```bash +docker exec -it oc9-ray-head bash +``` + +执行: + +```bash +python3 - <<'PY' +import socket +import ray + +ray.init(address="auto") + +@ray.remote +def task(i): + return { + "task": i, + "host": socket.gethostname(), + "node_id": ray.get_runtime_context().get_node_id(), + } + +refs = [task.remote(i) for i in range(50)] +results = ray.get(refs) + +for item in results[:20]: + print(item) + +print("cluster_resources:", ray.cluster_resources()) +PY +``` + +如果 Worker 节点加入成功,`cluster_resources` 中会显示多台节点的 CPU / GPU 资源。 + +--- + +## GPU 验证 + +```bash +docker run --rm \ + --gpus all \ + --shm-size=8g \ + oc9-ray:2.55.1 \ + bash -c "nvidia-smi && python3 -c 'import torch; print(torch.__version__); print(torch.cuda.is_available()); print(torch.cuda.get_device_name(0))'" +``` + +预期输出中应包含: + +```text +True +NVIDIA ... +``` + +--- + +## 版本检查 + +进入容器后执行: + +```bash +python3 - <<'PY' +import ray +import torch + +print("Ray:", ray.__version__) +print("Torch:", torch.__version__) +print("Torch CUDA:", torch.version.cuda) +print("CUDA available:", torch.cuda.is_available()) + +if torch.cuda.is_available(): + print("GPU:", torch.cuda.get_device_name(0)) +PY +``` + +--- + +## 注意事项 + +1. Docker 默认 `/dev/shm` 只有 64MB,Ray Object Store 会受到影响,建议启动容器时增加: + +```bash +--shm-size=8g +``` + +2. 多机部署建议使用: + +```bash +--network=host +``` + +3. Head 和 Worker 节点的 Ray 版本、Python 版本、CUDA 版本、PyTorch 版本应保持一致。 + +4. Worker 节点必须能访问 Head 节点的 `6379` 端口。 + +5. 多机 Docker bridge 网络模式下,Ray 可能识别到容器内网 IP,导致其他机器无法访问,因此生产部署建议显式设置: + +```bash +-e RAY_NODE_IP_ADDRESS=当前宿主机IP +``` + +6. 生产环境建议固定 Ray 内部端口和 Worker 端口范围,便于配置防火墙、安全组和网络策略。 + +7. 使用 GPU 时,宿主机必须安装 NVIDIA Driver 和 NVIDIA Container Toolkit。 + +8. 如果 Ray Dashboard 无法访问,请确认启动参数中包含: + +```bash +-e RAY_DASHBOARD_HOST=0.0.0.0 +``` + +9. 如果 Worker 节点无法加入集群,请优先检查: + - Head 节点 IP 是否正确 + - `RAY_HEAD_ADDRESS` 是否正确 + - 防火墙是否放通 `6379` + - 是否使用了 `--network=host` + - `RAY_NODE_IP_ADDRESS` 是否设置为宿主机可访问 IP + +--- + +## 常见命令 + +查看 Ray 状态: + +```bash +ray status +``` + +查看 Ray 任务: + +```bash +ray list tasks +``` + +查看 Ray 节点: + +```bash +ray list nodes +``` + +停止 Ray: + +```bash +ray stop --force +``` + +查看容器日志: + +```bash +docker logs -f oc9-ray-head +docker logs -f oc9-ray-worker-1 +``` + +删除容器: + +```bash +docker rm -f oc9-ray-head oc9-ray-worker-1 oc9-ray-worker-2 +``` \ No newline at end of file diff --git a/frameworks/Ray/2.54.1/build.conf b/frameworks/Ray/2.54.1/build.conf new file mode 100644 index 0000000000000000000000000000000000000000..1e89c00e63a3f3e05097cdfaeb611ab0e0e0ec44 --- /dev/null +++ b/frameworks/Ray/2.54.1/build.conf @@ -0,0 +1,4 @@ +# Ray 2.54.1 + PyTorch 2.11.0 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-ray +IMAGE_TAG=2.54.1 +GPU_TEST=true \ No newline at end of file diff --git a/frameworks/Ray/2.54.1/start-ray.sh b/frameworks/Ray/2.54.1/start-ray.sh new file mode 100644 index 0000000000000000000000000000000000000000..fe9d7a1750dda9f6fd6655ad7962ea36d7d166b7 --- /dev/null +++ b/frameworks/Ray/2.54.1/start-ray.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash +set -euo pipefail + +echo "============================================================" +echo "Ray Container Startup" +echo "============================================================" +echo "RAY_NODE_TYPE=${RAY_NODE_TYPE:-single}" +echo "RAY_HEAD_ADDRESS=${RAY_HEAD_ADDRESS:-}" +echo "RAY_NODE_IP_ADDRESS=${RAY_NODE_IP_ADDRESS:-}" +echo "RAY_HEAD_PORT=${RAY_HEAD_PORT:-6379}" +echo "RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265}" +echo "RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001}" +echo "RAY_MIN_WORKER_PORT=${RAY_MIN_WORKER_PORT:-10002}" +echo "RAY_MAX_WORKER_PORT=${RAY_MAX_WORKER_PORT:-10100}" +echo "NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-}" +echo "============================================================" + +get_node_ip() { + if [[ -n "${RAY_NODE_IP_ADDRESS:-}" ]]; then + echo "${RAY_NODE_IP_ADDRESS}" + else + hostname -I | awk '{print $1}' + fi +} + +build_common_ray_args() { + local args=() + + args+=("--node-ip-address=$(get_node_ip)") + args+=("--node-manager-port=${RAY_NODE_MANAGER_PORT:-8076}") + args+=("--object-manager-port=${RAY_OBJECT_MANAGER_PORT:-8077}") + args+=("--runtime-env-agent-port=${RAY_RUNTIME_ENV_AGENT_PORT:-8078}") + args+=("--dashboard-agent-grpc-port=${RAY_DASHBOARD_AGENT_GRPC_PORT:-8079}") + args+=("--dashboard-agent-listen-port=${RAY_DASHBOARD_AGENT_LISTEN_PORT:-8080}") + args+=("--metrics-export-port=${RAY_METRICS_EXPORT_PORT:-8081}") + args+=("--min-worker-port=${RAY_MIN_WORKER_PORT:-10002}") + args+=("--max-worker-port=${RAY_MAX_WORKER_PORT:-10100}") + args+=("--temp-dir=${RAY_TEMP_DIR:-/tmp/ray}") + + if [[ -n "${RAY_NUM_CPUS:-}" ]]; then + args+=("--num-cpus=${RAY_NUM_CPUS}") + fi + + if [[ -n "${RAY_NUM_GPUS:-}" ]]; then + args+=("--num-gpus=${RAY_NUM_GPUS}") + fi + + if [[ -n "${RAY_OBJECT_STORE_MEMORY:-}" ]]; then + args+=("--object-store-memory=${RAY_OBJECT_STORE_MEMORY}") + fi + + if [[ -n "${RAY_RESOURCES:-}" ]]; then + args+=("--resources=${RAY_RESOURCES}") + fi + + printf '%s\n' "${args[@]}" +} + +start_head() { + echo "[INFO] Starting Ray HEAD node..." + + ray stop --force || true + + mapfile -t COMMON_ARGS < <(build_common_ray_args) + + exec ray start \ + --head \ + --port="${RAY_HEAD_PORT:-6379}" \ + --dashboard-host="${RAY_DASHBOARD_HOST:-0.0.0.0}" \ + --dashboard-port="${RAY_DASHBOARD_PORT:-8265}" \ + --ray-client-server-port="${RAY_CLIENT_SERVER_PORT:-10001}" \ + "${COMMON_ARGS[@]}" \ + --block +} + +start_worker() { + echo "[INFO] Starting Ray WORKER node..." + + if [[ -z "${RAY_HEAD_ADDRESS:-}" ]]; then + echo "[ERROR] RAY_HEAD_ADDRESS is required for worker node." + echo "Example: RAY_HEAD_ADDRESS=192.168.1.10:6379" + exit 1 + fi + + ray stop --force || true + + mapfile -t COMMON_ARGS < <(build_common_ray_args) + + exec ray start \ + --address="${RAY_HEAD_ADDRESS}" \ + "${COMMON_ARGS[@]}" \ + --block +} + +start_single() { + echo "[INFO] Starting Ray SINGLE node..." + + ray stop --force || true + + mapfile -t COMMON_ARGS < <(build_common_ray_args) + + exec ray start \ + --head \ + --port="${RAY_HEAD_PORT:-6379}" \ + --dashboard-host="${RAY_DASHBOARD_HOST:-0.0.0.0}" \ + --dashboard-port="${RAY_DASHBOARD_PORT:-8265}" \ + --ray-client-server-port="${RAY_CLIENT_SERVER_PORT:-10001}" \ + "${COMMON_ARGS[@]}" \ + --block +} + +run_test() { + echo "[INFO] Running Ray test script..." + python3 /home/test_ray.py "$@" +} + +case "${RAY_NODE_TYPE:-single}" in + head) + start_head + ;; + + worker) + start_worker + ;; + + single) + start_single + ;; + + test) + shift || true + run_test "$@" + ;; + + bash|shell) + exec /bin/bash + ;; + + *) + echo "[INFO] Executing custom command: $*" + exec "$@" + ;; +esac \ No newline at end of file diff --git a/frameworks/Ray/2.54.1/test.sh b/frameworks/Ray/2.54.1/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..00bb47c4995c855b1a5aeaf9aa7103954cbeb971 --- /dev/null +++ b/frameworks/Ray/2.54.1/test.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +IMAGE="${1:-}" + +if [ -z "${IMAGE}" ]; then + echo "用法: bash test.sh <镜像名:标签>" + exit 1 +fi + +if ! command -v docker >/dev/null 2>&1; then + echo "✗ 未找到 docker" + exit 1 +fi + +echo "=== Ray 2.54.1 容器基础功能测试 ===" + +echo "测试镜像: ${IMAGE}" + +docker run --rm --gpus all --entrypoint python3 "${IMAGE}" /home/test_ray.py --full --require-gpu \ No newline at end of file diff --git a/frameworks/Ray/2.54.1/test_ray.py b/frameworks/Ray/2.54.1/test_ray.py new file mode 100644 index 0000000000000000000000000000000000000000..7054a443e1332f5806d0e8025fd3cca6fd04d198 --- /dev/null +++ b/frameworks/Ray/2.54.1/test_ray.py @@ -0,0 +1,784 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +verify_ray_full_fixed.py + +Ray 全组件验证脚本,适配 Ray 2.55.x。 + +验证内容: +- Ray Core: init / remote task / actor / object store / wait / runtime_env / placement group +- GPU: Ray GPU scheduling / CUDA_VISIBLE_DEVICES / nvidia-smi / PyTorch CUDA +- Ray Data +- Ray Tune +- Ray Train TorchTrainer +- Ray Serve +- RLlib,可用 --full 开启 + +常用运行方式: + python verify_ray_full_fixed.py + + python verify_ray_full_fixed.py --full + + python verify_ray_full_fixed.py --require-gpu + + python verify_ray_full_fixed.py --address auto + + python verify_ray_full_fixed.py --address ray://127.0.0.1:10001 + +Docker GPU 示例: + docker run --rm -it \ + --gpus all \ + --shm-size=8g \ + your-ray-image \ + python /app/verify_ray_full_fixed.py --full --require-gpu +""" + +from __future__ import annotations + +import argparse +import importlib.util +import json +import os +import socket +import subprocess +import sys +import tempfile +import time +import traceback +from dataclasses import dataclass +from pathlib import Path +from typing import Callable, Any + + +# Ray Train V2 在新版本 Ray 中是默认方向。 +# 这里显式设置,方便结果稳定,也避免部分迁移提示。 +os.environ.setdefault("RAY_TRAIN_V2_ENABLED", "1") + + +class SkipTest(Exception): + """表示当前环境不满足该组件测试条件,测试跳过。""" + + +@dataclass +class TestResult: + name: str + status: str + message: str + seconds: float + + +def module_exists(name: str) -> bool: + return importlib.util.find_spec(name) is not None + + +def print_json(title: str, data: Any) -> None: + print(f"\n{title}") + print(json.dumps(data, indent=2, ensure_ascii=False, default=str)) + + +def run_test(name: str, func: Callable[[], str]) -> TestResult: + start = time.time() + + try: + msg = func() + seconds = time.time() - start + print(f"[PASS] {name} - {msg}") + return TestResult(name, "PASS", msg, seconds) + + except SkipTest as exc: + seconds = time.time() - start + print(f"[SKIP] {name} - {exc}") + return TestResult(name, "SKIP", str(exc), seconds) + + except Exception as exc: + seconds = time.time() - start + print(f"[FAIL] {name} - {exc}") + traceback.print_exc() + return TestResult(name, "FAIL", str(exc), seconds) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Verify Ray full installation and runtime.") + + parser.add_argument( + "--address", + default=os.environ.get("RAY_ADDRESS"), + help=( + "Ray 地址。留空表示本地启动;" + "auto 表示连接已有本地 Ray 集群;" + "ray://host:10001 表示 Ray Client。" + ), + ) + + parser.add_argument( + "--full", + action="store_true", + help="开启更重的测试,例如 RLlib PPO。", + ) + + parser.add_argument( + "--require-gpu", + action="store_true", + help="要求 Ray 和 PyTorch 必须检测到 GPU,否则 GPU 测试失败。", + ) + + parser.add_argument( + "--train-workers", + type=int, + default=1, + help="Ray Train 使用的 worker 数量,默认 1。", + ) + + parser.add_argument( + "--train-use-gpu", + action="store_true", + help="Ray Train 测试是否使用 GPU。需要 Ray 检测到足够 GPU。", + ) + + parser.add_argument( + "--skip-data", + action="store_true", + help="跳过 Ray Data 测试。", + ) + + parser.add_argument( + "--skip-tune", + action="store_true", + help="跳过 Ray Tune 测试。", + ) + + parser.add_argument( + "--skip-train", + action="store_true", + help="跳过 Ray Train 测试。", + ) + + parser.add_argument( + "--skip-serve", + action="store_true", + help="跳过 Ray Serve 测试。", + ) + + parser.add_argument( + "--skip-rllib", + action="store_true", + help="跳过 RLlib 测试。", + ) + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + print("=" * 80) + print("Ray Full Verification - Fixed for Ray 2.55.x") + print("=" * 80) + print(f"Python: {sys.version}") + print(f"Host: {socket.gethostname()}") + print(f"PID: {os.getpid()}") + print(f"RAY_ADDRESS: {args.address or ''}") + print(f"RAY_TRAIN_V2_ENABLED: {os.environ.get('RAY_TRAIN_V2_ENABLED')}") + + if not module_exists("ray"): + print("\n[ERROR] 未安装 Ray。可执行:") + print(' pip install -U "ray[all]"') + sys.exit(2) + + import ray + + print(f"Ray version: {ray.__version__}") + + print("\n初始化 Ray ...") + + if args.address: + ray_info = ray.init(address=args.address) + else: + ray_info = ray.init() + + print(f"Ray initialized: {ray.is_initialized()}") + + try: + dashboard_url = getattr(ray_info, "dashboard_url", None) + if dashboard_url: + print(f"Dashboard URL: {dashboard_url}") + except Exception: + pass + + print_json("Cluster resources:", ray.cluster_resources()) + print_json("Available resources:", ray.available_resources()) + + results: list[TestResult] = [] + + # ------------------------------------------------------------------------- + # Ray Core + # ------------------------------------------------------------------------- + + def test_core_task() -> str: + @ray.remote + def square(x: int) -> int: + return x * x + + refs = [square.remote(i) for i in range(10)] + values = ray.get(refs) + expected = [i * i for i in range(10)] + + assert values == expected, f"✗ 失败||结果不符合预期: {values}" + + return f"✓ 通过||remote task 正常,结果={values}" + + results.append(run_test("Ray Core - Remote Task", test_core_task)) + + def test_actor() -> str: + @ray.remote + class Counter: + def __init__(self) -> None: + self.value = 0 + + def inc(self, n: int = 1) -> int: + self.value += n + return self.value + + def get(self) -> int: + return self.value + + counter = Counter.remote() + + assert ray.get(counter.inc.remote()) == 1 + assert ray.get(counter.inc.remote(5)) == 6 + assert ray.get(counter.get.remote()) == 6 + + return "✓ 通过||actor 状态保持正常" + + results.append(run_test("Ray Core - Actor", test_actor)) + + def test_object_store() -> str: + payload = { + "message": "hello ray object store", + "numbers": list(range(1000)), + } + + ref = ray.put(payload) + got = ray.get(ref) + + assert got == payload + + return f"✓ 通过|| ray.put/ray.get 正常,numbers={len(got['numbers'])}" + + results.append(run_test("Ray Core - Object Store", test_object_store)) + + def test_wait() -> str: + @ray.remote + def slow_identity(x: str, delay: float) -> str: + import time + + time.sleep(delay) + return x + + refs = [ + slow_identity.remote("fast", 0.2), + slow_identity.remote("slow", 1.0), + ] + + ready, remaining = ray.wait(refs, num_returns=1, timeout=5) + + assert len(ready) == 1 + assert len(remaining) == 1 + + first = ray.get(ready[0]) + assert first == "fast" + + ray.get(remaining) + + return "✓ 通过|| ray.wait 正常" + + results.append(run_test("Ray Core - ray.wait", test_wait)) + + def test_runtime_env() -> str: + @ray.remote(runtime_env={"env_vars": {"RAY_VERIFY_ENV": "OK"}}) + def read_env() -> str | None: + import os + + return os.environ.get("RAY_VERIFY_ENV") + + value = ray.get(read_env.remote()) + + assert value == "OK", f"runtime_env env_vars 未生效: {value}" + + return "✓ 通过|| runtime_env env_vars 正常" + + results.append(run_test("Ray Core - runtime_env", test_runtime_env)) + + def test_placement_group() -> str: + total_cpu = float(ray.cluster_resources().get("CPU", 0)) + + if total_cpu < 1: + raise SkipTest("集群 CPU 资源小于 1,跳过 placement group 测试") + + from ray.util.placement_group import placement_group, remove_placement_group + + pg = placement_group([{"CPU": 1}], strategy="PACK") + ray.get(pg.ready(), timeout=20) + + @ray.remote(num_cpus=1) + def pg_task() -> dict[str, Any]: + import os + + return { + "pid": os.getpid(), + "ok": True, + } + + try: + ref = pg_task.options(placement_group=pg).remote() + result = ray.get(ref) + finally: + remove_placement_group(pg) + + assert result["ok"] is True + + return f"✓ 通过|| placement group 正常,task pid={result['pid']}" + + results.append(run_test("Ray Core - Placement Group", test_placement_group)) + + # ------------------------------------------------------------------------- + # GPU + # ------------------------------------------------------------------------- + + def test_ray_gpu_scheduling() -> str: + gpu_count = float(ray.cluster_resources().get("GPU", 0)) + + if gpu_count <= 0: + if args.require_gpu: + raise RuntimeError("要求 GPU,但 Ray cluster_resources() 没有检测到 GPU") + raise SkipTest("Ray 未检测到 GPU,跳过 GPU 调度测试") + + @ray.remote(num_gpus=1) + def gpu_task() -> dict[str, Any]: + import os + import subprocess + + info: dict[str, Any] = { + "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES"), + "nvidia_smi": None, + "torch_cuda_available": None, + "torch_device_count": None, + "torch_device_name": None, + } + + try: + out = subprocess.check_output( + ["nvidia-smi"], + stderr=subprocess.STDOUT, + timeout=10, + ).decode("utf-8", errors="ignore") + info["nvidia_smi"] = out.splitlines()[0] if out else "EMPTY" + except Exception as exc: + info["nvidia_smi"] = f"nvidia-smi failed: {exc}" + + try: + import torch + + info["torch_cuda_available"] = torch.cuda.is_available() + info["torch_device_count"] = torch.cuda.device_count() + + if torch.cuda.is_available(): + info["torch_device_name"] = torch.cuda.get_device_name(0) + + except Exception as exc: + info["torch_cuda_available"] = f"torch unavailable: {exc}" + + return info + + info = ray.get(gpu_task.remote()) + + if not info.get("CUDA_VISIBLE_DEVICES"): + raise RuntimeError(f"Ray 分配了 GPU,但 CUDA_VISIBLE_DEVICES 为空: {info}") + + return f"✓ 通过|| Ray GPU 调度正常: {info}" + + results.append(run_test("GPU - Ray GPU Scheduling", test_ray_gpu_scheduling)) + + def test_driver_torch_cuda() -> str: + if not module_exists("torch"): + if args.require_gpu: + raise RuntimeError("要求 GPU,但未安装 torch,无法验证 PyTorch CUDA") + raise SkipTest("未安装 torch,跳过 PyTorch CUDA 测试") + + import torch + + if not torch.cuda.is_available(): + if args.require_gpu: + raise RuntimeError("要求 GPU,但 torch.cuda.is_available() = False") + raise SkipTest("torch 已安装,但当前 driver 进程未检测到 CUDA") + + return ( + f"PyTorch CUDA 正常,device_count={torch.cuda.device_count()}, " + f"device_name={torch.cuda.get_device_name(0)}" + ) + + results.append(run_test("GPU - PyTorch CUDA", test_driver_torch_cuda)) + + # ------------------------------------------------------------------------- + # Ray Data + # ------------------------------------------------------------------------- + + if not args.skip_data: + + def test_ray_data() -> str: + if not module_exists("ray.data"): + raise SkipTest("未安装 Ray Data,请安装 ray[data] 或 ray[all]") + + import ray.data + + ds = ray.data.from_items([{"x": i} for i in range(10)]) + mapped = ds.map(lambda row: {"x": row["x"], "y": row["x"] * 2}) + rows = mapped.take_all() + + total_y = sum(int(row["y"]) for row in rows) + + assert len(rows) == 10 + assert total_y == 90 + + return f"✓ 通过|| Ray Data 正常,rows={len(rows)}, sum_y={total_y}" + + results.append(run_test("Ray Data", test_ray_data)) + + # ------------------------------------------------------------------------- + # Ray Tune + # ------------------------------------------------------------------------- + + if not args.skip_tune: + + def test_ray_tune() -> str: + if not module_exists("ray.tune"): + raise SkipTest("未安装 Ray Tune,请安装 ray[tune] 或 ray[all]") + + from ray import tune + from ray.tune import RunConfig + + temp_dir = tempfile.mkdtemp(prefix="ray_verify_tune_") + + def trainable(config: dict[str, Any]) -> None: + # 在 Ray 2.5x 中,Tune function trainable 推荐使用 ray.train.report。 + from ray import tune + + score = config["x"] * 2 + tune.report({"score": score}) + + tuner = tune.Tuner( + trainable, + param_space={ + "x": tune.grid_search([1, 2, 3]), + }, + run_config=RunConfig( + name="ray_verify_tune", + storage_path=temp_dir + ), + ) + + result_grid = tuner.fit() + + if result_grid.errors: + raise RuntimeError(f"Tune trials 出现错误: {result_grid.errors}") + + best = result_grid.get_best_result(metric="score", mode="max") + best_score = best.metrics.get("score") + + assert best_score == 6, f"best_score 不符合预期: {best_score}" + + return f"✓ 通过|| Ray Tune 正常,best_score={best_score}, path={temp_dir}" + + results.append(run_test("Ray Tune", test_ray_tune)) + + # ------------------------------------------------------------------------- + # Ray Train + # ------------------------------------------------------------------------- + + if not args.skip_train: + + def test_ray_train_torch() -> str: + if not module_exists("ray.train"): + raise SkipTest("未安装 Ray Train,请安装 ray[train] 或 ray[all]") + + if not module_exists("torch"): + raise SkipTest("未安装 torch,跳过 TorchTrainer 测试") + + import torch + from ray.train import Checkpoint, RunConfig, ScalingConfig + from ray.train.torch import TorchTrainer + + total_cpu = int(float(ray.cluster_resources().get("CPU", 1))) + num_workers = max(1, min(args.train_workers, max(1, total_cpu))) + + use_gpu = bool(args.train_use_gpu) + + if use_gpu: + gpu_count = float(ray.cluster_resources().get("GPU", 0)) + + if gpu_count < num_workers: + raise RuntimeError( + f"Ray GPU 数量不足,要求 train_workers={num_workers}, " + f"实际 GPU={gpu_count}" + ) + + temp_dir = tempfile.mkdtemp(prefix="ray_verify_train_") + + def train_loop_per_worker(config: dict[str, Any] | None = None) -> None: + import os + import tempfile + import torch + + from ray import train + from ray.train import Checkpoint + + ctx = train.get_context() + world_rank = ctx.get_world_rank() + + x = torch.tensor([[0.0], [1.0], [2.0], [3.0]]) + y = 2.0 * x + 1.0 + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + model = torch.nn.Linear(1, 1).to(device) + optimizer = torch.optim.SGD(model.parameters(), lr=0.05) + loss_fn = torch.nn.MSELoss() + + x = x.to(device) + y = y.to(device) + + last_loss = None + + for _ in range(50): + pred = model(x) + loss = loss_fn(pred, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + last_loss = float(loss.detach().cpu().item()) + + metrics = { + "loss": last_loss, + "device": str(device), + "world_rank": world_rank, + } + + # Ray Train V2 对 metrics 的持久化更依赖 checkpoint。 + # rank 0 保存 checkpoint,并把 loss 写进 checkpoint 文件,外部可稳定读取。 + if world_rank == 0: + with tempfile.TemporaryDirectory() as checkpoint_dir: + checkpoint_path = os.path.join(checkpoint_dir, "state.pt") + torch.save(metrics, checkpoint_path) + + train.report( + metrics, + checkpoint=Checkpoint.from_directory(checkpoint_dir), + ) + else: + train.report(metrics) + + trainer = TorchTrainer( + train_loop_per_worker=train_loop_per_worker, + scaling_config=ScalingConfig( + num_workers=num_workers, + use_gpu=use_gpu, + ), + run_config=RunConfig( + name="ray_verify_train", + storage_path=temp_dir, + ), + ) + + result = trainer.fit() + + metrics = getattr(result, "metrics", {}) or {} + loss = metrics.get("loss") + device = metrics.get("device") + + # 某些 Ray Train V2 场景下 result.metrics 可能为空; + # 因此从 checkpoint 中兜底读取 loss。 + if loss is None: + checkpoint = getattr(result, "checkpoint", None) + + if checkpoint is not None: + with checkpoint.as_directory() as checkpoint_dir: + checkpoint_path = Path(checkpoint_dir) / "state.pt" + + if checkpoint_path.exists(): + payload = torch.load( + checkpoint_path, + map_location="cpu", + weights_only=False, + ) + + if isinstance(payload, dict): + loss = payload.get("loss") + device = payload.get("device") + + if loss is None: + raise RuntimeError( + f"Train 结果中没有 loss,metrics={metrics}, " + f"checkpoint={getattr(result, 'checkpoint', None)}" + ) + + if float(loss) > 1.0: + raise RuntimeError(f"训练 loss 偏高,loss={loss}") + + return ( + f"✓ 通过|| Ray Train TorchTrainer 正常,workers={num_workers}, " + f"use_gpu={use_gpu}, device={device}, loss={float(loss):.6f}, " + f"path={temp_dir}" + ) + + results.append(run_test("Ray Train - TorchTrainer", test_ray_train_torch)) + + # ------------------------------------------------------------------------- + # Ray Serve + # ------------------------------------------------------------------------- + + if not args.skip_serve: + + def test_ray_serve() -> str: + if not module_exists("ray.serve"): + raise SkipTest("未安装 Ray Serve,请安装 ray[serve] 或 ray[all]") + + from ray import serve + + try: + serve.shutdown() + except Exception: + pass + + @serve.deployment(ray_actor_options={"num_cpus": 0}) + class EchoDeployment: + async def __call__(self, value: str = "hello") -> str: + return f"serve:{value}" + + handle = serve.run( + EchoDeployment.bind(), + name="ray_verify_serve_app", + route_prefix="/ray-verify", + ) + + # 新版 Ray Serve 的 handle.remote() 返回 DeploymentResponse, + # 不是普通 ObjectRef,因此不能 ray.get(handle.remote(...))。 + response = handle.remote("ok") + result = response.result(timeout_s=30) + + try: + serve.shutdown() + except Exception: + pass + + assert result == "serve:ok", f"Serve 返回不符合预期: {result}" + + return "✓ 通过|| Ray Serve 正常,DeploymentResponse.result() 调用成功" + + results.append(run_test("Ray Serve", test_ray_serve)) + + # ------------------------------------------------------------------------- + # RLlib + # ------------------------------------------------------------------------- + + if not args.skip_rllib: + + def test_rllib() -> str: + if not args.full: + raise SkipTest("RLlib 测试较重,使用 --full 开启") + + if not module_exists("ray.rllib"): + raise SkipTest("未安装 RLlib,请安装 ray[rllib] 或 ray[all]") + + if not module_exists("gymnasium"): + raise SkipTest("未安装 gymnasium,RLlib CartPole 测试跳过") + + if not module_exists("torch"): + raise SkipTest("未安装 torch,RLlib PPO torch 测试跳过") + + from ray.rllib.algorithms.ppo import PPOConfig + + config = PPOConfig() + config = config.environment("CartPole-v1") + + if hasattr(config, "framework"): + config = config.framework("torch") + + # Ray 2.55 默认使用新 API stack。 + # 新版本用 env_runners;旧版本用 rollouts。 + if hasattr(config, "env_runners"): + config = config.env_runners(num_env_runners=0) + else: + config = config.rollouts(num_rollout_workers=0) + + # 兼容新旧训练参数命名。 + try: + config = config.training( + train_batch_size_per_learner=64, + minibatch_size=32, + num_epochs=1, + lr=1e-3, + ) + except TypeError: + config = config.training( + train_batch_size=64, + sgd_minibatch_size=32, + num_sgd_iter=1, + lr=1e-3, + ) + + if hasattr(config, "build_algo"): + algo = config.build_algo() + else: + algo = config.build() + + try: + train_result = algo.train() + finally: + algo.stop() + + episode_reward_mean = train_result.get("episode_reward_mean") + + return ( + "✓ 通过|| RLlib PPO 正常完成一次训练," + f"episode_reward_mean={episode_reward_mean}" + ) + + results.append(run_test("RLlib", test_rllib)) + + # ------------------------------------------------------------------------- + # Summary + # ------------------------------------------------------------------------- + + print("\n" + "=" * 80) + print("验证结果汇总") + print("=" * 80) + + status_counts = { + "PASS": sum(1 for r in results if r.status == "PASS"), + "SKIP": sum(1 for r in results if r.status == "SKIP"), + "FAIL": sum(1 for r in results if r.status == "FAIL"), + } + + for r in results: + print(f"{r.status:4} | {r.seconds:7.2f}s | {r.name} | {r.message}") + + print_json("Status counts:", status_counts) + + try: + ray.shutdown() + except Exception: + pass + + if status_counts["FAIL"] > 0: + print("\n✗ 失败|| 最终结果:FAILED") + sys.exit(1) + + print("\n✓ 通过|| 最终结果:PASSED") + sys.exit(0) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/frameworks/Ray/2.54.1/test_result.png b/frameworks/Ray/2.54.1/test_result.png new file mode 100644 index 0000000000000000000000000000000000000000..79d62ee63e66dc5feee9dab21ce147449c48f422 Binary files /dev/null and b/frameworks/Ray/2.54.1/test_result.png differ