From ac4ab046f2dd1b2410bfc4694839313fb7386565 Mon Sep 17 00:00:00 2001
From: Anstarc <anstarc@example.com>
Date: Fri, 8 May 2026 13:59:07 +0800
Subject: [PATCH 1/6] feat: add DeepSpeed container images for OC9

---
 frameworks/deepspeed/0.18.4/Dockerfile |  54 +++
 frameworks/deepspeed/0.18.4/README.md  | 199 ++++++++
 frameworks/deepspeed/0.18.4/build.conf |   4 +
 frameworks/deepspeed/0.18.4/test.sh    | 602 +++++++++++++++++++++++++
 frameworks/deepspeed/0.18.5/Dockerfile |  54 +++
 frameworks/deepspeed/0.18.5/README.md  | 199 ++++++++
 frameworks/deepspeed/0.18.5/build.conf |   4 +
 frameworks/deepspeed/0.18.5/test.sh    | 602 +++++++++++++++++++++++++
 frameworks/deepspeed/0.18.6/Dockerfile |  54 +++
 frameworks/deepspeed/0.18.6/README.md  | 199 ++++++++
 frameworks/deepspeed/0.18.6/build.conf |   4 +
 frameworks/deepspeed/0.18.6/test.sh    | 602 +++++++++++++++++++++++++
 frameworks/deepspeed/0.18.7/Dockerfile |  54 +++
 frameworks/deepspeed/0.18.7/README.md  | 199 ++++++++
 frameworks/deepspeed/0.18.7/build.conf |   4 +
 frameworks/deepspeed/0.18.7/test.sh    | 602 +++++++++++++++++++++++++
 frameworks/deepspeed/0.18.8/Dockerfile |  54 +++
 frameworks/deepspeed/0.18.8/README.md  | 199 ++++++++
 frameworks/deepspeed/0.18.8/build.conf |   4 +
 frameworks/deepspeed/0.18.8/test.sh    | 602 +++++++++++++++++++++++++
 frameworks/deepspeed/0.18.9/Dockerfile |  54 +++
 frameworks/deepspeed/0.18.9/README.md  | 199 ++++++++
 frameworks/deepspeed/0.18.9/build.conf |   4 +
 frameworks/deepspeed/0.18.9/test.sh    | 602 +++++++++++++++++++++++++
 24 files changed, 5154 insertions(+)
 create mode 100644 frameworks/deepspeed/0.18.4/Dockerfile
 create mode 100644 frameworks/deepspeed/0.18.4/README.md
 create mode 100644 frameworks/deepspeed/0.18.4/build.conf
 create mode 100644 frameworks/deepspeed/0.18.4/test.sh
 create mode 100644 frameworks/deepspeed/0.18.5/Dockerfile
 create mode 100644 frameworks/deepspeed/0.18.5/README.md
 create mode 100644 frameworks/deepspeed/0.18.5/build.conf
 create mode 100644 frameworks/deepspeed/0.18.5/test.sh
 create mode 100644 frameworks/deepspeed/0.18.6/Dockerfile
 create mode 100644 frameworks/deepspeed/0.18.6/README.md
 create mode 100644 frameworks/deepspeed/0.18.6/build.conf
 create mode 100644 frameworks/deepspeed/0.18.6/test.sh
 create mode 100644 frameworks/deepspeed/0.18.7/Dockerfile
 create mode 100644 frameworks/deepspeed/0.18.7/README.md
 create mode 100644 frameworks/deepspeed/0.18.7/build.conf
 create mode 100644 frameworks/deepspeed/0.18.7/test.sh
 create mode 100644 frameworks/deepspeed/0.18.8/Dockerfile
 create mode 100644 frameworks/deepspeed/0.18.8/README.md
 create mode 100644 frameworks/deepspeed/0.18.8/build.conf
 create mode 100644 frameworks/deepspeed/0.18.8/test.sh
 create mode 100644 frameworks/deepspeed/0.18.9/Dockerfile
 create mode 100644 frameworks/deepspeed/0.18.9/README.md
 create mode 100644 frameworks/deepspeed/0.18.9/build.conf
 create mode 100644 frameworks/deepspeed/0.18.9/test.sh

diff --git a/frameworks/deepspeed/0.18.4/Dockerfile b/frameworks/deepspeed/0.18.4/Dockerfile
new file mode 100644
index 0000000..b4d385f
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/Dockerfile
@@ -0,0 +1,54 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc <anstarc@vip.qq.com>"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.4 GPU on OpenCloudOS 9"
+
+# System dependencies; the dnf cache is removed in the same layer that created it.
+RUN dnf install -y \
+    git \
+    openmpi \
+    openmpi-devel \
+    python3.11 \
+    python3.11-pip \
+    wget \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf
+
+# OpenMPI paths. The inherited LD_LIBRARY_PATH is appended only when set: an empty entry means "current directory".
+ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+# Make `python` resolve to python3.11 (-f replaces an existing link instead of failing).
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory
+WORKDIR /app
+
+# PyTorch wheels built against CUDA 12.8
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Python packages installed ahead of the DeepSpeed build
+RUN pip3.11 install --no-cache-dir \
+    transformers \
+    datasets \
+    evaluate \
+    ninja \
+    psutil \
+    py-cpuinfo \
+    pydantic \
+    hjson \
+    mpi4py
+
+# Build and install DeepSpeed; --no-build-isolation lets the build use the torch installed above.
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir --no-build-isolation deepspeed==0.18.4
+
+# GPU runtime environment
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV CUDA_MODULE_LOADING=LAZY
+
+# Default command: import DeepSpeed and print its version
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.4/README.md b/frameworks/deepspeed/0.18.4/README.md
new file mode 100644
index 0000000..9c6594f
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.4 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库，提供 ZeRO 内存优化、混合精度训练、流水线并行等功能，支持超大规模模型训练。
+
+## 基本信息
+
+- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**: 3.11
+- **CUDA 版本**: 12.8
+- **DeepSpeed 版本**: 0.18.4
+- **PyTorch 版本**: 最新稳定版（CUDA 12.8）
+- **MPI**: OpenMPI（支持多节点训练）
+
+## 构建
+
+```bash
+docker build -t oc9-deepspeed:0.18.4 .
+```
+
+## 测试
+
+```bash
+./test.sh oc9-deepspeed:0.18.4
+```
+
+测试项包括：
+- Python 和 CUDA 环境
+- DeepSpeed 版本验证
+- GPU 可用性检查
+- DeepSpeed 初始化
+- CNN 模型训练（CIFAR-10 风格）
+- ZeRO Stage 1/2 优化
+- FP16 混合精度训练
+
+## 使用示例
+
+### 基本使用
+
+```bash
+# 查看版本信息
+docker run --rm oc9-deepspeed:0.18.4 python3.11 -c "import deepspeed; print(deepspeed.__version__)"
+
+# 查看可用设备
+docker run --rm --gpus all oc9-deepspeed:0.18.4 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+```
+
+### 单 GPU 训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 定义模型
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练循环
+for batch in dataloader:
+    inputs, labels = batch
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练（单节点）
+
+```bash
+# 使用 deepspeed launcher（镜像默认工作目录为 /app，需用 -w 切换到挂载目录才能找到 train.py）
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.4 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.4 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 micro_batch(4) × 梯度累积步数(2) × GPU 数，此配置对应 4 张 GPU
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.4 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化，支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+，并安装 NVIDIA Container Toolkit（`--gpus` 参数所需；旧的 nvidia-docker2 已弃用）
+- **MPI**: 已包含 OpenMPI（多节点训练需要）
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.4/build.conf b/frameworks/deepspeed/0.18.4/build.conf
new file mode 100644
index 0000000..2c23f5b
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/build.conf
@@ -0,0 +1,4 @@
+# DeepSpeed 0.18.4 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.4
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.4/test.sh b/frameworks/deepspeed/0.18.4/test.sh
new file mode 100644
index 0000000..a6f7c81
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+set -e
+
+IMAGE="${1:-oc9-deepspeed:0.18.4}"
+
+echo "=========================================="
+echo "测试 DeepSpeed 0.18.4 GPU 镜像"
+echo "镜像: $IMAGE"
+echo "=========================================="
+
+# Test 1: the Python 3.11 interpreter runs inside the image.
+echo -n "测试 1: Python 环境... "
+# Run the command as the `if` condition: under `set -e` a separate `$?` check is never reached on failure.
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version
+    exit 1
+fi
+
+# Test 2: DeepSpeed version. `|| true` stops `set -e` from aborting before the mismatch is reported.
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") || true
+if [ "$VERSION" = "0.18.4" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.4, 实际: $VERSION)"
+    exit 1
+fi
+
+# 测试 3: 核心依赖
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers"
+    exit 1
+fi
+
+# Test 4: CUDA is available and the first GPU can be queried.
+echo -n "测试 4: CUDA 环境... "
+# The assignment is the `if` condition so that a failure reaches the else branch despite `set -e`.
+if GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+"); then
+    echo "✓"
+    echo "         GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: basic DeepSpeed import and accelerator lookup.
+echo -n "测试 5: DeepSpeed 基础导入... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# 验证核心模块可用
+assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found'
+assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found'
+
+# 验证加速器
+accelerator = get_accelerator()
+assert accelerator is not None, 'Accelerator not available'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    # Re-run a reduced script with output visible; reachable because the test runs as the `if` condition.
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+print(f'DeepSpeed version: {deepspeed.__version__}')
+accelerator = get_accelerator()
+print(f'Accelerator: {accelerator}')
+"
+    exit 1
+fi
+
+# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
+echo -n "测试 6: 简单 CNN 模型初始化... "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 创建简单 CNN 模型（类似 CIFAR-10 示例）
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 4,
+    'gradient_accumulation_steps': 1,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+assert model_engine is not None, 'Model engine initialization failed'
+" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.fc = nn.Linear(10, 2)
+    def forward(self, x):
+        return self.fc(x)
+
+model = SimpleNet()
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('Model initialized successfully')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# 测试 7: 训练步骤（前向+反向+优化）
+echo -n "测试 7: 训练步骤... "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 模拟训练步骤
+for step in range(3):
+    # 创建假数据 (batch_size=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(10, 2)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+inputs = torch.randn(4, 10).to(model_engine.device)
+labels = torch.randint(0, 2, (4,)).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = torch.nn.functional.cross_entropy(outputs, labels)
+model_engine.backward(loss)
+model_engine.step()
+print('Training step completed')
+"
+    exit 1
+fi
+
+# Test 8: one training step with ZeRO stage 1.
+echo -n "测试 8: ZeRO Stage 1 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 1 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 1,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 简单训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    # Re-run a reduced script with output visible; reachable because the test runs as the `if` condition.
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 1}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 1 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# Test 9: one training step with ZeRO stage 2.
+echo -n "测试 9: ZeRO Stage 2 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 2 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'allgather_bucket_size': 50000000,
+        'reduce_bucket_size': 50000000,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    # Re-run a reduced script with output visible; reachable because the test runs as the `if` condition.
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 2}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 2 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# Test 10: Conv2d model trained for three steps with FP16 mixed precision.
+echo -n "测试 10: Conv2d + FP16 混合精度训练... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# CIFAR-10 风格的 CNN 模型
+class CifarNet(nn.Module):
+    def __init__(self):
+        super(CifarNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = CifarNet()
+
+# FP16 配置（参考 DeepSpeed CIFAR 示例）
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'loss_scale_window': 500,
+        'hysteresis': 2,
+        'min_loss_scale': 1,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型（关键！参考官方 CIFAR 示例）
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype = torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype = torch.half
+
+# 验证 FP16 已启用
+assert model_engine.fp16_enabled(), 'FP16 not enabled'
+assert target_dtype == torch.half, 'target_dtype should be torch.half'
+
+# 训练循环
+model_engine.train()
+
+for step in range(3):
+    # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 关键：手动转换输入数据类型为 FP16（参考官方示例）
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    # Re-run a reduced script with output visible; reachable because the test runs as the `if` condition.
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# 简化的 CNN 模型用于调试
+class SimpleConv(nn.Module):
+    def __init__(self):
+        super(SimpleConv, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.fc1 = nn.Linear(6 * 28 * 28, 10)
+    
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = x.view(-1, 6 * 28 * 28)
+        x = self.fc1(x)
+        return x
+
+model = SimpleConv()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    
+    # 获取目标数据类型
+    target_dtype = None
+    if model_engine.fp16_enabled():
+        target_dtype = torch.half
+    
+    model_engine.train()
+    
+    # 训练步骤
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+    
+    print(f'FP16 enabled: {model_engine.fp16_enabled()}')
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()
+"
+    exit 1
+fi
+
+echo ""
+echo "=========================================="
+echo "所有测试通过！"
+echo "=========================================="
diff --git a/frameworks/deepspeed/0.18.5/Dockerfile b/frameworks/deepspeed/0.18.5/Dockerfile
new file mode 100644
index 0000000..a215d56
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/Dockerfile
@@ -0,0 +1,54 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc <anstarc@vip.qq.com>"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.5 GPU on OpenCloudOS 9"
+
+# System dependencies; the dnf cache is removed in the same layer that created it.
+RUN dnf install -y \
+    git \
+    openmpi \
+    openmpi-devel \
+    python3.11 \
+    python3.11-pip \
+    wget \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf
+
+# OpenMPI paths. The inherited LD_LIBRARY_PATH is appended only when set: an empty entry means "current directory".
+ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+# Make `python` resolve to python3.11 (-f replaces an existing link instead of failing).
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory
+WORKDIR /app
+
+# PyTorch wheels built against CUDA 12.8
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Python packages installed ahead of the DeepSpeed build
+RUN pip3.11 install --no-cache-dir \
+    transformers \
+    datasets \
+    evaluate \
+    ninja \
+    psutil \
+    py-cpuinfo \
+    pydantic \
+    hjson \
+    mpi4py
+
+# Build and install DeepSpeed; --no-build-isolation lets the build use the torch installed above.
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir --no-build-isolation deepspeed==0.18.5
+
+# GPU runtime environment
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV CUDA_MODULE_LOADING=LAZY
+
+# Default command: import DeepSpeed and print its version
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.5/README.md b/frameworks/deepspeed/0.18.5/README.md
new file mode 100644
index 0000000..5a4760c
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.5 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库，提供 ZeRO 内存优化、混合精度训练、流水线并行等功能，支持超大规模模型训练。
+
+## 基本信息
+
+- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**: 3.11
+- **CUDA 版本**: 12.8
+- **DeepSpeed 版本**: 0.18.5
+- **PyTorch 版本**: 最新稳定版（CUDA 12.8）
+- **MPI**: OpenMPI（支持多节点训练）
+
+## 构建
+
+```bash
+docker build -t oc9-deepspeed:0.18.5 .
+```
+
+## 测试
+
+```bash
+./test.sh oc9-deepspeed:0.18.5
+```
+
+测试项包括：
+- Python 和 CUDA 环境
+- DeepSpeed 版本验证
+- GPU 可用性检查
+- DeepSpeed 初始化
+- CNN 模型训练（CIFAR-10 风格）
+- ZeRO Stage 1/2 优化
+- FP16 混合精度训练
+
+## 使用示例
+
+### 基本使用
+
+```bash
+# 查看版本信息
+docker run --rm oc9-deepspeed:0.18.5 python3.11 -c "import deepspeed; print(deepspeed.__version__)"
+
+# 查看可用设备
+docker run --rm --gpus all oc9-deepspeed:0.18.5 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+```
+
+### 单 GPU 训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 定义模型
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练循环
+for batch in dataloader:
+    inputs, labels = batch
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练（单节点）
+
+```bash
+# 使用 deepspeed launcher（镜像默认工作目录为 /app，需用 -w 切换到挂载目录才能找到 train.py）
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.5 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.5 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 micro_batch(4) × 梯度累积步数(2) × GPU 数，此配置对应 4 张 GPU
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.5 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化，支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+，并安装 NVIDIA Container Toolkit（`--gpus` 参数所需；旧的 nvidia-docker2 已弃用）
+- **MPI**: 已包含 OpenMPI（多节点训练需要）
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.5/build.conf b/frameworks/deepspeed/0.18.5/build.conf
new file mode 100644
index 0000000..c52c734
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/build.conf
@@ -0,0 +1,4 @@
+# DeepSpeed 0.18.5 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.5
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.5/test.sh b/frameworks/deepspeed/0.18.5/test.sh
new file mode 100644
index 0000000..b2cde17
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+set -e  # abort on any failure that is not explicitly checked
+IMAGE="${1:-oc9-deepspeed:0.18.5}"  # image under test; the first CLI argument overrides the default
+cat <<EOF
+==========================================
+测试 DeepSpeed 0.18.5 GPU 镜像
+镜像: $IMAGE
+==========================================
+EOF
+
+# Test 1: the Python 3.11 interpreter is present in the image
+echo -n "测试 1: Python 环境... "
+# The probe is the `if` condition: under `set -e` a bare failing command would abort before the diagnostics below.
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version || true
+    exit 1
+fi
+
+# 测试 2: DeepSpeed 版本
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
+if [ "$VERSION" = "0.18.5" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.5, 实际: $VERSION)"
+    exit 1
+fi
+
+# 测试 3: 核心依赖
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers"
+    exit 1
+fi
+
+# Test 4: CUDA is usable inside the container and a GPU name can be read
+echo -n "测试 4: CUDA 环境... "
+# The assignment is the `if` condition: under `set -e` a failing command substitution would abort before the ✗ report.
+if GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+"); then
+    echo "✓"
+    echo "         GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: DeepSpeed basic import. Output is captured in the `if` condition so `set -e` cannot abort before it is shown.
+echo -n "测试 5: DeepSpeed 基础导入... "
+if OUTPUT=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# 验证核心模块可用
+assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found'
+assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found'
+
+# 验证加速器
+accelerator = get_accelerator()
+assert accelerator is not None, 'Accelerator not available'
+" 2>&1); then
+    echo "✓"
+else
+    echo "✗"
+    printf '%s\n' "$OUTPUT"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+print(f'DeepSpeed version: {deepspeed.__version__}')
+accelerator = get_accelerator()
+print(f'Accelerator: {accelerator}')
+" || true
+    exit 1
+fi
+
+# Test 6: small CNN + deepspeed.initialize. Output is captured in the `if` condition so `set -e` cannot abort before it is shown.
+echo -n "测试 6: 简单 CNN 模型初始化... "
+if OUTPUT=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 创建简单 CNN 模型（类似 CIFAR-10 示例）
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 4,
+    'gradient_accumulation_steps': 1,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+assert model_engine is not None, 'Model engine initialization failed'
+" 2>&1); then
+    echo "✓"
+else
+    echo "✗"
+    printf '%s\n' "$OUTPUT"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.fc = nn.Linear(10, 2)
+    def forward(self, x):
+        return self.fc(x)
+
+model = SimpleNet()
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('Model initialized successfully')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 7: forward + backward + optimizer step. Output is captured in the `if` condition so `set -e` cannot abort before it is shown.
+echo -n "测试 7: 训练步骤... "
+if OUTPUT=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 模拟训练步骤
+for step in range(3):
+    # 创建假数据 (batch_size=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+" 2>&1); then
+    echo "✓"
+else
+    echo "✗"
+    printf '%s\n' "$OUTPUT"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(10, 2)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+inputs = torch.randn(4, 10).to(model_engine.device)
+labels = torch.randint(0, 2, (4,)).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = torch.nn.functional.cross_entropy(outputs, labels)
+model_engine.backward(loss)
+model_engine.step()
+print('Training step completed')
+" || true
+    exit 1
+fi
+
+# Test 8: ZeRO Stage 1 training step. Output is captured in the `if` condition so `set -e` cannot abort before it is shown.
+echo -n "测试 8: ZeRO Stage 1 优化... "
+if OUTPUT=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 1 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 1,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 简单训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" 2>&1); then
+    echo "✓"
+else
+    echo "✗"
+    printf '%s\n' "$OUTPUT"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 1}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 1 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 9: ZeRO Stage 2 training step. Output is captured in the `if` condition so `set -e` cannot abort before it is shown.
+echo -n "测试 9: ZeRO Stage 2 优化... "
+if OUTPUT=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 2 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'allgather_bucket_size': 50000000,
+        'reduce_bucket_size': 50000000,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" 2>&1); then
+    echo "✓"
+else
+    echo "✗"
+    printf '%s\n' "$OUTPUT"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 2}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 2 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 10: Conv2d + FP16 mixed-precision training. Output is captured in the `if` condition so `set -e` cannot abort before it is shown.
+echo -n "测试 10: Conv2d + FP16 混合精度训练... "
+if OUTPUT=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# CIFAR-10 风格的 CNN 模型
+class CifarNet(nn.Module):
+    def __init__(self):
+        super(CifarNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = CifarNet()
+
+# FP16 配置（参考 DeepSpeed CIFAR 示例）
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'loss_scale_window': 500,
+        'hysteresis': 2,
+        'min_loss_scale': 1,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型（关键！参考官方 CIFAR 示例）
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype = torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype = torch.half
+
+# 验证 FP16 已启用
+assert model_engine.fp16_enabled(), 'FP16 not enabled'
+assert target_dtype == torch.half, 'target_dtype should be torch.half'
+
+# 训练循环
+model_engine.train()
+
+for step in range(3):
+    # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 关键：手动转换输入数据类型为 FP16（参考官方示例）
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+
+" 2>&1); then
+    echo "✓"
+else
+    echo "✗"
+    printf '%s\n' "$OUTPUT"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# 简化的 CNN 模型用于调试
+class SimpleConv(nn.Module):
+    def __init__(self):
+        super(SimpleConv, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.fc1 = nn.Linear(6 * 28 * 28, 10)
+    
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = x.view(-1, 6 * 28 * 28)
+        x = self.fc1(x)
+        return x
+
+model = SimpleConv()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    
+    # 获取目标数据类型
+    target_dtype = None
+    if model_engine.fp16_enabled():
+        target_dtype = torch.half
+    
+    model_engine.train()
+    
+    # 训练步骤
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+    
+    print(f'FP16 enabled: {model_engine.fp16_enabled()}')
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()
+" || true
+    exit 1
+fi
+
+printf '%s\n' \
+    "" "==========================================" \
+    "所有测试通过！" \
+    "=========================================="
diff --git a/frameworks/deepspeed/0.18.6/Dockerfile b/frameworks/deepspeed/0.18.6/Dockerfile
new file mode 100644
index 0000000..8905449
--- /dev/null
+++ b/frameworks/deepspeed/0.18.6/Dockerfile
@@ -0,0 +1,54 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc <anstarc@vip.qq.com>"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.6 GPU on OpenCloudOS 9"
+
+# System packages: Python 3.11 + pip, git/wget, and OpenMPI (runtime and headers); dnf caches are removed in the same layer
+RUN dnf install -y \
+    python3.11 \
+    python3.11-pip \
+    git \
+    wget \
+    openmpi \
+    openmpi-devel \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf/*
+
+# Put the OpenMPI binaries and libraries on the search paths; ${VAR:+...} avoids a trailing ':' when the base leaves it unset
+ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory for commands run in the container
+WORKDIR /app
+
+# PyTorch from the CUDA 12.8 wheel index (version unpinned: resolves to the newest release on that index)
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Python packages installed ahead of the DeepSpeed build
+RUN pip3.11 install --no-cache-dir \
+    transformers \
+    datasets \
+    evaluate \
+    ninja \
+    psutil \
+    py-cpuinfo \
+    pydantic \
+    hjson \
+    mpi4py
+
+# Install DeepSpeed with DS_BUILD_OPS=1; --no-build-isolation makes the build use the torch installed above
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir --no-build-isolation deepspeed==0.18.6
+
+# GPU-related runtime environment variables
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV CUDA_MODULE_LOADING=LAZY
+
+# Default command: print the installed DeepSpeed version
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.6/README.md b/frameworks/deepspeed/0.18.6/README.md
new file mode 100644
index 0000000..09b1047
--- /dev/null
+++ b/frameworks/deepspeed/0.18.6/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.6 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库，提供 ZeRO 内存优化、混合精度训练、流水线并行等功能，支持超大规模模型训练。
+
+## 基本信息
+
+- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**: 3.11
+- **CUDA 版本**: 12.8
+- **DeepSpeed 版本**: 0.18.6
+- **PyTorch 版本**: 最新稳定版（CUDA 12.8）
+- **MPI**: OpenMPI（支持多节点训练）
+
+## 构建
+
+```bash
+docker build -t oc9-deepspeed:0.18.6 .
+```
+
+## 测试
+
+```bash
+./test.sh oc9-deepspeed:0.18.6
+```
+
+测试项包括：
+- Python 和 CUDA 环境
+- DeepSpeed 版本验证
+- GPU 可用性检查
+- DeepSpeed 初始化
+- CNN 模型训练（CIFAR-10 风格）
+- ZeRO Stage 1/2 优化
+- FP16 混合精度训练
+
+## 使用示例
+
+### 基本使用
+
+```bash
+# 查看版本信息
+docker run --rm oc9-deepspeed:0.18.6 python3.11 -c "import deepspeed; print(deepspeed.__version__)"
+
+# 查看可用设备
+docker run --rm --gpus all oc9-deepspeed:0.18.6 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+```
+
+### 单 GPU 训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 定义模型
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练循环（dataloader 与 criterion 需自行定义）
+for batch in dataloader:
+    inputs, labels = (t.to(model_engine.device) for t in batch)
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练（单节点）
+
+```bash
+# 使用 deepspeed launcher
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.6 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.6 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 4 (micro batch) × 2 (梯度累积步数) × GPU 数，此配置对应 4 卡
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.6 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化，支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+，并安装 NVIDIA Container Toolkit（`--gpus` 参数需要；旧的 nvidia-docker2 已弃用）
+- **MPI**: 已包含 OpenMPI（多节点训练需要）
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.6/build.conf b/frameworks/deepspeed/0.18.6/build.conf
new file mode 100644
index 0000000..205926f
--- /dev/null
+++ b/frameworks/deepspeed/0.18.6/build.conf
@@ -0,0 +1,4 @@
+# Build configuration for the DeepSpeed 0.18.6 image on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.6
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.6/test.sh b/frameworks/deepspeed/0.18.6/test.sh
new file mode 100644
index 0000000..c2a69d6
--- /dev/null
+++ b/frameworks/deepspeed/0.18.6/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+set -e  # abort on any failure that is not explicitly checked
+IMAGE="${1:-oc9-deepspeed:0.18.6}"  # image under test; the first CLI argument overrides the default
+cat <<EOF
+==========================================
+测试 DeepSpeed 0.18.6 GPU 镜像
+镜像: $IMAGE
+==========================================
+EOF
+
+# Test 1: the Python 3.11 interpreter is present in the image
+echo -n "测试 1: Python 环境... "
+# The probe is the `if` condition: under `set -e` a bare failing command would abort before the diagnostics below.
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version || true
+    exit 1
+fi
+
+# 测试 2: DeepSpeed 版本
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
+if [ "$VERSION" = "0.18.6" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.6, 实际: $VERSION)"
+    exit 1
+fi
+
+# 测试 3: 核心依赖
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers"
+    exit 1
+fi
+
+# Test 4: CUDA is usable inside the container and a GPU name can be read
+echo -n "测试 4: CUDA 环境... "
+# The assignment is the `if` condition: under `set -e` a failing command substitution would abort before the ✗ report.
+if GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+"); then
+    echo "✓"
+    echo "         GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: DeepSpeed basic import. Output is captured in the `if` condition so `set -e` cannot abort before it is shown.
+echo -n "测试 5: DeepSpeed 基础导入... "
+if OUTPUT=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# 验证核心模块可用
+assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found'
+assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found'
+
+# 验证加速器
+accelerator = get_accelerator()
+assert accelerator is not None, 'Accelerator not available'
+" 2>&1); then
+    echo "✓"
+else
+    echo "✗"
+    printf '%s\n' "$OUTPUT"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+print(f'DeepSpeed version: {deepspeed.__version__}')
+accelerator = get_accelerator()
+print(f'Accelerator: {accelerator}')
+" || true
+    exit 1
+fi
+
+# Test 6: small CNN + deepspeed.initialize. Output is captured in the `if` condition so `set -e` cannot abort before it is shown.
+echo -n "测试 6: 简单 CNN 模型初始化... "
+if OUTPUT=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 创建简单 CNN 模型（类似 CIFAR-10 示例）
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 4,
+    'gradient_accumulation_steps': 1,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+assert model_engine is not None, 'Model engine initialization failed'
+" 2>&1); then
+    echo "✓"
+else
+    echo "✗"
+    printf '%s\n' "$OUTPUT"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.fc = nn.Linear(10, 2)
+    def forward(self, x):
+        return self.fc(x)
+
+model = SimpleNet()
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('Model initialized successfully')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 7: forward + backward + optimizer step. Output is captured in the `if` condition so `set -e` cannot abort before it is shown.
+echo -n "测试 7: 训练步骤... "
+if OUTPUT=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 模拟训练步骤
+for step in range(3):
+    # 创建假数据 (batch_size=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+" 2>&1); then
+    echo "✓"
+else
+    echo "✗"
+    printf '%s\n' "$OUTPUT"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(10, 2)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+inputs = torch.randn(4, 10).to(model_engine.device)
+labels = torch.randint(0, 2, (4,)).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = torch.nn.functional.cross_entropy(outputs, labels)
+model_engine.backward(loss)
+model_engine.step()
+print('Training step completed')
+" || true
+    exit 1
+fi
+
+# Test 8: ZeRO Stage 1 training step. Output is captured in the `if` condition so `set -e` cannot abort before it is shown.
+echo -n "测试 8: ZeRO Stage 1 优化... "
+if OUTPUT=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 1 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 1,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 简单训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" 2>&1); then
+    echo "✓"
+else
+    echo "✗"
+    printf '%s\n' "$OUTPUT"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 1}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 1 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 9: ZeRO Stage 2 training step. Output is captured in the `if` condition so `set -e` cannot abort before it is shown.
+echo -n "测试 9: ZeRO Stage 2 优化... "
+if OUTPUT=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 2 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'allgather_bucket_size': 50000000,
+        'reduce_bucket_size': 50000000,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" 2>&1); then
+    echo "✓"
+else
+    echo "✗"
+    printf '%s\n' "$OUTPUT"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 2}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 2 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# 测试 10: Conv2d + FP16 混合精度训练
+echo -n "测试 10: Conv2d + FP16 混合精度训练... "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# CIFAR-10 风格的 CNN 模型
+class CifarNet(nn.Module):
+    def __init__(self):
+        super(CifarNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = CifarNet()
+
+# FP16 配置（参考 DeepSpeed CIFAR 示例）
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'loss_scale_window': 500,
+        'hysteresis': 2,
+        'min_loss_scale': 1,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型（关键！参考官方 CIFAR 示例）
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype = torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype = torch.half
+
+# 验证 FP16 已启用
+assert model_engine.fp16_enabled(), 'FP16 not enabled'
+assert target_dtype == torch.half, 'target_dtype should be torch.half'
+
+# 训练循环
+model_engine.train()
+
+for step in range(3):
+    # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 关键：手动转换输入数据类型为 FP16（参考官方示例）
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+
+" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# 简化的 CNN 模型用于调试
+class SimpleConv(nn.Module):
+    def __init__(self):
+        super(SimpleConv, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.fc1 = nn.Linear(6 * 28 * 28, 10)
+    
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = x.view(-1, 6 * 28 * 28)
+        x = self.fc1(x)
+        return x
+
+model = SimpleConv()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    
+    # 获取目标数据类型
+    target_dtype = None
+    if model_engine.fp16_enabled():
+        target_dtype = torch.half
+    
+    model_engine.train()
+    
+    # 训练步骤
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+    
+    print(f'FP16 enabled: {model_engine.fp16_enabled()}')
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()
+"
+    exit 1
+fi
+
+echo ""
+echo "=========================================="
+echo "所有测试通过！"
+echo "=========================================="
diff --git a/frameworks/deepspeed/0.18.7/Dockerfile b/frameworks/deepspeed/0.18.7/Dockerfile
new file mode 100644
index 0000000..af3462b
--- /dev/null
+++ b/frameworks/deepspeed/0.18.7/Dockerfile
@@ -0,0 +1,54 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc <anstarc@vip.qq.com>"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.7 GPU on OpenCloudOS 9"
+
+# System packages (sorted): Python 3.11 + pip, git, wget, OpenMPI. The dnf cache is removed in the same layer.
+RUN dnf install -y \
+    git \
+    openmpi \
+    openmpi-devel \
+    python3.11 \
+    python3.11-pip \
+    wget \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf
+
+# Expose OpenMPI. The :+ form appends the old LD_LIBRARY_PATH only when set, avoiding a trailing ':' (= current dir).
+ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+# Make `python` resolve to 3.11; -f keeps the build from failing if /usr/bin/python already exists.
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory for subsequent instructions and for containers started from this image
+WORKDIR /app
+
+# PyTorch and torchvision from the CUDA 12.8 wheel index (versions are not pinned)
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Python packages installed ahead of the DeepSpeed build (sorted)
+RUN pip3.11 install --no-cache-dir \
+    datasets \
+    evaluate \
+    hjson \
+    mpi4py \
+    ninja \
+    psutil \
+    py-cpuinfo \
+    pydantic \
+    transformers
+
+# Build DeepSpeed against the torch installed above (no build isolation); --no-cache-dir matches the other pip layers
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir deepspeed==0.18.7 --no-build-isolation
+
+# GPU-related runtime environment
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV CUDA_MODULE_LOADING=LAZY
+
+# Default command: import DeepSpeed and print its version
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.7/README.md b/frameworks/deepspeed/0.18.7/README.md
new file mode 100644
index 0000000..060c83a
--- /dev/null
+++ b/frameworks/deepspeed/0.18.7/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.7 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库，提供 ZeRO 内存优化、混合精度训练、流水线并行等功能，支持超大规模模型训练。
+
+## 基本信息
+
+- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**: 3.11
+- **CUDA 版本**: 12.8
+- **DeepSpeed 版本**: 0.18.7
+- **PyTorch 版本**: 最新稳定版（CUDA 12.8）
+- **MPI**: OpenMPI（支持多节点训练）
+
+## 构建
+
+```bash
+docker build -t oc9-deepspeed:0.18.7 .
+```
+
+## 测试
+
+```bash
+./test.sh oc9-deepspeed:0.18.7
+```
+
+测试项包括：
+- Python 和 CUDA 环境
+- DeepSpeed 版本验证
+- GPU 可用性检查
+- DeepSpeed 初始化
+- CNN 模型训练（CIFAR-10 风格）
+- ZeRO Stage 1/2 优化
+- FP16 混合精度训练
+
+## 使用示例
+
+### 基本使用
+
+```bash
+# 查看版本信息
+docker run --rm oc9-deepspeed:0.18.7 python3.11 -c "import deepspeed; print(deepspeed.__version__)"
+
+# 查看可用设备
+docker run --rm --gpus all oc9-deepspeed:0.18.7 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+```
+
+### 单 GPU 训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 定义模型
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练循环
+for batch in dataloader:
+    inputs, labels = batch
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练（单节点）
+
+```bash
+# 使用 deepspeed launcher（镜像默认工作目录为 /app，需用 -w 切换到挂载目录才能找到 train.py）
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.7 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.7 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
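+    'train_batch_size': 32,  # 必须等于 train_micro_batch_size_per_gpu × gradient_accumulation_steps × GPU 数，此处为 4 × 2 × 4（即 4 张 GPU）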
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.7 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化，支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+，并安装 NVIDIA Container Toolkit（`--gpus` 参数依赖它；旧的 nvidia-docker2 已被其取代）
+- **MPI**: 已包含 OpenMPI（多节点训练需要）
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.7/build.conf b/frameworks/deepspeed/0.18.7/build.conf
new file mode 100644
index 0000000..6cd5d09
--- /dev/null
+++ b/frameworks/deepspeed/0.18.7/build.conf
@@ -0,0 +1,4 @@
+# Build settings for the DeepSpeed 0.18.7 image on OpenCloudOS 9 (GPU): image name, image tag, GPU-test flag.
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.7
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.7/test.sh b/frameworks/deepspeed/0.18.7/test.sh
new file mode 100644
index 0000000..11a5177
--- /dev/null
+++ b/frameworks/deepspeed/0.18.7/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+# Smoke-test driver for the DeepSpeed 0.18.7 GPU image; errexit stops the run on the first failing command.
+set -o errexit
+
+# Image under test: first positional argument, falling back to the default local tag.
+IMAGE="${1:-oc9-deepspeed:0.18.7}"
+
+# Banner: printf reuses the format for each argument, so every argument becomes one output line.
+printf '%s\n' "==========================================" "测试 DeepSpeed 0.18.7 GPU 镜像" "镜像: $IMAGE" "=========================================="
+
+# Test 1: the python3.11 interpreter starts inside the image.
+echo -n "测试 1: Python 环境... "
+# Probe runs as the `if` condition so a failure reaches the ✗ branch instead of aborting the script under `set -e`.
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version
+    exit 1
+fi
+
+# Test 2: installed DeepSpeed version is 0.18.7. The pipeline keeps a failed import from tripping `set -e` before the report; `tail` drops any log lines printed on stdout ahead of the version.
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)" | tail -n 1)
+if [ "$VERSION" = "0.18.7" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.7, 实际: $VERSION)"
+    exit 1
+fi
+
+# 测试 3: 核心依赖
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers"
+    exit 1
+fi
+
+# Test 4: CUDA is visible to PyTorch; the name of device 0 is captured for the report.
+echo -n "测试 4: CUDA 环境... "
+# Assign inside the `if` condition: under `set -e` a failing bare assignment would abort before the ✗ branch.
+if GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+"); then
+    echo "✓"
+    echo "         GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: DeepSpeed imports, exposes initialize/init_distributed, and returns an accelerator object.
+echo -n "测试 5: DeepSpeed 基础导入... "
+# Probe runs as the `if` condition so a failure reaches the ✗ branch instead of aborting the script under `set -e`.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# 验证核心模块可用
+assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found'
+assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found'
+
+# 验证加速器
+accelerator = get_accelerator()
+assert accelerator is not None, 'Accelerator not available'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+print(f'DeepSpeed version: {deepspeed.__version__}')
+accelerator = get_accelerator()
+print(f'Accelerator: {accelerator}')
+"
+    exit 1
+fi
+
+# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
+echo -n "测试 6: 简单 CNN 模型初始化... "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 创建简单 CNN 模型（类似 CIFAR-10 示例）
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 4,
+    'gradient_accumulation_steps': 1,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+assert model_engine is not None, 'Model engine initialization failed'
+" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.fc = nn.Linear(10, 2)
+    def forward(self, x):
+        return self.fc(x)
+
+model = SimpleNet()
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('Model initialized successfully')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# 测试 7: 训练步骤（前向+反向+优化）
+echo -n "测试 7: 训练步骤... "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 模拟训练步骤
+for step in range(3):
+    # 创建假数据 (batch_size=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(10, 2)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+inputs = torch.randn(4, 10).to(model_engine.device)
+labels = torch.randint(0, 2, (4,)).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = torch.nn.functional.cross_entropy(outputs, labels)
+model_engine.backward(loss)
+model_engine.step()
+print('Training step completed')
+"
+    exit 1
+fi
+
+# Test 8: one training step with ZeRO stage 1 enabled.
+echo -n "测试 8: ZeRO Stage 1 优化... "
+# Probe runs as the `if` condition so a failure reaches the ✗ branch instead of aborting the script under `set -e`.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 1 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 1,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 简单训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 1}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 1 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# Test 9: one training step with ZeRO stage 2 enabled.
+echo -n "测试 9: ZeRO Stage 2 优化... "
+# Probe runs as the `if` condition so a failure reaches the ✗ branch instead of aborting the script under `set -e`.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 2 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'allgather_bucket_size': 50000000,
+        'reduce_bucket_size': 50000000,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 2}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 2 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# 测试 10: Conv2d + FP16 混合精度训练
+echo -n "测试 10: Conv2d + FP16 混合精度训练... "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# CIFAR-10 风格的 CNN 模型
+class CifarNet(nn.Module):
+    def __init__(self):
+        super(CifarNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = CifarNet()
+
+# FP16 配置（参考 DeepSpeed CIFAR 示例）
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'loss_scale_window': 500,
+        'hysteresis': 2,
+        'min_loss_scale': 1,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型（关键！参考官方 CIFAR 示例）
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype = torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype = torch.half
+
+# 验证 FP16 已启用
+assert model_engine.fp16_enabled(), 'FP16 not enabled'
+assert target_dtype == torch.half, 'target_dtype should be torch.half'
+
+# 训练循环
+model_engine.train()
+
+for step in range(3):
+    # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 关键：手动转换输入数据类型为 FP16（参考官方示例）
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+
+" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# 简化的 CNN 模型用于调试
+class SimpleConv(nn.Module):
+    def __init__(self):
+        super(SimpleConv, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.fc1 = nn.Linear(6 * 28 * 28, 10)
+    
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = x.view(-1, 6 * 28 * 28)
+        x = self.fc1(x)
+        return x
+
+model = SimpleConv()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    
+    # 获取目标数据类型
+    target_dtype = None
+    if model_engine.fp16_enabled():
+        target_dtype = torch.half
+    
+    model_engine.train()
+    
+    # 训练步骤
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+    
+    print(f'FP16 enabled: {model_engine.fp16_enabled()}')
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()
+"
+    exit 1
+fi
+
+echo ""
+echo "=========================================="
+echo "所有测试通过！"
+echo "=========================================="
diff --git a/frameworks/deepspeed/0.18.8/Dockerfile b/frameworks/deepspeed/0.18.8/Dockerfile
new file mode 100644
index 0000000..eae2db7
--- /dev/null
+++ b/frameworks/deepspeed/0.18.8/Dockerfile
@@ -0,0 +1,54 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc <anstarc@vip.qq.com>"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.8 GPU on OpenCloudOS 9"
+
+# System packages (sorted): Python 3.11 + pip, git, wget, OpenMPI. The dnf cache is removed in the same layer.
+RUN dnf install -y \
+    git \
+    openmpi \
+    openmpi-devel \
+    python3.11 \
+    python3.11-pip \
+    wget \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf
+
+# Expose OpenMPI. The :+ form appends the old LD_LIBRARY_PATH only when set, avoiding a trailing ':' (= current dir).
+ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+# Make `python` resolve to 3.11; -f keeps the build from failing if /usr/bin/python already exists.
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory for subsequent instructions and for containers started from this image
+WORKDIR /app
+
+# PyTorch and torchvision from the CUDA 12.8 wheel index (versions are not pinned)
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Python packages installed ahead of the DeepSpeed build (sorted)
+RUN pip3.11 install --no-cache-dir \
+    datasets \
+    evaluate \
+    hjson \
+    mpi4py \
+    ninja \
+    psutil \
+    py-cpuinfo \
+    pydantic \
+    transformers
+
+# Build DeepSpeed against the torch installed above (no build isolation); --no-cache-dir matches the other pip layers
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir deepspeed==0.18.8 --no-build-isolation
+
+# GPU-related runtime environment
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV CUDA_MODULE_LOADING=LAZY
+
+# Default command: import DeepSpeed and print its version
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.8/README.md b/frameworks/deepspeed/0.18.8/README.md
new file mode 100644
index 0000000..bf1f696
--- /dev/null
+++ b/frameworks/deepspeed/0.18.8/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.8 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库，提供 ZeRO 内存优化、混合精度训练、流水线并行等功能，支持超大规模模型训练。
+
+## 基本信息
+
+- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**: 3.11
+- **CUDA 版本**: 12.8
+- **DeepSpeed 版本**: 0.18.8
+- **PyTorch 版本**: 最新稳定版（CUDA 12.8）
+- **MPI**: OpenMPI（支持多节点训练）
+
+## 构建
+
+```bash
+docker build -t oc9-deepspeed:0.18.8 .
+```
+
+## 测试
+
+```bash
+./test.sh oc9-deepspeed:0.18.8
+```
+
+测试项包括：
+- Python 和 CUDA 环境
+- DeepSpeed 版本验证
+- GPU 可用性检查
+- DeepSpeed 初始化
+- CNN 模型训练（CIFAR-10 风格）
+- ZeRO Stage 1/2 优化
+- FP16 混合精度训练
+
+## 使用示例
+
+### 基本使用
+
+```bash
+# 查看版本信息
+docker run --rm oc9-deepspeed:0.18.8 python3.11 -c "import deepspeed; print(deepspeed.__version__)"
+
+# 查看可用设备
+docker run --rm --gpus all oc9-deepspeed:0.18.8 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+```
+
+### 单 GPU 训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 定义模型
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练循环
+for batch in dataloader:
+    inputs, labels = batch
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练（单节点）
+
+```bash
+# 使用 deepspeed launcher（镜像默认工作目录为 /app，需用 -w 切换到挂载目录才能找到 train.py）
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.8 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.8 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
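+    'train_batch_size': 32,  # 必须等于 train_micro_batch_size_per_gpu × gradient_accumulation_steps × GPU 数，此处为 4 × 2 × 4（即 4 张 GPU）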
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.8 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化，支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+，并安装 NVIDIA Container Toolkit（`--gpus` 参数依赖它；旧的 nvidia-docker2 已被其取代）
+- **MPI**: 已包含 OpenMPI（多节点训练需要）
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.8/build.conf b/frameworks/deepspeed/0.18.8/build.conf
new file mode 100644
index 0000000..2426ade
--- /dev/null
+++ b/frameworks/deepspeed/0.18.8/build.conf
@@ -0,0 +1,4 @@
+# Build settings for the DeepSpeed 0.18.8 image on OpenCloudOS 9 (GPU): image name, image tag, GPU-test flag.
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.8
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.8/test.sh b/frameworks/deepspeed/0.18.8/test.sh
new file mode 100644
index 0000000..ffce0af
--- /dev/null
+++ b/frameworks/deepspeed/0.18.8/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+set -e
+
+IMAGE="${1:-oc9-deepspeed:0.18.8}"
+
+echo "=========================================="
+echo "测试 DeepSpeed 0.18.8 GPU 镜像"
+echo "镜像: $IMAGE"
+echo "=========================================="
+
+# Test 1: Python environment. The command is the `if` condition itself so
+# that `set -e` cannot abort the script before the failure branch runs.
+echo -n "测试 1: Python 环境... "
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version || true
+    exit 1
+fi
+
+# Test 2: DeepSpeed version. `|| true` keeps `set -e` from aborting on a failed import so the mismatch report below is printed.
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") || true
+if [ "$VERSION" = "0.18.8" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.8, 实际: $VERSION)"
+    exit 1
+fi
+
+# Test 3: core dependencies. The import check is the `if` condition itself so
+# that `set -e` cannot abort the script before the failure branch runs.
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+if docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" || true
+    exit 1
+fi
+
+# Test 4: CUDA environment. The assignment is the `if` condition so a failed
+# run reaches the failure branch instead of tripping `set -e`.
+echo -n "测试 4: CUDA 环境... "
+if GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+"); then
+    echo "✓"
+    echo "         GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: basic DeepSpeed import. Run as the `if` condition so `set -e`
+# cannot abort the script before the diagnostic branch runs.
+echo -n "测试 5: DeepSpeed 基础导入... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# 验证核心模块可用
+assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found'
+assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found'
+
+# 验证加速器
+accelerator = get_accelerator()
+assert accelerator is not None, 'Accelerator not available'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+print(f'DeepSpeed version: {deepspeed.__version__}')
+accelerator = get_accelerator()
+print(f'Accelerator: {accelerator}')
+" || true
+    exit 1
+fi
+
+# Test 6: simple CNN model + DeepSpeed initialization. Run as the `if`
+# condition so `set -e` cannot abort before the diagnostic branch runs.
+echo -n "测试 6: 简单 CNN 模型初始化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 创建简单 CNN 模型（类似 CIFAR-10 示例）
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 4,
+    'gradient_accumulation_steps': 1,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+assert model_engine is not None, 'Model engine initialization failed'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.fc = nn.Linear(10, 2)
+    def forward(self, x):
+        return self.fc(x)
+
+model = SimpleNet()
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('Model initialized successfully')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 7: training step (forward + backward + optimizer). Run as the `if`
+# condition so `set -e` cannot abort before the diagnostic branch runs.
+echo -n "测试 7: 训练步骤... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 模拟训练步骤
+for step in range(3):
+    # 创建假数据 (batch_size=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(10, 2)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+inputs = torch.randn(4, 10).to(model_engine.device)
+labels = torch.randint(0, 2, (4,)).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = torch.nn.functional.cross_entropy(outputs, labels)
+model_engine.backward(loss)
+model_engine.step()
+print('Training step completed')
+" || true
+    exit 1
+fi
+
+# Test 8: ZeRO Stage 1 optimization. Run as the `if` condition so `set -e`
+# cannot abort the script before the diagnostic branch runs.
+echo -n "测试 8: ZeRO Stage 1 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 1 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 1,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 简单训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 1}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 1 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 9: ZeRO Stage 2 optimization. Run as the `if` condition so `set -e`
+# cannot abort the script before the diagnostic branch runs.
+echo -n "测试 9: ZeRO Stage 2 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 2 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'allgather_bucket_size': 50000000,
+        'reduce_bucket_size': 50000000,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 2}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 2 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 10: Conv2d + FP16 mixed-precision training. Run as the `if` condition
+# so `set -e` cannot abort the script before the diagnostic branch runs.
+echo -n "测试 10: Conv2d + FP16 混合精度训练... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# CIFAR-10 风格的 CNN 模型
+class CifarNet(nn.Module):
+    def __init__(self):
+        super(CifarNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = CifarNet()
+
+# FP16 配置（参考 DeepSpeed CIFAR 示例）
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'loss_scale_window': 500,
+        'hysteresis': 2,
+        'min_loss_scale': 1,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型（关键！参考官方 CIFAR 示例）
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype = torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype = torch.half
+
+# 验证 FP16 已启用
+assert model_engine.fp16_enabled(), 'FP16 not enabled'
+assert target_dtype == torch.half, 'target_dtype should be torch.half'
+
+# 训练循环
+model_engine.train()
+
+for step in range(3):
+    # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 关键：手动转换输入数据类型为 FP16（参考官方示例）
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# 简化的 CNN 模型用于调试
+class SimpleConv(nn.Module):
+    def __init__(self):
+        super(SimpleConv, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.fc1 = nn.Linear(6 * 28 * 28, 10)
+    
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = x.view(-1, 6 * 28 * 28)
+        x = self.fc1(x)
+        return x
+
+model = SimpleConv()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    
+    # 获取目标数据类型
+    target_dtype = None
+    if model_engine.fp16_enabled():
+        target_dtype = torch.half
+    
+    model_engine.train()
+    
+    # 训练步骤
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+    
+    print(f'FP16 enabled: {model_engine.fp16_enabled()}')
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()
+" || true
+    exit 1
+fi
+
+echo ""
+echo "=========================================="
+echo "所有测试通过！"
+echo "=========================================="
diff --git a/frameworks/deepspeed/0.18.9/Dockerfile b/frameworks/deepspeed/0.18.9/Dockerfile
new file mode 100644
index 0000000..2b1b7aa
--- /dev/null
+++ b/frameworks/deepspeed/0.18.9/Dockerfile
@@ -0,0 +1,54 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc <anstarc@vip.qq.com>"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.9 GPU on OpenCloudOS 9"
+
+# System dependencies (sorted alphabetically); the dnf cache is removed in the same layer.
+RUN dnf install -y \
+    git \
+    openmpi \
+    openmpi-devel \
+    python3.11 \
+    python3.11-pip \
+    wget \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf
+
+# Put OpenMPI on the search paths; the old LD_LIBRARY_PATH is appended only when non-empty (no trailing ':' entry).
+ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory
+WORKDIR /app
+
+# Install PyTorch from the CUDA 12.8 wheel index
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Install the Python packages used alongside DeepSpeed
+RUN pip3.11 install --no-cache-dir \
+    transformers \
+    datasets \
+    evaluate \
+    ninja \
+    psutil \
+    py-cpuinfo \
+    pydantic \
+    hjson \
+    mpi4py
+
+# Build and install DeepSpeed with DS_BUILD_OPS=1; --no-build-isolation makes the build use the packages installed above.
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir deepspeed==0.18.9 --no-build-isolation
+
+# GPU-related environment variables
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV CUDA_MODULE_LOADING=LAZY
+
+# Default command: import DeepSpeed and print its version
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.9/README.md b/frameworks/deepspeed/0.18.9/README.md
new file mode 100644
index 0000000..0ef05da
--- /dev/null
+++ b/frameworks/deepspeed/0.18.9/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.9 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库，提供 ZeRO 内存优化、混合精度训练、流水线并行等功能，支持超大规模模型训练。
+
+## 基本信息
+
+- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**: 3.11
+- **CUDA 版本**: 12.8
+- **DeepSpeed 版本**: 0.18.9
+- **PyTorch 版本**: 最新稳定版（CUDA 12.8）
+- **MPI**: OpenMPI（支持多节点训练）
+
+## 构建
+
+```bash
+docker build -t oc9-deepspeed:0.18.9 .
+```
+
+## 测试
+
+```bash
+./test.sh oc9-deepspeed:0.18.9
+```
+
+测试项包括：
+- Python 和 CUDA 环境
+- DeepSpeed 版本验证
+- GPU 可用性检查
+- DeepSpeed 初始化
+- CNN 模型训练（CIFAR-10 风格）
+- ZeRO Stage 1/2 优化
+- FP16 混合精度训练
+
+## 使用示例
+
+### 基本使用
+
+```bash
+# 查看版本信息
+docker run --rm oc9-deepspeed:0.18.9 python3.11 -c "import deepspeed; print(deepspeed.__version__)"
+
+# 查看可用设备
+docker run --rm --gpus all oc9-deepspeed:0.18.9 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+```
+
+### 单 GPU 训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 定义模型
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练循环
+for batch in dataloader:
+    inputs, labels = (t.to(model_engine.device) for t in batch)  # 将数据移到模型所在设备
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = (t.to(model_engine.device) for t in batch)  # 将数据移到模型所在设备
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练（单节点）
+
+```bash
+# 使用 deepspeed launcher
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.9 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun（-w /workspace 使相对路径 train.py 指向挂载的目录，镜像默认工作目录为 /app）
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.9 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 micro batch (4) × 梯度累积步数 (2) × GPU 数量（此配置对应 4 张 GPU）
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.9 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化，支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+，并安装 NVIDIA Container Toolkit（`--gpus` 选项依赖它；nvidia-docker2 已弃用）
+- **MPI**: 已包含 OpenMPI（多节点训练需要）
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.9/build.conf b/frameworks/deepspeed/0.18.9/build.conf
new file mode 100644
index 0000000..e3f3636
--- /dev/null
+++ b/frameworks/deepspeed/0.18.9/build.conf
@@ -0,0 +1,4 @@
+# DeepSpeed 0.18.9 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.9
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.9/test.sh b/frameworks/deepspeed/0.18.9/test.sh
new file mode 100644
index 0000000..cd9ee2d
--- /dev/null
+++ b/frameworks/deepspeed/0.18.9/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+set -e
+
+IMAGE="${1:-oc9-deepspeed:0.18.9}"
+
+echo "=========================================="
+echo "测试 DeepSpeed 0.18.9 GPU 镜像"
+echo "镜像: $IMAGE"
+echo "=========================================="
+
+# Test 1: Python environment. The command is the `if` condition itself so
+# that `set -e` cannot abort the script before the failure branch runs.
+echo -n "测试 1: Python 环境... "
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version || true
+    exit 1
+fi
+
+# Test 2: DeepSpeed version. `|| true` keeps `set -e` from aborting on a failed import so the mismatch report below is printed.
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") || true
+if [ "$VERSION" = "0.18.9" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.9, 实际: $VERSION)"
+    exit 1
+fi
+
+# Test 3: core dependencies. The import check is the `if` condition itself so
+# that `set -e` cannot abort the script before the failure branch runs.
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+if docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" || true
+    exit 1
+fi
+
+# Test 4: CUDA environment. The assignment is the `if` condition so a failed
+# run reaches the failure branch instead of tripping `set -e`.
+echo -n "测试 4: CUDA 环境... "
+if GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+"); then
+    echo "✓"
+    echo "         GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: basic DeepSpeed import. Run as the `if` condition so `set -e`
+# cannot abort the script before the diagnostic branch runs.
+echo -n "测试 5: DeepSpeed 基础导入... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# 验证核心模块可用
+assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found'
+assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found'
+
+# 验证加速器
+accelerator = get_accelerator()
+assert accelerator is not None, 'Accelerator not available'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+print(f'DeepSpeed version: {deepspeed.__version__}')
+accelerator = get_accelerator()
+print(f'Accelerator: {accelerator}')
+" || true
+    exit 1
+fi
+
+# Test 6: simple CNN model + DeepSpeed initialization. Run as the `if`
+# condition so `set -e` cannot abort before the diagnostic branch runs.
+echo -n "测试 6: 简单 CNN 模型初始化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 创建简单 CNN 模型（类似 CIFAR-10 示例）
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 4,
+    'gradient_accumulation_steps': 1,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+assert model_engine is not None, 'Model engine initialization failed'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.fc = nn.Linear(10, 2)
+    def forward(self, x):
+        return self.fc(x)
+
+model = SimpleNet()
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('Model initialized successfully')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 7: training step (forward + backward + optimizer). Run as the `if`
+# condition so `set -e` cannot abort before the diagnostic branch runs.
+echo -n "测试 7: 训练步骤... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 模拟训练步骤
+for step in range(3):
+    # 创建假数据 (batch_size=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(10, 2)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+inputs = torch.randn(4, 10).to(model_engine.device)
+labels = torch.randint(0, 2, (4,)).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = torch.nn.functional.cross_entropy(outputs, labels)
+model_engine.backward(loss)
+model_engine.step()
+print('Training step completed')
+" || true
+    exit 1
+fi
+
+# Test 8: ZeRO Stage 1 optimization. Run as the `if` condition so `set -e`
+# cannot abort the script before the diagnostic branch runs.
+echo -n "测试 8: ZeRO Stage 1 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 1 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 1,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 简单训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 1}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 1 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 9: ZeRO Stage 2 optimization. Run as the `if` condition so `set -e`
+# cannot abort the script before the diagnostic branch runs.
+echo -n "测试 9: ZeRO Stage 2 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 2 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'allgather_bucket_size': 50000000,
+        'reduce_bucket_size': 50000000,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 2}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 2 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# 测试 10: Conv2d + FP16 混合精度训练
+echo -n "测试 10: Conv2d + FP16 混合精度训练... "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# CIFAR-10 风格的 CNN 模型
+class CifarNet(nn.Module):
+    def __init__(self):
+        super(CifarNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = CifarNet()
+
+# FP16 配置（参考 DeepSpeed CIFAR 示例）
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'loss_scale_window': 500,
+        'hysteresis': 2,
+        'min_loss_scale': 1,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型（关键！参考官方 CIFAR 示例）
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype = torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype = torch.half
+
+# 验证 FP16 已启用
+assert model_engine.fp16_enabled(), 'FP16 not enabled'
+assert target_dtype == torch.half, 'target_dtype should be torch.half'
+
+# 训练循环
+model_engine.train()
+
+for step in range(3):
+    # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 关键：手动转换输入数据类型为 FP16（参考官方示例）
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+
+" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# 简化的 CNN 模型用于调试
+class SimpleConv(nn.Module):
+    def __init__(self):
+        super(SimpleConv, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.fc1 = nn.Linear(6 * 28 * 28, 10)
+    
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = x.view(-1, 6 * 28 * 28)
+        x = self.fc1(x)
+        return x
+
+model = SimpleConv()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    
+    # 获取目标数据类型
+    target_dtype = None
+    if model_engine.fp16_enabled():
+        target_dtype = torch.half
+    
+    model_engine.train()
+    
+    # 训练步骤
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+    
+    print(f'FP16 enabled: {model_engine.fp16_enabled()}')
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()
+"
+    exit 1
+fi
+
+echo ""
+echo "=========================================="
+echo "所有测试通过！"
+echo "=========================================="
-- 
Gitee


From 61507b067825a781da20467a28ec875eaae324dd Mon Sep 17 00:00:00 2001
From: Anstarc <anstarc@example.com>
Date: Fri, 8 May 2026 14:46:25 +0800
Subject: [PATCH 2/6] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=20Dockerfile=20?=
 =?UTF-8?q?=E4=B8=AD=20LD=5FLIBRARY=5FPATH=20=E6=9C=AA=E5=AE=9A=E4=B9=89?=
 =?UTF-8?q?=E5=8F=98=E9=87=8F=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 frameworks/deepspeed/0.18.4/Dockerfile | 2 +-
 frameworks/deepspeed/0.18.5/Dockerfile | 2 +-
 frameworks/deepspeed/0.18.6/Dockerfile | 2 +-
 frameworks/deepspeed/0.18.7/Dockerfile | 2 +-
 frameworks/deepspeed/0.18.8/Dockerfile | 2 +-
 frameworks/deepspeed/0.18.9/Dockerfile | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/frameworks/deepspeed/0.18.4/Dockerfile b/frameworks/deepspeed/0.18.4/Dockerfile
index b4d385f..4ab736d 100644
--- a/frameworks/deepspeed/0.18.4/Dockerfile
+++ b/frameworks/deepspeed/0.18.4/Dockerfile
@@ -17,7 +17,7 @@ RUN dnf install -y \
 
 # 设置 MPI 环境变量
 ENV PATH=/usr/lib64/openmpi/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
 
diff --git a/frameworks/deepspeed/0.18.5/Dockerfile b/frameworks/deepspeed/0.18.5/Dockerfile
index a215d56..87c8d20 100644
--- a/frameworks/deepspeed/0.18.5/Dockerfile
+++ b/frameworks/deepspeed/0.18.5/Dockerfile
@@ -17,7 +17,7 @@ RUN dnf install -y \
 
 # 设置 MPI 环境变量
 ENV PATH=/usr/lib64/openmpi/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
 
diff --git a/frameworks/deepspeed/0.18.6/Dockerfile b/frameworks/deepspeed/0.18.6/Dockerfile
index 8905449..636a8ba 100644
--- a/frameworks/deepspeed/0.18.6/Dockerfile
+++ b/frameworks/deepspeed/0.18.6/Dockerfile
@@ -17,7 +17,7 @@ RUN dnf install -y \
 
 # 设置 MPI 环境变量
 ENV PATH=/usr/lib64/openmpi/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
 
diff --git a/frameworks/deepspeed/0.18.7/Dockerfile b/frameworks/deepspeed/0.18.7/Dockerfile
index af3462b..ac3ce98 100644
--- a/frameworks/deepspeed/0.18.7/Dockerfile
+++ b/frameworks/deepspeed/0.18.7/Dockerfile
@@ -17,7 +17,7 @@ RUN dnf install -y \
 
 # 设置 MPI 环境变量
 ENV PATH=/usr/lib64/openmpi/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
 
diff --git a/frameworks/deepspeed/0.18.8/Dockerfile b/frameworks/deepspeed/0.18.8/Dockerfile
index eae2db7..5965047 100644
--- a/frameworks/deepspeed/0.18.8/Dockerfile
+++ b/frameworks/deepspeed/0.18.8/Dockerfile
@@ -17,7 +17,7 @@ RUN dnf install -y \
 
 # 设置 MPI 环境变量
 ENV PATH=/usr/lib64/openmpi/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
 
diff --git a/frameworks/deepspeed/0.18.9/Dockerfile b/frameworks/deepspeed/0.18.9/Dockerfile
index 2b1b7aa..b7d354e 100644
--- a/frameworks/deepspeed/0.18.9/Dockerfile
+++ b/frameworks/deepspeed/0.18.9/Dockerfile
@@ -17,7 +17,7 @@ RUN dnf install -y \
 
 # 设置 MPI 环境变量
 ENV PATH=/usr/lib64/openmpi/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
 
-- 
Gitee


From 37f978182c9c8d9eb1eea9396a9b6b117f9bcafc Mon Sep 17 00:00:00 2001
From: Anstarc <anstarc@example.com>
Date: Fri, 8 May 2026 15:12:32 +0800
Subject: [PATCH 3/6] =?UTF-8?q?=E4=BF=AE=E6=94=B9dockerfile=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 frameworks/deepspeed/0.18.4/Dockerfile | 6 ++++--
 frameworks/deepspeed/0.18.5/Dockerfile | 6 ++++--
 frameworks/deepspeed/0.18.6/Dockerfile | 6 ++++--
 frameworks/deepspeed/0.18.7/Dockerfile | 6 ++++--
 frameworks/deepspeed/0.18.8/Dockerfile | 6 ++++--
 frameworks/deepspeed/0.18.9/Dockerfile | 6 ++++--
 6 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/frameworks/deepspeed/0.18.4/Dockerfile b/frameworks/deepspeed/0.18.4/Dockerfile
index 4ab736d..ccd4607 100644
--- a/frameworks/deepspeed/0.18.4/Dockerfile
+++ b/frameworks/deepspeed/0.18.4/Dockerfile
@@ -16,7 +16,7 @@ RUN dnf install -y \
     && rm -rf /var/cache/yum/*
 
 # 设置 MPI 环境变量
-ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
@@ -39,7 +39,9 @@ RUN pip3.11 install --no-cache-dir \
     py-cpuinfo \
     pydantic \
     hjson \
-    mpi4py
+    mpi4py \
+    "setuptools<82" \
+    wheel
 
 # 编译安装 DeepSpeed
 RUN DS_BUILD_OPS=1 \
diff --git a/frameworks/deepspeed/0.18.5/Dockerfile b/frameworks/deepspeed/0.18.5/Dockerfile
index 87c8d20..97e8917 100644
--- a/frameworks/deepspeed/0.18.5/Dockerfile
+++ b/frameworks/deepspeed/0.18.5/Dockerfile
@@ -16,7 +16,7 @@ RUN dnf install -y \
     && rm -rf /var/cache/yum/*
 
 # 设置 MPI 环境变量
-ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
@@ -39,7 +39,9 @@ RUN pip3.11 install --no-cache-dir \
     py-cpuinfo \
     pydantic \
     hjson \
-    mpi4py
+    mpi4py \
+    "setuptools<82" \
+    wheel
 
 # 编译安装 DeepSpeed
 RUN DS_BUILD_OPS=1 \
diff --git a/frameworks/deepspeed/0.18.6/Dockerfile b/frameworks/deepspeed/0.18.6/Dockerfile
index 636a8ba..fafa95a 100644
--- a/frameworks/deepspeed/0.18.6/Dockerfile
+++ b/frameworks/deepspeed/0.18.6/Dockerfile
@@ -16,7 +16,7 @@ RUN dnf install -y \
     && rm -rf /var/cache/yum/*
 
 # 设置 MPI 环境变量
-ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
@@ -39,7 +39,9 @@ RUN pip3.11 install --no-cache-dir \
     py-cpuinfo \
     pydantic \
     hjson \
-    mpi4py
+    mpi4py \
+    "setuptools<82" \
+    wheel
 
 # 编译安装 DeepSpeed
 RUN DS_BUILD_OPS=1 \
diff --git a/frameworks/deepspeed/0.18.7/Dockerfile b/frameworks/deepspeed/0.18.7/Dockerfile
index ac3ce98..edfe910 100644
--- a/frameworks/deepspeed/0.18.7/Dockerfile
+++ b/frameworks/deepspeed/0.18.7/Dockerfile
@@ -16,7 +16,7 @@ RUN dnf install -y \
     && rm -rf /var/cache/yum/*
 
 # 设置 MPI 环境变量
-ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
@@ -39,7 +39,9 @@ RUN pip3.11 install --no-cache-dir \
     py-cpuinfo \
     pydantic \
     hjson \
-    mpi4py
+    mpi4py \
+    "setuptools<82" \
+    wheel
 
 # 编译安装 DeepSpeed
 RUN DS_BUILD_OPS=1 \
diff --git a/frameworks/deepspeed/0.18.8/Dockerfile b/frameworks/deepspeed/0.18.8/Dockerfile
index 5965047..8875191 100644
--- a/frameworks/deepspeed/0.18.8/Dockerfile
+++ b/frameworks/deepspeed/0.18.8/Dockerfile
@@ -16,7 +16,7 @@ RUN dnf install -y \
     && rm -rf /var/cache/yum/*
 
 # 设置 MPI 环境变量
-ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
@@ -39,7 +39,9 @@ RUN pip3.11 install --no-cache-dir \
     py-cpuinfo \
     pydantic \
     hjson \
-    mpi4py
+    mpi4py \
+    "setuptools<82" \
+    wheel
 
 # 编译安装 DeepSpeed
 RUN DS_BUILD_OPS=1 \
diff --git a/frameworks/deepspeed/0.18.9/Dockerfile b/frameworks/deepspeed/0.18.9/Dockerfile
index b7d354e..7aa8ce9 100644
--- a/frameworks/deepspeed/0.18.9/Dockerfile
+++ b/frameworks/deepspeed/0.18.9/Dockerfile
@@ -16,7 +16,7 @@ RUN dnf install -y \
     && rm -rf /var/cache/yum/*
 
 # 设置 MPI 环境变量
-ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
@@ -39,7 +39,9 @@ RUN pip3.11 install --no-cache-dir \
     py-cpuinfo \
     pydantic \
     hjson \
-    mpi4py
+    mpi4py \
+    "setuptools<82" \
+    wheel
 
 # 编译安装 DeepSpeed
 RUN DS_BUILD_OPS=1 \
-- 
Gitee


From bdd703cbc2eae46121e66683140b0c2f67d7084d Mon Sep 17 00:00:00 2001
From: Anstarc <anstarc@example.com>
Date: Fri, 8 May 2026 15:51:05 +0800
Subject: [PATCH 4/6] =?UTF-8?q?test:=20=E5=9C=A8=200.18.4=20=E4=B8=AD?=
 =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20DS=5FACCELERATOR=20=E7=8E=AF=E5=A2=83?=
 =?UTF-8?q?=E5=8F=98=E9=87=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 frameworks/deepspeed/0.18.4/Dockerfile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/frameworks/deepspeed/0.18.4/Dockerfile b/frameworks/deepspeed/0.18.4/Dockerfile
index ccd4607..4b35106 100644
--- a/frameworks/deepspeed/0.18.4/Dockerfile
+++ b/frameworks/deepspeed/0.18.4/Dockerfile
@@ -36,6 +36,7 @@ RUN pip3.11 install --no-cache-dir \
     evaluate \
     ninja \
     psutil \
+    deepspeed-kernels \
     py-cpuinfo \
     pydantic \
     hjson \
@@ -43,6 +44,9 @@ RUN pip3.11 install --no-cache-dir \
     "setuptools<82" \
     wheel
 
+# 设置 DeepSpeed 加速器环境变量
+ENV DS_ACCELERATOR=cuda
+
 # 编译安装 DeepSpeed
 RUN DS_BUILD_OPS=1 \
     MAX_JOBS=8 \
-- 
Gitee


From f26f6ecc93789354f32637886d6acbb0cd02bc73 Mon Sep 17 00:00:00 2001
From: Anstarc <anstarc@example.com>
Date: Fri, 8 May 2026 17:21:24 +0800
Subject: [PATCH 5/6] =?UTF-8?q?=E6=B7=BB=E5=8A=A0DS=5FACCELERATOR=E7=8E=AF?=
 =?UTF-8?q?=E5=A2=83=E5=8F=98=E9=87=8F=E5=B9=B6=E4=BF=AE=E5=A4=8DCUDA?=
 =?UTF-8?q?=E8=B7=AF=E5=BE=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 frameworks/deepspeed/0.18.4/test.sh    | 26 +++++++++++++-------------
 frameworks/deepspeed/0.18.5/Dockerfile |  4 ++++
 frameworks/deepspeed/0.18.5/test.sh    | 26 +++++++++++++-------------
 frameworks/deepspeed/0.18.6/Dockerfile |  4 ++++
 frameworks/deepspeed/0.18.6/test.sh    | 26 +++++++++++++-------------
 frameworks/deepspeed/0.18.7/Dockerfile |  4 ++++
 frameworks/deepspeed/0.18.7/test.sh    | 26 +++++++++++++-------------
 frameworks/deepspeed/0.18.8/Dockerfile |  4 ++++
 frameworks/deepspeed/0.18.8/test.sh    | 26 +++++++++++++-------------
 frameworks/deepspeed/0.18.9/Dockerfile |  4 ++++
 frameworks/deepspeed/0.18.9/test.sh    | 26 +++++++++++++-------------
 11 files changed, 98 insertions(+), 78 deletions(-)

diff --git a/frameworks/deepspeed/0.18.4/test.sh b/frameworks/deepspeed/0.18.4/test.sh
index a6f7c81..14516aa 100644
--- a/frameworks/deepspeed/0.18.4/test.sh
+++ b/frameworks/deepspeed/0.18.4/test.sh
@@ -21,7 +21,7 @@ fi
 
 # 测试 2: DeepSpeed 版本
 echo -n "测试 2: DeepSpeed 版本... "
-VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
+VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'")
 if [ "$VERSION" = "0.18.4" ]; then
     echo "✓ (版本: $VERSION)"
 else
@@ -57,7 +57,7 @@ fi
 
 # 测试 5: DeepSpeed 基础导入
 echo -n "测试 5: DeepSpeed 基础导入... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 
@@ -73,7 +73,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 print(f'DeepSpeed version: {deepspeed.__version__}')
@@ -85,7 +85,7 @@ fi
 
 # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
 echo -n "测试 6: 简单 CNN 模型初始化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -186,7 +186,7 @@ fi
 
 # 测试 7: 训练步骤（前向+反向+优化）
 echo -n "测试 7: 训练步骤... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -279,7 +279,7 @@ fi
 
 # 测试 8: ZeRO Stage 1 优化
 echo -n "测试 8: ZeRO Stage 1 优化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -323,7 +323,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -350,7 +350,7 @@ fi
 
 # 测试 9: ZeRO Stage 2 优化
 echo -n "测试 9: ZeRO Stage 2 优化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -423,7 +423,7 @@ fi
 
 # 测试 10: Conv2d + FP16 混合精度训练
 echo -n "测试 10: Conv2d + FP16 混合精度训练... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -529,7 +529,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/frameworks/deepspeed/0.18.5/Dockerfile b/frameworks/deepspeed/0.18.5/Dockerfile
index 97e8917..07a5091 100644
--- a/frameworks/deepspeed/0.18.5/Dockerfile
+++ b/frameworks/deepspeed/0.18.5/Dockerfile
@@ -36,6 +36,7 @@ RUN pip3.11 install --no-cache-dir \
     evaluate \
     ninja \
     psutil \
+    deepspeed-kernels \
     py-cpuinfo \
     pydantic \
     hjson \
@@ -43,6 +44,9 @@ RUN pip3.11 install --no-cache-dir \
     "setuptools<82" \
     wheel
 
+# 设置 DeepSpeed 加速器环境变量
+ENV DS_ACCELERATOR=cuda
+
 # 编译安装 DeepSpeed
 RUN DS_BUILD_OPS=1 \
     MAX_JOBS=8 \
diff --git a/frameworks/deepspeed/0.18.5/test.sh b/frameworks/deepspeed/0.18.5/test.sh
index b2cde17..bc757f7 100644
--- a/frameworks/deepspeed/0.18.5/test.sh
+++ b/frameworks/deepspeed/0.18.5/test.sh
@@ -21,7 +21,7 @@ fi
 
 # 测试 2: DeepSpeed 版本
 echo -n "测试 2: DeepSpeed 版本... "
-VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
+VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'")
 if [ "$VERSION" = "0.18.5" ]; then
     echo "✓ (版本: $VERSION)"
 else
@@ -57,7 +57,7 @@ fi
 
 # 测试 5: DeepSpeed 基础导入
 echo -n "测试 5: DeepSpeed 基础导入... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 
@@ -73,7 +73,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 print(f'DeepSpeed version: {deepspeed.__version__}')
@@ -85,7 +85,7 @@ fi
 
 # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
 echo -n "测试 6: 简单 CNN 模型初始化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -186,7 +186,7 @@ fi
 
 # 测试 7: 训练步骤（前向+反向+优化）
 echo -n "测试 7: 训练步骤... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -279,7 +279,7 @@ fi
 
 # 测试 8: ZeRO Stage 1 优化
 echo -n "测试 8: ZeRO Stage 1 优化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -323,7 +323,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -350,7 +350,7 @@ fi
 
 # 测试 9: ZeRO Stage 2 优化
 echo -n "测试 9: ZeRO Stage 2 优化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -423,7 +423,7 @@ fi
 
 # 测试 10: Conv2d + FP16 混合精度训练
 echo -n "测试 10: Conv2d + FP16 混合精度训练... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -529,7 +529,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/frameworks/deepspeed/0.18.6/Dockerfile b/frameworks/deepspeed/0.18.6/Dockerfile
index fafa95a..d76a4d0 100644
--- a/frameworks/deepspeed/0.18.6/Dockerfile
+++ b/frameworks/deepspeed/0.18.6/Dockerfile
@@ -36,6 +36,7 @@ RUN pip3.11 install --no-cache-dir \
     evaluate \
     ninja \
     psutil \
+    deepspeed-kernels \
     py-cpuinfo \
     pydantic \
     hjson \
@@ -43,6 +44,9 @@ RUN pip3.11 install --no-cache-dir \
     "setuptools<82" \
     wheel
 
+# 设置 DeepSpeed 加速器环境变量
+ENV DS_ACCELERATOR=cuda
+
 # 编译安装 DeepSpeed
 RUN DS_BUILD_OPS=1 \
     MAX_JOBS=8 \
diff --git a/frameworks/deepspeed/0.18.6/test.sh b/frameworks/deepspeed/0.18.6/test.sh
index c2a69d6..92e48ca 100644
--- a/frameworks/deepspeed/0.18.6/test.sh
+++ b/frameworks/deepspeed/0.18.6/test.sh
@@ -21,7 +21,7 @@ fi
 
 # 测试 2: DeepSpeed 版本
 echo -n "测试 2: DeepSpeed 版本... "
-VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
+VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'")
 if [ "$VERSION" = "0.18.6" ]; then
     echo "✓ (版本: $VERSION)"
 else
@@ -57,7 +57,7 @@ fi
 
 # 测试 5: DeepSpeed 基础导入
 echo -n "测试 5: DeepSpeed 基础导入... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 
@@ -73,7 +73,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 print(f'DeepSpeed version: {deepspeed.__version__}')
@@ -85,7 +85,7 @@ fi
 
 # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
 echo -n "测试 6: 简单 CNN 模型初始化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -186,7 +186,7 @@ fi
 
 # 测试 7: 训练步骤（前向+反向+优化）
 echo -n "测试 7: 训练步骤... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -279,7 +279,7 @@ fi
 
 # 测试 8: ZeRO Stage 1 优化
 echo -n "测试 8: ZeRO Stage 1 优化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -323,7 +323,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -350,7 +350,7 @@ fi
 
 # 测试 9: ZeRO Stage 2 优化
 echo -n "测试 9: ZeRO Stage 2 优化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -423,7 +423,7 @@ fi
 
 # 测试 10: Conv2d + FP16 混合精度训练
 echo -n "测试 10: Conv2d + FP16 混合精度训练... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -529,7 +529,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/frameworks/deepspeed/0.18.7/Dockerfile b/frameworks/deepspeed/0.18.7/Dockerfile
index edfe910..8bfea71 100644
--- a/frameworks/deepspeed/0.18.7/Dockerfile
+++ b/frameworks/deepspeed/0.18.7/Dockerfile
@@ -36,6 +36,7 @@ RUN pip3.11 install --no-cache-dir \
     evaluate \
     ninja \
     psutil \
+    deepspeed-kernels \
     py-cpuinfo \
     pydantic \
     hjson \
@@ -43,6 +44,9 @@ RUN pip3.11 install --no-cache-dir \
     "setuptools<82" \
     wheel
 
+# 设置 DeepSpeed 加速器环境变量
+ENV DS_ACCELERATOR=cuda
+
 # 编译安装 DeepSpeed
 RUN DS_BUILD_OPS=1 \
     MAX_JOBS=8 \
diff --git a/frameworks/deepspeed/0.18.7/test.sh b/frameworks/deepspeed/0.18.7/test.sh
index 11a5177..16c9b65 100644
--- a/frameworks/deepspeed/0.18.7/test.sh
+++ b/frameworks/deepspeed/0.18.7/test.sh
@@ -21,7 +21,7 @@ fi
 
 # 测试 2: DeepSpeed 版本
 echo -n "测试 2: DeepSpeed 版本... "
-VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
+VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'")
 if [ "$VERSION" = "0.18.7" ]; then
     echo "✓ (版本: $VERSION)"
 else
@@ -57,7 +57,7 @@ fi
 
 # 测试 5: DeepSpeed 基础导入
 echo -n "测试 5: DeepSpeed 基础导入... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 
@@ -73,7 +73,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 print(f'DeepSpeed version: {deepspeed.__version__}')
@@ -85,7 +85,7 @@ fi
 
 # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
 echo -n "测试 6: 简单 CNN 模型初始化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -186,7 +186,7 @@ fi
 
 # 测试 7: 训练步骤（前向+反向+优化）
 echo -n "测试 7: 训练步骤... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -279,7 +279,7 @@ fi
 
 # 测试 8: ZeRO Stage 1 优化
 echo -n "测试 8: ZeRO Stage 1 优化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -323,7 +323,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -350,7 +350,7 @@ fi
 
 # 测试 9: ZeRO Stage 2 优化
 echo -n "测试 9: ZeRO Stage 2 优化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -423,7 +423,7 @@ fi
 
 # 测试 10: Conv2d + FP16 混合精度训练
 echo -n "测试 10: Conv2d + FP16 混合精度训练... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -529,7 +529,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && exec python3.11 -c "$1"' _ "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/frameworks/deepspeed/0.18.8/Dockerfile b/frameworks/deepspeed/0.18.8/Dockerfile
index 8875191..a46adb2 100644
--- a/frameworks/deepspeed/0.18.8/Dockerfile
+++ b/frameworks/deepspeed/0.18.8/Dockerfile
@@ -36,6 +36,7 @@ RUN pip3.11 install --no-cache-dir \
     evaluate \
     ninja \
     psutil \
+    deepspeed-kernels \
     py-cpuinfo \
     pydantic \
     hjson \
@@ -43,6 +44,9 @@ RUN pip3.11 install --no-cache-dir \
     "setuptools<82" \
     wheel
 
+# Set the DeepSpeed accelerator environment variable
+ENV DS_ACCELERATOR=cuda
+
 # 编译安装 DeepSpeed
 RUN DS_BUILD_OPS=1 \
     MAX_JOBS=8 \
diff --git a/frameworks/deepspeed/0.18.8/test.sh b/frameworks/deepspeed/0.18.8/test.sh
index ffce0af..8c8cc23 100644
--- a/frameworks/deepspeed/0.18.8/test.sh
+++ b/frameworks/deepspeed/0.18.8/test.sh
@@ -21,7 +21,7 @@ fi
 
 # 测试 2: DeepSpeed 版本
 echo -n "测试 2: DeepSpeed 版本... "
-VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
+VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'")
 if [ "$VERSION" = "0.18.8" ]; then
     echo "✓ (版本: $VERSION)"
 else
@@ -57,7 +57,7 @@ fi
 
 # 测试 5: DeepSpeed 基础导入
 echo -n "测试 5: DeepSpeed 基础导入... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 
@@ -73,7 +73,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 print(f'DeepSpeed version: {deepspeed.__version__}')
@@ -85,7 +85,7 @@ fi
 
 # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
 echo -n "测试 6: 简单 CNN 模型初始化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -186,7 +186,7 @@ fi
 
 # 测试 7: 训练步骤（前向+反向+优化）
 echo -n "测试 7: 训练步骤... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -279,7 +279,7 @@ fi
 
 # 测试 8: ZeRO Stage 1 优化
 echo -n "测试 8: ZeRO Stage 1 优化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -323,7 +323,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -350,7 +350,7 @@ fi
 
 # 测试 9: ZeRO Stage 2 优化
 echo -n "测试 9: ZeRO Stage 2 优化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -423,7 +423,7 @@ fi
 
 # 测试 10: Conv2d + FP16 混合精度训练
 echo -n "测试 10: Conv2d + FP16 混合精度训练... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -529,7 +529,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/frameworks/deepspeed/0.18.9/Dockerfile b/frameworks/deepspeed/0.18.9/Dockerfile
index 7aa8ce9..75aad1c 100644
--- a/frameworks/deepspeed/0.18.9/Dockerfile
+++ b/frameworks/deepspeed/0.18.9/Dockerfile
@@ -36,6 +36,7 @@ RUN pip3.11 install --no-cache-dir \
     evaluate \
     ninja \
     psutil \
+    deepspeed-kernels \
     py-cpuinfo \
     pydantic \
     hjson \
@@ -43,6 +44,9 @@ RUN pip3.11 install --no-cache-dir \
     "setuptools<82" \
     wheel
 
+# Set the DeepSpeed accelerator environment variable
+ENV DS_ACCELERATOR=cuda
+
 # 编译安装 DeepSpeed
 RUN DS_BUILD_OPS=1 \
     MAX_JOBS=8 \
diff --git a/frameworks/deepspeed/0.18.9/test.sh b/frameworks/deepspeed/0.18.9/test.sh
index cd9ee2d..bd3aae0 100644
--- a/frameworks/deepspeed/0.18.9/test.sh
+++ b/frameworks/deepspeed/0.18.9/test.sh
@@ -21,7 +21,7 @@ fi
 
 # 测试 2: DeepSpeed 版本
 echo -n "测试 2: DeepSpeed 版本... "
-VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
+VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'")
 if [ "$VERSION" = "0.18.9" ]; then
     echo "✓ (版本: $VERSION)"
 else
@@ -57,7 +57,7 @@ fi
 
 # 测试 5: DeepSpeed 基础导入
 echo -n "测试 5: DeepSpeed 基础导入... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 
@@ -73,7 +73,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 print(f'DeepSpeed version: {deepspeed.__version__}')
@@ -85,7 +85,7 @@ fi
 
 # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
 echo -n "测试 6: 简单 CNN 模型初始化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -186,7 +186,7 @@ fi
 
 # 测试 7: 训练步骤（前向+反向+优化）
 echo -n "测试 7: 训练步骤... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -279,7 +279,7 @@ fi
 
 # 测试 8: ZeRO Stage 1 优化
 echo -n "测试 8: ZeRO Stage 1 优化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -323,7 +323,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -350,7 +350,7 @@ fi
 
 # 测试 9: ZeRO Stage 2 优化
 echo -n "测试 9: ZeRO Stage 2 优化... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -423,7 +423,7 @@ fi
 
 # 测试 10: Conv2d + FP16 混合精度训练
 echo -n "测试 10: Conv2d + FP16 混合精度训练... "
-docker run --rm --gpus all "$IMAGE" python3.11 -c "
+docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -529,7 +529,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-- 
Gitee


From 29dedaee645df9727f7c34ed4befd30e37c0d0ce Mon Sep 17 00:00:00 2001
From: Anstarc <anstarc@example.com>
Date: Fri, 8 May 2026 17:57:22 +0800
Subject: [PATCH 6/6] =?UTF-8?q?=E6=B7=BB=E5=8A=A0CUDA=E5=BA=93=E8=B7=AF?=
 =?UTF-8?q?=E5=BE=84=E5=88=B0LD=5FLIBRARY=5FPATH?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 frameworks/deepspeed/0.18.4/Dockerfile |  2 +-
 frameworks/deepspeed/0.18.4/test.sh    | 26 +++++++++++++-------------
 frameworks/deepspeed/0.18.5/Dockerfile |  2 +-
 frameworks/deepspeed/0.18.5/test.sh    | 26 +++++++++++++-------------
 frameworks/deepspeed/0.18.6/Dockerfile |  2 +-
 frameworks/deepspeed/0.18.6/test.sh    | 26 +++++++++++++-------------
 frameworks/deepspeed/0.18.7/Dockerfile |  2 +-
 frameworks/deepspeed/0.18.7/test.sh    | 26 +++++++++++++-------------
 frameworks/deepspeed/0.18.8/Dockerfile |  2 +-
 frameworks/deepspeed/0.18.8/test.sh    | 26 +++++++++++++-------------
 frameworks/deepspeed/0.18.9/Dockerfile |  2 +-
 frameworks/deepspeed/0.18.9/test.sh    | 26 +++++++++++++-------------
 12 files changed, 84 insertions(+), 84 deletions(-)

diff --git a/frameworks/deepspeed/0.18.4/Dockerfile b/frameworks/deepspeed/0.18.4/Dockerfile
index 4b35106..3f958b4 100644
--- a/frameworks/deepspeed/0.18.4/Dockerfile
+++ b/frameworks/deepspeed/0.18.4/Dockerfile
@@ -17,7 +17,7 @@ RUN dnf install -y \
 
 # 设置 MPI 环境变量
 ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
 
diff --git a/frameworks/deepspeed/0.18.4/test.sh b/frameworks/deepspeed/0.18.4/test.sh
index 14516aa..a6f7c81 100644
--- a/frameworks/deepspeed/0.18.4/test.sh
+++ b/frameworks/deepspeed/0.18.4/test.sh
@@ -21,7 +21,7 @@ fi
 
 # 测试 2: DeepSpeed 版本
 echo -n "测试 2: DeepSpeed 版本... "
-VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'")
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
 if [ "$VERSION" = "0.18.4" ]; then
     echo "✓ (版本: $VERSION)"
 else
@@ -57,7 +57,7 @@ fi
 
 # 测试 5: DeepSpeed 基础导入
 echo -n "测试 5: DeepSpeed 基础导入... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 
@@ -73,7 +73,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 print(f'DeepSpeed version: {deepspeed.__version__}')
@@ -85,7 +85,7 @@ fi
 
 # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
 echo -n "测试 6: 简单 CNN 模型初始化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -186,7 +186,7 @@ fi
 
 # 测试 7: 训练步骤（前向+反向+优化）
 echo -n "测试 7: 训练步骤... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -279,7 +279,7 @@ fi
 
 # 测试 8: ZeRO Stage 1 优化
 echo -n "测试 8: ZeRO Stage 1 优化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -323,7 +323,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -350,7 +350,7 @@ fi
 
 # 测试 9: ZeRO Stage 2 优化
 echo -n "测试 9: ZeRO Stage 2 优化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -423,7 +423,7 @@ fi
 
 # 测试 10: Conv2d + FP16 混合精度训练
 echo -n "测试 10: Conv2d + FP16 混合精度训练... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -529,7 +529,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/frameworks/deepspeed/0.18.5/Dockerfile b/frameworks/deepspeed/0.18.5/Dockerfile
index 07a5091..bfc4238 100644
--- a/frameworks/deepspeed/0.18.5/Dockerfile
+++ b/frameworks/deepspeed/0.18.5/Dockerfile
@@ -17,7 +17,7 @@ RUN dnf install -y \
 
 # 设置 MPI 环境变量
 ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
 
diff --git a/frameworks/deepspeed/0.18.5/test.sh b/frameworks/deepspeed/0.18.5/test.sh
index bc757f7..b2cde17 100644
--- a/frameworks/deepspeed/0.18.5/test.sh
+++ b/frameworks/deepspeed/0.18.5/test.sh
@@ -21,7 +21,7 @@ fi
 
 # 测试 2: DeepSpeed 版本
 echo -n "测试 2: DeepSpeed 版本... "
-VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'")
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
 if [ "$VERSION" = "0.18.5" ]; then
     echo "✓ (版本: $VERSION)"
 else
@@ -57,7 +57,7 @@ fi
 
 # 测试 5: DeepSpeed 基础导入
 echo -n "测试 5: DeepSpeed 基础导入... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 
@@ -73,7 +73,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 print(f'DeepSpeed version: {deepspeed.__version__}')
@@ -85,7 +85,7 @@ fi
 
 # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
 echo -n "测试 6: 简单 CNN 模型初始化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -186,7 +186,7 @@ fi
 
 # 测试 7: 训练步骤（前向+反向+优化）
 echo -n "测试 7: 训练步骤... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -279,7 +279,7 @@ fi
 
 # 测试 8: ZeRO Stage 1 优化
 echo -n "测试 8: ZeRO Stage 1 优化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -323,7 +323,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -350,7 +350,7 @@ fi
 
 # 测试 9: ZeRO Stage 2 优化
 echo -n "测试 9: ZeRO Stage 2 优化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -423,7 +423,7 @@ fi
 
 # 测试 10: Conv2d + FP16 混合精度训练
 echo -n "测试 10: Conv2d + FP16 混合精度训练... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -529,7 +529,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/frameworks/deepspeed/0.18.6/Dockerfile b/frameworks/deepspeed/0.18.6/Dockerfile
index d76a4d0..b93a2bf 100644
--- a/frameworks/deepspeed/0.18.6/Dockerfile
+++ b/frameworks/deepspeed/0.18.6/Dockerfile
@@ -17,7 +17,7 @@ RUN dnf install -y \
 
 # 设置 MPI 环境变量
 ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
 
diff --git a/frameworks/deepspeed/0.18.6/test.sh b/frameworks/deepspeed/0.18.6/test.sh
index 92e48ca..c2a69d6 100644
--- a/frameworks/deepspeed/0.18.6/test.sh
+++ b/frameworks/deepspeed/0.18.6/test.sh
@@ -21,7 +21,7 @@ fi
 
 # 测试 2: DeepSpeed 版本
 echo -n "测试 2: DeepSpeed 版本... "
-VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'")
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
 if [ "$VERSION" = "0.18.6" ]; then
     echo "✓ (版本: $VERSION)"
 else
@@ -57,7 +57,7 @@ fi
 
 # 测试 5: DeepSpeed 基础导入
 echo -n "测试 5: DeepSpeed 基础导入... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 
@@ -73,7 +73,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 print(f'DeepSpeed version: {deepspeed.__version__}')
@@ -85,7 +85,7 @@ fi
 
 # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
 echo -n "测试 6: 简单 CNN 模型初始化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -186,7 +186,7 @@ fi
 
 # 测试 7: 训练步骤（前向+反向+优化）
 echo -n "测试 7: 训练步骤... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -279,7 +279,7 @@ fi
 
 # 测试 8: ZeRO Stage 1 优化
 echo -n "测试 8: ZeRO Stage 1 优化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -323,7 +323,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -350,7 +350,7 @@ fi
 
 # 测试 9: ZeRO Stage 2 优化
 echo -n "测试 9: ZeRO Stage 2 优化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -423,7 +423,7 @@ fi
 
 # 测试 10: Conv2d + FP16 混合精度训练
 echo -n "测试 10: Conv2d + FP16 混合精度训练... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -529,7 +529,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/frameworks/deepspeed/0.18.7/Dockerfile b/frameworks/deepspeed/0.18.7/Dockerfile
index 8bfea71..feb3dac 100644
--- a/frameworks/deepspeed/0.18.7/Dockerfile
+++ b/frameworks/deepspeed/0.18.7/Dockerfile
@@ -17,7 +17,7 @@ RUN dnf install -y \
 
 # 设置 MPI 环境变量
 ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
 
diff --git a/frameworks/deepspeed/0.18.7/test.sh b/frameworks/deepspeed/0.18.7/test.sh
index 16c9b65..11a5177 100644
--- a/frameworks/deepspeed/0.18.7/test.sh
+++ b/frameworks/deepspeed/0.18.7/test.sh
@@ -21,7 +21,7 @@ fi
 
 # 测试 2: DeepSpeed 版本
 echo -n "测试 2: DeepSpeed 版本... "
-VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'")
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
 if [ "$VERSION" = "0.18.7" ]; then
     echo "✓ (版本: $VERSION)"
 else
@@ -57,7 +57,7 @@ fi
 
 # 测试 5: DeepSpeed 基础导入
 echo -n "测试 5: DeepSpeed 基础导入... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 
@@ -73,7 +73,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 print(f'DeepSpeed version: {deepspeed.__version__}')
@@ -85,7 +85,7 @@ fi
 
 # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
 echo -n "测试 6: 简单 CNN 模型初始化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -186,7 +186,7 @@ fi
 
 # 测试 7: 训练步骤（前向+反向+优化）
 echo -n "测试 7: 训练步骤... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -279,7 +279,7 @@ fi
 
 # 测试 8: ZeRO Stage 1 优化
 echo -n "测试 8: ZeRO Stage 1 优化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -323,7 +323,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -350,7 +350,7 @@ fi
 
 # 测试 9: ZeRO Stage 2 优化
 echo -n "测试 9: ZeRO Stage 2 优化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -423,7 +423,7 @@ fi
 
 # 测试 10: Conv2d + FP16 混合精度训练
 echo -n "测试 10: Conv2d + FP16 混合精度训练... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -529,7 +529,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/frameworks/deepspeed/0.18.8/Dockerfile b/frameworks/deepspeed/0.18.8/Dockerfile
index a46adb2..78c36cb 100644
--- a/frameworks/deepspeed/0.18.8/Dockerfile
+++ b/frameworks/deepspeed/0.18.8/Dockerfile
@@ -17,7 +17,7 @@ RUN dnf install -y \
 
 # 设置 MPI 环境变量
 ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
 
diff --git a/frameworks/deepspeed/0.18.8/test.sh b/frameworks/deepspeed/0.18.8/test.sh
index 8c8cc23..ffce0af 100644
--- a/frameworks/deepspeed/0.18.8/test.sh
+++ b/frameworks/deepspeed/0.18.8/test.sh
@@ -21,7 +21,7 @@ fi
 
 # 测试 2: DeepSpeed 版本
 echo -n "测试 2: DeepSpeed 版本... "
-VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'")
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
 if [ "$VERSION" = "0.18.8" ]; then
     echo "✓ (版本: $VERSION)"
 else
@@ -57,7 +57,7 @@ fi
 
 # 测试 5: DeepSpeed 基础导入
 echo -n "测试 5: DeepSpeed 基础导入... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 
@@ -73,7 +73,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 print(f'DeepSpeed version: {deepspeed.__version__}')
@@ -85,7 +85,7 @@ fi
 
 # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
 echo -n "测试 6: 简单 CNN 模型初始化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -186,7 +186,7 @@ fi
 
 # 测试 7: 训练步骤（前向+反向+优化）
 echo -n "测试 7: 训练步骤... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -279,7 +279,7 @@ fi
 
 # 测试 8: ZeRO Stage 1 优化
 echo -n "测试 8: ZeRO Stage 1 优化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -323,7 +323,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -350,7 +350,7 @@ fi
 
 # 测试 9: ZeRO Stage 2 优化
 echo -n "测试 9: ZeRO Stage 2 优化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -423,7 +423,7 @@ fi
 
 # 测试 10: Conv2d + FP16 混合精度训练
 echo -n "测试 10: Conv2d + FP16 混合精度训练... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -529,7 +529,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/frameworks/deepspeed/0.18.9/Dockerfile b/frameworks/deepspeed/0.18.9/Dockerfile
index 75aad1c..1c3c236 100644
--- a/frameworks/deepspeed/0.18.9/Dockerfile
+++ b/frameworks/deepspeed/0.18.9/Dockerfile
@@ -17,7 +17,7 @@ RUN dnf install -y \
 
 # 设置 MPI 环境变量
 ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
 
 RUN ln -s /usr/bin/python3.11 /usr/bin/python
 
diff --git a/frameworks/deepspeed/0.18.9/test.sh b/frameworks/deepspeed/0.18.9/test.sh
index bd3aae0..cd9ee2d 100644
--- a/frameworks/deepspeed/0.18.9/test.sh
+++ b/frameworks/deepspeed/0.18.9/test.sh
@@ -21,7 +21,7 @@ fi
 
 # 测试 2: DeepSpeed 版本
 echo -n "测试 2: DeepSpeed 版本... "
-VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'")
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
 if [ "$VERSION" = "0.18.9" ]; then
     echo "✓ (版本: $VERSION)"
 else
@@ -57,7 +57,7 @@ fi
 
 # 测试 5: DeepSpeed 基础导入
 echo -n "测试 5: DeepSpeed 基础导入... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 
@@ -73,7 +73,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 print(f'DeepSpeed version: {deepspeed.__version__}')
@@ -85,7 +85,7 @@ fi
 
 # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
 echo -n "测试 6: 简单 CNN 模型初始化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -186,7 +186,7 @@ fi
 
 # 测试 7: 训练步骤（前向+反向+优化）
 echo -n "测试 7: 训练步骤... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -279,7 +279,7 @@ fi
 
 # 测试 8: ZeRO Stage 1 优化
 echo -n "测试 8: ZeRO Stage 1 优化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -323,7 +323,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -350,7 +350,7 @@ fi
 
 # 测试 9: ZeRO Stage 2 优化
 echo -n "测试 9: ZeRO Stage 2 优化... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import deepspeed
@@ -423,7 +423,7 @@ fi
 
 # 测试 10: Conv2d + FP16 混合精度训练
 echo -n "测试 10: Conv2d + FP16 混合精度训练... "
-docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -529,7 +529,7 @@ if [ $? -eq 0 ]; then
     echo "✓"
 else
     echo "✗"
-    docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-- 
Gitee