diff --git a/frameworks/deepspeed/0.18.4/Dockerfile b/frameworks/deepspeed/0.18.4/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..3f958b4d6dbaf8f3299051b5c252ab97dd360530
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/Dockerfile
@@ -0,0 +1,60 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+# Image metadata: maintainer contact plus OCI source/description annotations.
+LABEL maintainer="Anstarc <anstarc@vip.qq.com>" \
+      org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" \
+      org.opencontainers.image.description="DeepSpeed 0.18.4 GPU on OpenCloudOS 9"
+
+# System packages (sorted); dnf caches under /var/cache/dnf, so purge that in the same layer.
+RUN dnf install -y \
+    git \
+    openmpi \
+    openmpi-devel \
+    python3.11 \
+    python3.11-pip \
+    wget \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf/* /var/cache/yum/*
+
+# Put OpenMPI and CUDA first on the search paths; keep any LD_LIBRARY_PATH inherited from the base image.
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+# Expose python3.11 as `python`; -f replaces an existing entry instead of failing the build.
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory for the remaining build steps and the container's default cwd.
+WORKDIR /app
+
+# Install PyTorch and torchvision from the CUDA 12.8 (cu128) wheel index; versions are not pinned.
+RUN pip3.11 install --no-cache-dir \
+    --index-url https://download.pytorch.org/whl/cu128 \
+    torch torchvision
+
+# Python packages installed ahead of the DeepSpeed build (alphabetical for easier diffs).
+RUN pip3.11 install --no-cache-dir \
+    datasets \
+    deepspeed-kernels \
+    evaluate \
+    hjson \
+    mpi4py \
+    ninja \
+    psutil \
+    py-cpuinfo \
+    pydantic \
+    "setuptools<82" \
+    transformers \
+    wheel
+
+# Select the CUDA accelerator backend for DeepSpeed (persists into the runtime environment).
+ENV DS_ACCELERATOR=cuda
+
+# Build DeepSpeed 0.18.4 with DS_BUILD_OPS=1 and MAX_JOBS=8; --no-cache-dir keeps pip's cache out of the layer.
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir --no-build-isolation deepspeed==0.18.4
+
+# GPU-related runtime environment, set in a single ENV instruction.
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    CUDA_MODULE_LOADING=LAZY
+
+# Default command: import DeepSpeed and print its version as a readiness message.
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.4/README.md b/frameworks/deepspeed/0.18.4/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9c6594f26b109fe526802a9665677aeb9d9a5b3c
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.4 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库，提供 ZeRO 内存优化、混合精度训练、流水线并行等功能，支持超大规模模型训练。
+
+## 基本信息
+
+- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**: 3.11
+- **CUDA 版本**: 12.8
+- **DeepSpeed 版本**: 0.18.4
+- **PyTorch 版本**: 构建时从 cu128 索引安装的最新稳定版（未固定版本号，CUDA 12.8）
+- **MPI**: OpenMPI（支持多节点训练）
+
+## 构建
+
+```bash
+docker build -t oc9-deepspeed:0.18.4 .
+```
+
+## 测试
+
+```bash
+./test.sh oc9-deepspeed:0.18.4
+```
+
+测试项包括：
+- Python 和 CUDA 环境
+- DeepSpeed 版本验证
+- GPU 可用性检查
+- DeepSpeed 初始化
+- CNN 模型训练（CIFAR-10 风格）
+- ZeRO Stage 1/2 优化
+- FP16 混合精度训练
+
+## 使用示例
+
+### 基本使用
+
+```bash
+# 查看版本信息
+docker run --rm oc9-deepspeed:0.18.4 python3.11 -c "import deepspeed; print(deepspeed.__version__)"
+
+# 查看可用设备
+docker run --rm --gpus all oc9-deepspeed:0.18.4 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+```
+
+### 单 GPU 训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 定义模型
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练循环（dataloader 与 criterion 需自行定义；输入需先移动到 model_engine.device）
+for batch in dataloader:
+    inputs, labels = batch
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练（单节点）
+
+```bash
+# 使用 deepspeed launcher（镜像默认工作目录为 /app，需用 -w 切换到挂载目录）
+docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.4 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.4 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 micro_batch(4) × 梯度累积(2) × GPU 数（此处为 4 卡）
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.4 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化，支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+，并安装 NVIDIA Container Toolkit（`--gpus` 参数需要；nvidia-docker2 已弃用）
+- **MPI**: 已包含 OpenMPI（多节点训练需要）
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.4/build.conf b/frameworks/deepspeed/0.18.4/build.conf
new file mode 100644
index 0000000000000000000000000000000000000000..2c23f5bc039712daab91c3e7ec3d71067485881d
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/build.conf
@@ -0,0 +1,4 @@
+# DeepSpeed 0.18.4 on OpenCloudOS 9 (GPU); name/tag match the oc9-deepspeed:0.18.4 default used by test.sh
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.4
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.4/test.sh b/frameworks/deepspeed/0.18.4/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a6f7c81e1f51f99ad5bf7248e54d9c8e71863c1e
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+set -e
+
+IMAGE="${1:-oc9-deepspeed:0.18.4}"
+
+echo "=========================================="
+echo "测试 DeepSpeed 0.18.4 GPU 镜像"
+echo "镜像: $IMAGE"
+echo "=========================================="
+
+# Test 1: the Python 3.11 interpreter starts inside the image.
+echo -n "测试 1: Python 环境... "
+# Run the check as the `if` condition: under `set -e` a separate `$?` test is never reached on failure.
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version || true
+    exit 1
+fi
+
+# 测试 2: DeepSpeed 版本
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
+if [ "$VERSION" = "0.18.4" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.4, 实际: $VERSION)"
+    exit 1
+fi
+
+# Test 3: torch and transformers can be imported.
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+# Run the check as the `if` condition: under `set -e` a separate `$?` test is never reached on failure.
+if docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" || true
+    exit 1
+fi
+
+# Test 4: CUDA is available to torch with --gpus all; prints the name of device 0.
+echo -n "测试 4: CUDA 环境... "
+# Capture inside the `if` condition so `set -e` cannot abort before the ✗ branch runs.
+if GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+"); then
+    echo "✓"
+    echo "         GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: DeepSpeed imports, exposes initialize/init_distributed, and returns an accelerator.
+echo -n "测试 5: DeepSpeed 基础导入... "
+# Run the check as the `if` condition: under `set -e` a separate `$?` test is never reached on failure.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# 验证核心模块可用
+assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found'
+assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found'
+
+# 验证加速器
+accelerator = get_accelerator()
+assert accelerator is not None, 'Accelerator not available'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+print(f'DeepSpeed version: {deepspeed.__version__}')
+accelerator = get_accelerator()
+print(f'Accelerator: {accelerator}')
+" || true
+    exit 1
+fi
+
+# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化
+echo -n "测试 6: 简单 CNN 模型初始化... "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 创建简单 CNN 模型（类似 CIFAR-10 示例）
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 4,
+    'gradient_accumulation_steps': 1,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+assert model_engine is not None, 'Model engine initialization failed'
+" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.fc = nn.Linear(10, 2)
+    def forward(self, x):
+        return self.fc(x)
+
+model = SimpleNet()
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('Model initialized successfully')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# Test 7: three forward/backward/step iterations on random data with a small CNN.
+echo -n "测试 7: 训练步骤... "
+# Run the check as the `if` condition: under `set -e` a separate `$?` test is never reached on failure.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 模拟训练步骤
+for step in range(3):
+    # 创建假数据 (batch_size=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(10, 2)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+inputs = torch.randn(4, 10).to(model_engine.device)
+labels = torch.randint(0, 2, (4,)).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = torch.nn.functional.cross_entropy(outputs, labels)
+model_engine.backward(loss)
+model_engine.step()
+print('Training step completed')
+" || true
+    exit 1
+fi
+
+# Test 8: one training step with a ZeRO stage 1 configuration.
+echo -n "测试 8: ZeRO Stage 1 优化... "
+# Run the check as the `if` condition: under `set -e` a separate `$?` test is never reached on failure.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 1 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 1,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 简单训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 1}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 1 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# 测试 9: ZeRO Stage 2 优化
+echo -n "测试 9: ZeRO Stage 2 优化... "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 2 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'allgather_bucket_size': 50000000,
+        'reduce_bucket_size': 50000000,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 2}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 2 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# Test 10: three FP16 training steps on a CIFAR-style CNN, with inputs cast to torch.half.
+echo -n "测试 10: Conv2d + FP16 混合精度训练... "
+# Run the check as the `if` condition: under `set -e` a separate `$?` test is never reached on failure.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# CIFAR-10 风格的 CNN 模型
+class CifarNet(nn.Module):
+    def __init__(self):
+        super(CifarNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = CifarNet()
+
+# FP16 配置（参考 DeepSpeed CIFAR 示例）
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'loss_scale_window': 500,
+        'hysteresis': 2,
+        'min_loss_scale': 1,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型（关键！参考官方 CIFAR 示例）
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype = torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype = torch.half
+
+# 验证 FP16 已启用
+assert model_engine.fp16_enabled(), 'FP16 not enabled'
+assert target_dtype == torch.half, 'target_dtype should be torch.half'
+
+# 训练循环
+model_engine.train()
+
+for step in range(3):
+    # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 关键：手动转换输入数据类型为 FP16（参考官方示例）
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# 简化的 CNN 模型用于调试
+class SimpleConv(nn.Module):
+    def __init__(self):
+        super(SimpleConv, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.fc1 = nn.Linear(6 * 28 * 28, 10)
+    
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = x.view(-1, 6 * 28 * 28)
+        x = self.fc1(x)
+        return x
+
+model = SimpleConv()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    
+    # 获取目标数据类型
+    target_dtype = None
+    if model_engine.fp16_enabled():
+        target_dtype = torch.half
+    
+    model_engine.train()
+    
+    # 训练步骤
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+    
+    print(f'FP16 enabled: {model_engine.fp16_enabled()}')
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()
+" || true
+    exit 1
+fi
+
+# Reached only when every test above passed: print the closing banner.
+echo
+FOOTER_RULE="=========================================="
+printf '%s\n' "$FOOTER_RULE" "所有测试通过！" "$FOOTER_RULE"
diff --git a/frameworks/deepspeed/0.18.5/Dockerfile b/frameworks/deepspeed/0.18.5/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..bfc4238e3526244d6b50157888d9f2406e12d858
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/Dockerfile
@@ -0,0 +1,60 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+# Image metadata: maintainer contact plus OCI source/description annotations.
+LABEL maintainer="Anstarc <anstarc@vip.qq.com>" \
+      org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" \
+      org.opencontainers.image.description="DeepSpeed 0.18.5 GPU on OpenCloudOS 9"
+
+# System packages (sorted); dnf caches under /var/cache/dnf, so purge that in the same layer.
+RUN dnf install -y \
+    git \
+    openmpi \
+    openmpi-devel \
+    python3.11 \
+    python3.11-pip \
+    wget \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf/* /var/cache/yum/*
+
+# Put OpenMPI and CUDA first on the search paths; keep any LD_LIBRARY_PATH inherited from the base image.
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+# Expose python3.11 as `python`; -f replaces an existing entry instead of failing the build.
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory for the remaining build steps and the container's default cwd.
+WORKDIR /app
+
+# Install PyTorch and torchvision from the CUDA 12.8 (cu128) wheel index; versions are not pinned.
+RUN pip3.11 install --no-cache-dir \
+    --index-url https://download.pytorch.org/whl/cu128 \
+    torch torchvision
+
+# Python packages installed ahead of the DeepSpeed build (alphabetical for easier diffs).
+RUN pip3.11 install --no-cache-dir \
+    datasets \
+    deepspeed-kernels \
+    evaluate \
+    hjson \
+    mpi4py \
+    ninja \
+    psutil \
+    py-cpuinfo \
+    pydantic \
+    "setuptools<82" \
+    transformers \
+    wheel
+
+# Select the CUDA accelerator backend for DeepSpeed (persists into the runtime environment).
+ENV DS_ACCELERATOR=cuda
+
+# Build DeepSpeed 0.18.5 with DS_BUILD_OPS=1 and MAX_JOBS=8; --no-cache-dir keeps pip's cache out of the layer.
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir --no-build-isolation deepspeed==0.18.5
+
+# GPU-related runtime environment, set in a single ENV instruction.
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    CUDA_MODULE_LOADING=LAZY
+
+# Default command: import DeepSpeed and print its version as a readiness message.
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.5/README.md b/frameworks/deepspeed/0.18.5/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5a4760c96dcbb00e1100821b5928332739dea605
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.5 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库，提供 ZeRO 内存优化、混合精度训练、流水线并行等功能，支持超大规模模型训练。
+
+## 基本信息
+
+- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**: 3.11
+- **CUDA 版本**: 12.8
+- **DeepSpeed 版本**: 0.18.5
+- **PyTorch 版本**: 构建时从 cu128 索引安装的最新稳定版（未固定版本号，CUDA 12.8）
+- **MPI**: OpenMPI（支持多节点训练）
+
+## 构建
+
+```bash
+docker build -t oc9-deepspeed:0.18.5 .
+```
+
+## 测试
+
+```bash
+./test.sh oc9-deepspeed:0.18.5
+```
+
+测试项包括：
+- Python 和 CUDA 环境
+- DeepSpeed 版本验证
+- GPU 可用性检查
+- DeepSpeed 初始化
+- CNN 模型训练（CIFAR-10 风格）
+- ZeRO Stage 1/2 优化
+- FP16 混合精度训练
+
+## 使用示例
+
+### 基本使用
+
+```bash
+# 查看版本信息
+docker run --rm oc9-deepspeed:0.18.5 python3.11 -c "import deepspeed; print(deepspeed.__version__)"
+
+# 查看可用设备
+docker run --rm --gpus all oc9-deepspeed:0.18.5 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+```
+
+### 单 GPU 训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 定义模型
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练循环（dataloader 与 criterion 需自行定义；输入需先移动到 model_engine.device）
+for batch in dataloader:
+    inputs, labels = batch
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练（单节点）
+
+```bash
+# 使用 deepspeed launcher（镜像默认工作目录为 /app，需用 -w 切换到挂载目录）
+docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.5 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.5 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 micro_batch(4) × 梯度累积(2) × GPU 数（此处为 4 卡）
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.5 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化，支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+，并安装 NVIDIA Container Toolkit（`--gpus` 参数需要；nvidia-docker2 已弃用）
+- **MPI**: 已包含 OpenMPI（多节点训练需要）
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.5/build.conf b/frameworks/deepspeed/0.18.5/build.conf
new file mode 100644
index 0000000000000000000000000000000000000000..c52c73493f31969074f9ed0a95de3b3aa86a59ed
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/build.conf
@@ -0,0 +1,4 @@
+# DeepSpeed 0.18.5 on OpenCloudOS 9 (GPU); name/tag match the oc9-deepspeed:0.18.5 default used by test.sh
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.5
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.5/test.sh b/frameworks/deepspeed/0.18.5/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b2cde173face9763e2fc2dfe69e3026826f6380d
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+set -e  # exit immediately when a command fails outside of a conditional
+
+IMAGE="${1:-oc9-deepspeed:0.18.5}"  # image under test; the first argument overrides the default tag
+
+echo "=========================================="  # opening banner
+echo "测试 DeepSpeed 0.18.5 GPU 镜像"
+echo "镜像: $IMAGE"
+echo "=========================================="
+
+# Test 1: Python environment
+# The container run is the `if` condition so `set -e` cannot abort before the failure branch prints diagnostics.
+echo -n "测试 1: Python 环境... "
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version || true
+    exit 1
+fi
+
+# 测试 2: DeepSpeed 版本
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
+if [ "$VERSION" = "0.18.5" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.5, 实际: $VERSION)"
+    exit 1
+fi
+
+# Test 3: core dependencies
+# The container run is the `if` condition so `set -e` cannot abort before the failure branch prints diagnostics.
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+if docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" || true
+    exit 1
+fi
+
+# Test 4: CUDA environment
+# The assignment itself is the `if` condition so a failing container reaches the failure branch under `set -e`.
+echo -n "测试 4: CUDA 环境... "
+if GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+"); then
+    echo "✓"
+    echo "         GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: basic DeepSpeed import
+# The container run is the `if` condition so `set -e` cannot abort first; on failure a short script is re-run with output visible.
+echo -n "测试 5: DeepSpeed 基础导入... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# 验证核心模块可用
+assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found'
+assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found'
+
+# 验证加速器
+accelerator = get_accelerator()
+assert accelerator is not None, 'Accelerator not available'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+print(f'DeepSpeed version: {deepspeed.__version__}')
+accelerator = get_accelerator()
+print(f'Accelerator: {accelerator}')
+" || true
+    exit 1
+fi
+
+# Test 6: simple CNN model + DeepSpeed initialization
+# The container run is the `if` condition so `set -e` cannot abort first; on failure a reduced model is re-run with output visible.
+echo -n "测试 6: 简单 CNN 模型初始化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 创建简单 CNN 模型（类似 CIFAR-10 示例）
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 4,
+    'gradient_accumulation_steps': 1,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+assert model_engine is not None, 'Model engine initialization failed'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.fc = nn.Linear(10, 2)
+    def forward(self, x):
+        return self.fc(x)
+
+model = SimpleNet()
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('Model initialized successfully')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# 测试 7: 训练步骤（前向+反向+优化）
+echo -n "测试 7: 训练步骤... "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 模拟训练步骤
+for step in range(3):
+    # 创建假数据 (batch_size=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(10, 2)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+inputs = torch.randn(4, 10).to(model_engine.device)
+labels = torch.randint(0, 2, (4,)).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = torch.nn.functional.cross_entropy(outputs, labels)
+model_engine.backward(loss)
+model_engine.step()
+print('Training step completed')
+"
+    exit 1
+fi
+
+# Test 8: ZeRO Stage 1 optimization
+# The container run is the `if` condition so `set -e` cannot abort first; on failure a reduced model is re-run with output visible.
+echo -n "测试 8: ZeRO Stage 1 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 1 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 1,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 简单训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 1}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 1 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 9: ZeRO Stage 2 optimization
+# The container run is the `if` condition so `set -e` cannot abort first; on failure a reduced model is re-run with output visible.
+echo -n "测试 9: ZeRO Stage 2 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 2 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'allgather_bucket_size': 50000000,
+        'reduce_bucket_size': 50000000,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 2}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 2 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 10: Conv2d + FP16 mixed-precision training
+# The container run is the `if` condition so `set -e` cannot abort first; on failure a reduced model is re-run with output visible.
+echo -n "测试 10: Conv2d + FP16 混合精度训练... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# CIFAR-10 风格的 CNN 模型
+class CifarNet(nn.Module):
+    def __init__(self):
+        super(CifarNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = CifarNet()
+
+# FP16 配置（参考 DeepSpeed CIFAR 示例）
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'loss_scale_window': 500,
+        'hysteresis': 2,
+        'min_loss_scale': 1,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型（关键！参考官方 CIFAR 示例）
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype = torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype = torch.half
+
+# 验证 FP16 已启用
+assert model_engine.fp16_enabled(), 'FP16 not enabled'
+assert target_dtype == torch.half, 'target_dtype should be torch.half'
+
+# 训练循环
+model_engine.train()
+
+for step in range(3):
+    # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 关键：手动转换输入数据类型为 FP16（参考官方示例）
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# 简化的 CNN 模型用于调试
+class SimpleConv(nn.Module):
+    def __init__(self):
+        super(SimpleConv, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.fc1 = nn.Linear(6 * 28 * 28, 10)
+    
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = x.view(-1, 6 * 28 * 28)
+        x = self.fc1(x)
+        return x
+
+model = SimpleConv()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    
+    # 获取目标数据类型
+    target_dtype = None
+    if model_engine.fp16_enabled():
+        target_dtype = torch.half
+    
+    model_engine.train()
+    
+    # 训练步骤
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+    
+    print(f'FP16 enabled: {model_engine.fp16_enabled()}')
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()
+" || true
+    exit 1
+fi
+
+# Every check passed: print a blank line followed by the closing banner.
+printf '\n%s\n' "=========================================="
+echo "所有测试通过！"
+echo "=========================================="
diff --git a/frameworks/deepspeed/0.18.6/Dockerfile b/frameworks/deepspeed/0.18.6/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..b93a2bfc6fc4f1f33ae53974f9140e1424c914a9
--- /dev/null
+++ b/frameworks/deepspeed/0.18.6/Dockerfile
@@ -0,0 +1,60 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc <anstarc@vip.qq.com>"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.6 GPU on OpenCloudOS 9"
+
+# System dependencies (sorted for diffability); package caches are removed in the same layer that created them.
+RUN dnf install -y \
+    git \
+    openmpi \
+    openmpi-devel \
+    python3.11 \
+    python3.11-pip \
+    wget \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf /var/cache/yum
+
+# Expose the OpenMPI and CUDA binaries/libraries; any LD_LIBRARY_PATH inherited from the base image is kept.
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory
+WORKDIR /app
+
+# Install PyTorch from the CUDA 12.8 wheel index
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Install the Python packages needed alongside DeepSpeed and for building it without build isolation
+RUN pip3.11 install --no-cache-dir \
+    transformers \
+    datasets \
+    evaluate \
+    ninja \
+    psutil \
+    deepspeed-kernels \
+    py-cpuinfo \
+    pydantic \
+    hjson \
+    mpi4py \
+    "setuptools<82" \
+    wheel
+
+# Select the CUDA accelerator for DeepSpeed
+ENV DS_ACCELERATOR=cuda
+
+# Build and install DeepSpeed against the already installed torch (no build isolation, no pip cache kept)
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir deepspeed==0.18.6 --no-build-isolation
+
+# GPU runtime environment variables
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV CUDA_MODULE_LOADING=LAZY
+
+# Default command: print the installed DeepSpeed version
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.6/README.md b/frameworks/deepspeed/0.18.6/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..09b10474155cd4da2b8e8a3c6a436dacc8594046
--- /dev/null
+++ b/frameworks/deepspeed/0.18.6/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.6 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库，提供 ZeRO 内存优化、混合精度训练、流水线并行等功能，支持超大规模模型训练。
+
+## 基本信息
+
+- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**: 3.11
+- **CUDA 版本**: 12.8
+- **DeepSpeed 版本**: 0.18.6
+- **PyTorch 版本**: 最新稳定版（CUDA 12.8）
+- **MPI**: OpenMPI（支持多节点训练）
+
+## 构建
+
+```bash
+docker build -t oc9-deepspeed:0.18.6 .
+```
+
+## 测试
+
+```bash
+./test.sh oc9-deepspeed:0.18.6
+```
+
+测试项包括：
+- Python 和 CUDA 环境
+- DeepSpeed 版本验证
+- GPU 可用性检查
+- DeepSpeed 初始化
+- CNN 模型训练（CIFAR-10 风格）
+- ZeRO Stage 1/2 优化
+- FP16 混合精度训练
+
+## 使用示例
+
+### 基本使用
+
+```bash
+# 查看版本信息
+docker run --rm oc9-deepspeed:0.18.6 python3.11 -c "import deepspeed; print(deepspeed.__version__)"
+
+# 查看可用设备
+docker run --rm --gpus all oc9-deepspeed:0.18.6 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+```
+
+### 单 GPU 训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 定义模型
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练循环（dataloader 与 criterion 需自行定义）
+for batch in dataloader:
+    inputs, labels = (t.to(model_engine.device) for t in batch)
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时将数据移动到模型所在设备，并转换输入数据类型（dataloader 与 criterion 需自行定义）
+for batch in dataloader:
+    inputs, labels = (t.to(model_engine.device) for t in batch)
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练（单节点）
+
+```bash
+# 使用 deepspeed launcher
+docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.6 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.6 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 train_micro_batch_size_per_gpu × gradient_accumulation_steps × GPU 数量（此配置对应 4 张 GPU）
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.6 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化，支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+，并安装 NVIDIA Container Toolkit（取代已弃用的 nvidia-docker2）
+- **MPI**: 已包含 OpenMPI（多节点训练需要）
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.6/build.conf b/frameworks/deepspeed/0.18.6/build.conf
new file mode 100644
index 0000000000000000000000000000000000000000..205926fb2bfdf39b9a3df1db7886ec9e79dd4623
--- /dev/null
+++ b/frameworks/deepspeed/0.18.6/build.conf
@@ -0,0 +1,4 @@
+# DeepSpeed 0.18.6 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.6
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.6/test.sh b/frameworks/deepspeed/0.18.6/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c2a69d60e0dc1853497fadaba8c8802ed8d9f4a3
--- /dev/null
+++ b/frameworks/deepspeed/0.18.6/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+set -e  # exit immediately when a command fails outside of a conditional
+
+IMAGE="${1:-oc9-deepspeed:0.18.6}"  # image under test; the first argument overrides the default tag
+
+echo "=========================================="  # opening banner
+echo "测试 DeepSpeed 0.18.6 GPU 镜像"
+echo "镜像: $IMAGE"
+echo "=========================================="
+
+# Test 1: Python environment
+# The container run is the `if` condition so `set -e` cannot abort before the failure branch prints diagnostics.
+echo -n "测试 1: Python 环境... "
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version || true
+    exit 1
+fi
+
+# 测试 2: DeepSpeed 版本
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
+if [ "$VERSION" = "0.18.6" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.6, 实际: $VERSION)"
+    exit 1
+fi
+
+# Test 3: core dependencies
+# The container run is the `if` condition so `set -e` cannot abort before the failure branch prints diagnostics.
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+if docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" || true
+    exit 1
+fi
+
+# Test 4: CUDA environment
+# The assignment itself is the `if` condition so a failing container reaches the failure branch under `set -e`.
+echo -n "测试 4: CUDA 环境... "
+if GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+"); then
+    echo "✓"
+    echo "         GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: basic DeepSpeed import
+# The container run is the `if` condition so `set -e` cannot abort first; on failure a short script is re-run with output visible.
+echo -n "测试 5: DeepSpeed 基础导入... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# 验证核心模块可用
+assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found'
+assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found'
+
+# 验证加速器
+accelerator = get_accelerator()
+assert accelerator is not None, 'Accelerator not available'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+print(f'DeepSpeed version: {deepspeed.__version__}')
+accelerator = get_accelerator()
+print(f'Accelerator: {accelerator}')
+" || true
+    exit 1
+fi
+
+# Test 6: simple CNN model + DeepSpeed initialization
+# The container run is the `if` condition so `set -e` cannot abort first; on failure a reduced model is re-run with output visible.
+echo -n "测试 6: 简单 CNN 模型初始化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 创建简单 CNN 模型（类似 CIFAR-10 示例）
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 4,
+    'gradient_accumulation_steps': 1,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+assert model_engine is not None, 'Model engine initialization failed'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.fc = nn.Linear(10, 2)
+    def forward(self, x):
+        return self.fc(x)
+
+model = SimpleNet()
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('Model initialized successfully')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# 测试 7: 训练步骤（前向+反向+优化）
+echo -n "测试 7: 训练步骤... "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 模拟训练步骤
+for step in range(3):
+    # 创建假数据 (batch_size=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(10, 2)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+inputs = torch.randn(4, 10).to(model_engine.device)
+labels = torch.randint(0, 2, (4,)).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = torch.nn.functional.cross_entropy(outputs, labels)
+model_engine.backward(loss)
+model_engine.step()
+print('Training step completed')
+"
+    exit 1
+fi
+
+# Test 8: ZeRO Stage 1 optimization
+# The container run is the `if` condition so `set -e` cannot abort first; on failure a reduced model is re-run with output visible.
+echo -n "测试 8: ZeRO Stage 1 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 1 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 1,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 简单训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 1}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 1 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 9: ZeRO Stage 2 optimization
+# The container run is the `if` condition so `set -e` cannot abort first; on failure a reduced model is re-run with output visible.
+echo -n "测试 9: ZeRO Stage 2 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 2 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'allgather_bucket_size': 50000000,
+        'reduce_bucket_size': 50000000,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 2}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 2 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 10: three FP16 mixed-precision training steps on a small Conv2d network.
+# The run is the `if` condition so a failure reaches the diagnostics below even under `set -e`.
+echo -n "测试 10: Conv2d + FP16 混合精度训练... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# CIFAR-10 风格的 CNN 模型
+class CifarNet(nn.Module):
+    def __init__(self):
+        super(CifarNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = CifarNet()
+
+# FP16 配置（参考 DeepSpeed CIFAR 示例）
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'loss_scale_window': 500,
+        'hysteresis': 2,
+        'min_loss_scale': 1,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型（关键！参考官方 CIFAR 示例）
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype = torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype = torch.half
+
+# 验证 FP16 已启用
+assert model_engine.fp16_enabled(), 'FP16 not enabled'
+assert target_dtype == torch.half, 'target_dtype should be torch.half'
+
+# 训练循环
+model_engine.train()
+
+for step in range(3):
+    # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 关键：手动转换输入数据类型为 FP16（参考官方示例）
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# 简化的 CNN 模型用于调试
+class SimpleConv(nn.Module):
+    def __init__(self):
+        super(SimpleConv, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.fc1 = nn.Linear(6 * 28 * 28, 10)
+    
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = x.view(-1, 6 * 28 * 28)
+        x = self.fc1(x)
+        return x
+
+model = SimpleConv()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    
+    # 获取目标数据类型
+    target_dtype = None
+    if model_engine.fp16_enabled():
+        target_dtype = torch.half
+    
+    model_engine.train()
+    
+    # 训练步骤
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+    
+    print(f'FP16 enabled: {model_engine.fp16_enabled()}')
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()
+" || true
+    exit 1
+fi
+
+echo ""
+echo "=========================================="
+echo "所有测试通过！"
+echo "=========================================="
diff --git a/frameworks/deepspeed/0.18.7/Dockerfile b/frameworks/deepspeed/0.18.7/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..feb3dac45e7ee8d918e06c3f6056d94a5a88f57d
--- /dev/null
+++ b/frameworks/deepspeed/0.18.7/Dockerfile
@@ -0,0 +1,60 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc <anstarc@vip.qq.com>"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.7 GPU on OpenCloudOS 9"
+
+# System dependencies (Python 3.11, git/wget, OpenMPI); dnf keeps its cache under /var/cache/dnf
+RUN dnf install -y \
+    python3.11 \
+    python3.11-pip \
+    git \
+    wget \
+    openmpi \
+    openmpi-devel \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf/*
+
+# Put OpenMPI and CUDA first on the search paths, keeping any LD_LIBRARY_PATH the base image already set
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory
+WORKDIR /app
+
+# Install PyTorch wheels from the CUDA 12.8 index
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Install the Python packages DeepSpeed is used with and needs at build time
+RUN pip3.11 install --no-cache-dir \
+    transformers \
+    datasets \
+    evaluate \
+    ninja \
+    psutil \
+    deepspeed-kernels \
+    py-cpuinfo \
+    pydantic \
+    hjson \
+    mpi4py \
+    "setuptools<82" \
+    wheel
+
+# Select the CUDA accelerator for DeepSpeed
+ENV DS_ACCELERATOR=cuda
+
+# Build and install DeepSpeed with DS_BUILD_OPS=1; --no-build-isolation makes pip build against the torch installed above
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir deepspeed==0.18.7 --no-build-isolation
+
+# GPU runtime environment
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV CUDA_MODULE_LOADING=LAZY
+
+# Default command: print the installed DeepSpeed version
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.7/README.md b/frameworks/deepspeed/0.18.7/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..060c83a0115b677465fd85714b091edb8bde646d
--- /dev/null
+++ b/frameworks/deepspeed/0.18.7/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.7 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库，提供 ZeRO 内存优化、混合精度训练、流水线并行等功能，支持超大规模模型训练。
+
+## 基本信息
+
+- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**: 3.11
+- **CUDA 版本**: 12.8
+- **DeepSpeed 版本**: 0.18.7
+- **PyTorch 版本**: 最新稳定版（CUDA 12.8）
+- **MPI**: OpenMPI（支持多节点训练）
+
+## 构建
+
+```bash
+docker build -t oc9-deepspeed:0.18.7 .
+```
+
+## 测试
+
+```bash
+./test.sh oc9-deepspeed:0.18.7
+```
+
+测试项包括：
+- Python 和 CUDA 环境
+- DeepSpeed 版本验证
+- GPU 可用性检查
+- DeepSpeed 初始化
+- CNN 模型训练（CIFAR-10 风格）
+- ZeRO Stage 1/2 优化
+- FP16 混合精度训练
+
+## 使用示例
+
+### 基本使用
+
+```bash
+# 查看版本信息
+docker run --rm oc9-deepspeed:0.18.7 python3.11 -c "import deepspeed; print(deepspeed.__version__)"
+
+# 查看可用设备
+docker run --rm --gpus all oc9-deepspeed:0.18.7 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+```
+
+### 单 GPU 训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 定义模型
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练循环
+for batch in dataloader:
+    inputs, labels = batch
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练（单节点）
+
+```bash
+# 使用 deepspeed launcher（镜像默认工作目录为 /app，因此用 -w 切换到挂载的 /workspace）
+docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.7 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.7 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 micro batch (4) × 梯度累积 (2) × GPU 数量，此处对应 4 张 GPU
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.7 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化，支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+，并安装 NVIDIA Container Toolkit（`--gpus` 参数依赖它；旧版 nvidia-docker2 亦可）
+- **MPI**: 已包含 OpenMPI（多节点训练需要）
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.7/build.conf b/frameworks/deepspeed/0.18.7/build.conf
new file mode 100644
index 0000000000000000000000000000000000000000..6cd5d09839deb85afa1a4e055223cb16ff7d5147
--- /dev/null
+++ b/frameworks/deepspeed/0.18.7/build.conf
@@ -0,0 +1,4 @@
+# DeepSpeed 0.18.7 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.7
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.7/test.sh b/frameworks/deepspeed/0.18.7/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..11a51778afc2118986952827044cf45a40110267
--- /dev/null
+++ b/frameworks/deepspeed/0.18.7/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+set -e
+
+IMAGE="${1:-oc9-deepspeed:0.18.7}"
+
+echo "=========================================="
+echo "测试 DeepSpeed 0.18.7 GPU 镜像"
+echo "镜像: $IMAGE"
+echo "=========================================="
+
+# Test 1: the Python 3.11 interpreter starts inside the image.
+# The run is the `if` condition so a failure reaches the diagnostics below even under `set -e`.
+echo -n "测试 1: Python 环境... "
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version || true
+    exit 1
+fi
+
+# Test 2: DeepSpeed version check; only the last stdout line is compared, and a failed run falls through to the mismatch branch.
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)" | tail -n 1)
+if [ "$VERSION" = "0.18.7" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.7, 实际: $VERSION)"
+    exit 1
+fi
+
+# Test 3: torch and transformers can be imported.
+# The run is the `if` condition so a failure reaches the diagnostics below even under `set -e`.
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+if docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" || true
+    exit 1
+fi
+
+# Test 4: CUDA is available to torch; the name of device 0 is captured for the report.
+# The assignment is the `if` condition so a failed run prints the failure mark instead of aborting under `set -e`.
+echo -n "测试 4: CUDA 环境... "
+if GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+"); then
+    echo "✓"
+    echo "         GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: DeepSpeed imports and exposes its entry points and an accelerator.
+# The run is the `if` condition so a failure reaches the diagnostics below even under `set -e`.
+echo -n "测试 5: DeepSpeed 基础导入... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# 验证核心模块可用
+assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found'
+assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found'
+
+# 验证加速器
+accelerator = get_accelerator()
+assert accelerator is not None, 'Accelerator not available'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+print(f'DeepSpeed version: {deepspeed.__version__}')
+accelerator = get_accelerator()
+print(f'Accelerator: {accelerator}')
+" || true
+    exit 1
+fi
+
+# Test 6: deepspeed.initialize succeeds for a small CNN with optimizer and scheduler config.
+# The run is the `if` condition so a failure reaches the diagnostics below even under `set -e`.
+echo -n "测试 6: 简单 CNN 模型初始化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 创建简单 CNN 模型（类似 CIFAR-10 示例）
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 4,
+    'gradient_accumulation_steps': 1,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+assert model_engine is not None, 'Model engine initialization failed'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.fc = nn.Linear(10, 2)
+    def forward(self, x):
+        return self.fc(x)
+
+model = SimpleNet()
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('Model initialized successfully')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 7: three forward/backward/optimizer steps on a small CNN with random data.
+# The run is the `if` condition so a failure reaches the diagnostics below even under `set -e`.
+echo -n "测试 7: 训练步骤... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 模拟训练步骤
+for step in range(3):
+    # 创建假数据 (batch_size=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(10, 2)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+inputs = torch.randn(4, 10).to(model_engine.device)
+labels = torch.randint(0, 2, (4,)).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = torch.nn.functional.cross_entropy(outputs, labels)
+model_engine.backward(loss)
+model_engine.step()
+print('Training step completed')
+" || true
+    exit 1
+fi
+
+# Test 8: one forward/backward/step with ZeRO stage 1.
+# The run is the `if` condition so a failure reaches the diagnostics below even under `set -e`.
+echo -n "测试 8: ZeRO Stage 1 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 1 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 1,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 简单训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 1}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 1 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 9: one forward/backward/step with ZeRO stage 2.
+# The run is the `if` condition so a failure reaches the diagnostics below even under `set -e`.
+echo -n "测试 9: ZeRO Stage 2 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 2 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'allgather_bucket_size': 50000000,
+        'reduce_bucket_size': 50000000,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 2}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 2 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+" || true
+    exit 1
+fi
+
+# Test 10: three FP16 mixed-precision training steps on a small Conv2d network.
+# The run is the `if` condition so a failure reaches the diagnostics below even under `set -e`.
+echo -n "测试 10: Conv2d + FP16 混合精度训练... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# CIFAR-10 风格的 CNN 模型
+class CifarNet(nn.Module):
+    def __init__(self):
+        super(CifarNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = CifarNet()
+
+# FP16 配置（参考 DeepSpeed CIFAR 示例）
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'loss_scale_window': 500,
+        'hysteresis': 2,
+        'min_loss_scale': 1,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型（关键！参考官方 CIFAR 示例）
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype = torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype = torch.half
+
+# 验证 FP16 已启用
+assert model_engine.fp16_enabled(), 'FP16 not enabled'
+assert target_dtype == torch.half, 'target_dtype should be torch.half'
+
+# 训练循环
+model_engine.train()
+
+for step in range(3):
+    # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 关键：手动转换输入数据类型为 FP16（参考官方示例）
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# 简化的 CNN 模型用于调试
+class SimpleConv(nn.Module):
+    def __init__(self):
+        super(SimpleConv, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.fc1 = nn.Linear(6 * 28 * 28, 10)
+    
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = x.view(-1, 6 * 28 * 28)
+        x = self.fc1(x)
+        return x
+
+model = SimpleConv()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    
+    # 获取目标数据类型
+    target_dtype = None
+    if model_engine.fp16_enabled():
+        target_dtype = torch.half
+    
+    model_engine.train()
+    
+    # 训练步骤
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+    
+    print(f'FP16 enabled: {model_engine.fp16_enabled()}')
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()
+" || true
+    exit 1
+fi
+
+echo ""
+echo "=========================================="
+echo "所有测试通过！"
+echo "=========================================="
diff --git a/frameworks/deepspeed/0.18.8/Dockerfile b/frameworks/deepspeed/0.18.8/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..78c36cb07f0904d4e567d8d0f48ca543785efb57
--- /dev/null
+++ b/frameworks/deepspeed/0.18.8/Dockerfile
@@ -0,0 +1,60 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc <anstarc@vip.qq.com>"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.8 GPU on OpenCloudOS 9"
+
+# System dependencies (Python 3.11, git/wget, OpenMPI); dnf keeps its cache under /var/cache/dnf
+RUN dnf install -y \
+    python3.11 \
+    python3.11-pip \
+    git \
+    wget \
+    openmpi \
+    openmpi-devel \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf/*
+
+# Put OpenMPI and CUDA first on the search paths, keeping any LD_LIBRARY_PATH the base image already set
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory
+WORKDIR /app
+
+# Install PyTorch wheels from the CUDA 12.8 index
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Install the Python packages DeepSpeed is used with and needs at build time
+RUN pip3.11 install --no-cache-dir \
+    transformers \
+    datasets \
+    evaluate \
+    ninja \
+    psutil \
+    deepspeed-kernels \
+    py-cpuinfo \
+    pydantic \
+    hjson \
+    mpi4py \
+    "setuptools<82" \
+    wheel
+
+# Select the CUDA accelerator for DeepSpeed
+ENV DS_ACCELERATOR=cuda
+
+# Build and install DeepSpeed with DS_BUILD_OPS=1; --no-build-isolation makes pip build against the torch installed above
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir deepspeed==0.18.8 --no-build-isolation
+
+# GPU runtime environment
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV CUDA_MODULE_LOADING=LAZY
+
+# Default command: print the installed DeepSpeed version
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.8/README.md b/frameworks/deepspeed/0.18.8/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bf1f696cfb5b5628bf5c5e6cf6990690ae8f272b
--- /dev/null
+++ b/frameworks/deepspeed/0.18.8/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.8 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库，提供 ZeRO 内存优化、混合精度训练、流水线并行等功能，支持超大规模模型训练。
+
+## 基本信息
+
+- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**: 3.11
+- **CUDA 版本**: 12.8
+- **DeepSpeed 版本**: 0.18.8
+- **PyTorch 版本**: 最新稳定版（CUDA 12.8）
+- **MPI**: OpenMPI（支持多节点训练）
+
+## 构建
+
+```bash
+docker build -t oc9-deepspeed:0.18.8 .
+```
+
+## 测试
+
+```bash
+./test.sh oc9-deepspeed:0.18.8
+```
+
+测试项包括：
+- Python 和 CUDA 环境
+- DeepSpeed 版本验证
+- GPU 可用性检查
+- DeepSpeed 初始化
+- CNN 模型训练（CIFAR-10 风格）
+- ZeRO Stage 1/2 优化
+- FP16 混合精度训练
+
+## 使用示例
+
+### 基本使用
+
+```bash
+# 查看版本信息
+docker run --rm oc9-deepspeed:0.18.8 python3.11 -c "import deepspeed; print(deepspeed.__version__)"
+
+# 查看可用设备
+docker run --rm --gpus all oc9-deepspeed:0.18.8 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+```
+
+### 单 GPU 训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 定义模型
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练循环
+for batch in dataloader:
+    inputs, labels = batch
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练（单节点）
+
+```bash
+# 使用 deepspeed launcher（镜像默认工作目录为 /app，因此用 -w 切换到挂载的 /workspace）
+docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.8 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.8 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 micro batch (4) × 梯度累积 (2) × GPU 数量，此处对应 4 张 GPU
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.8 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化，支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+，并安装 NVIDIA Container Toolkit（`--gpus` 参数依赖它；旧版 nvidia-docker2 亦可）
+- **MPI**: 已包含 OpenMPI（多节点训练需要）
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.8/build.conf b/frameworks/deepspeed/0.18.8/build.conf
new file mode 100644
index 0000000000000000000000000000000000000000..2426ade14acc147d3b5c4fa031aeb60cb9fb6baf
--- /dev/null
+++ b/frameworks/deepspeed/0.18.8/build.conf
@@ -0,0 +1,4 @@
+# DeepSpeed 0.18.8 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.8
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.8/test.sh b/frameworks/deepspeed/0.18.8/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ffce0af546c76445ef44053ab1ed45675ad20543
--- /dev/null
+++ b/frameworks/deepspeed/0.18.8/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+set -e
+
+IMAGE="${1:-oc9-deepspeed:0.18.8}"
+
+echo "=========================================="
+echo "测试 DeepSpeed 0.18.8 GPU 镜像"
+echo "镜像: $IMAGE"
+echo "=========================================="
+
+# Test 1: Python runtime
+echo -n "测试 1: Python 环境... "
+# Run inside `if`: under `set -e` a bare failing command would abort before the diagnostics.
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version
+    exit 1
+fi
+
+# 测试 2: DeepSpeed 版本
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
+if [ "$VERSION" = "0.18.8" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.8, 实际: $VERSION)"
+    exit 1
+fi
+
+# Test 3: core dependencies
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+# Run inside `if`: under `set -e` a bare failing command would abort before the diagnostics.
+if docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers"
+    exit 1
+fi
+
+# Test 4: CUDA runtime
+echo -n "测试 4: CUDA 环境... "
+# Capture inside `if`: a failing `$(...)` assignment would trip `set -e` before the else branch.
+if GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+"); then
+    echo "✓"
+    echo "         GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: basic DeepSpeed import
+echo -n "测试 5: DeepSpeed 基础导入... "
+# Run inside `if` so `set -e` cannot abort before the fallback diagnostics below.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# 验证核心模块可用
+assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found'
+assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found'
+
+# 验证加速器
+accelerator = get_accelerator()
+assert accelerator is not None, 'Accelerator not available'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+print(f'DeepSpeed version: {deepspeed.__version__}')
+accelerator = get_accelerator()
+print(f'Accelerator: {accelerator}')
+"
+    exit 1
+fi
+
+# Test 6: simple CNN model + DeepSpeed initialization
+echo -n "测试 6: 简单 CNN 模型初始化... "
+# Run inside `if` so `set -e` cannot abort before the fallback diagnostics below.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 创建简单 CNN 模型（类似 CIFAR-10 示例）
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 4,
+    'gradient_accumulation_steps': 1,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+assert model_engine is not None, 'Model engine initialization failed'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.fc = nn.Linear(10, 2)
+    def forward(self, x):
+        return self.fc(x)
+
+model = SimpleNet()
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('Model initialized successfully')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# 测试 7: 训练步骤（前向+反向+优化）
+echo -n "测试 7: 训练步骤... "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 模拟训练步骤
+for step in range(3):
+    # 创建假数据 (batch_size=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(10, 2)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+inputs = torch.randn(4, 10).to(model_engine.device)
+labels = torch.randint(0, 2, (4,)).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = torch.nn.functional.cross_entropy(outputs, labels)
+model_engine.backward(loss)
+model_engine.step()
+print('Training step completed')
+"
+    exit 1
+fi
+
+# Test 8: ZeRO Stage 1 optimization
+echo -n "测试 8: ZeRO Stage 1 优化... "
+# Run inside `if` so `set -e` cannot abort before the fallback diagnostics below.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 1 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 1,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 简单训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 1}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 1 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# Test 9: ZeRO Stage 2 optimization
+echo -n "测试 9: ZeRO Stage 2 优化... "
+# Run inside `if` so `set -e` cannot abort before the fallback diagnostics below.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 2 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'allgather_bucket_size': 50000000,
+        'reduce_bucket_size': 50000000,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 2}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 2 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# Test 10: Conv2d + FP16 mixed-precision training
+echo -n "测试 10: Conv2d + FP16 混合精度训练... "
+# Run inside `if` so `set -e` cannot abort before the fallback diagnostics below.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# CIFAR-10 风格的 CNN 模型
+class CifarNet(nn.Module):
+    def __init__(self):
+        super(CifarNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = CifarNet()
+
+# FP16 配置（参考 DeepSpeed CIFAR 示例）
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'loss_scale_window': 500,
+        'hysteresis': 2,
+        'min_loss_scale': 1,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型（关键！参考官方 CIFAR 示例）
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype = torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype = torch.half
+
+# 验证 FP16 已启用
+assert model_engine.fp16_enabled(), 'FP16 not enabled'
+assert target_dtype == torch.half, 'target_dtype should be torch.half'
+
+# 训练循环
+model_engine.train()
+
+for step in range(3):
+    # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 关键：手动转换输入数据类型为 FP16（参考官方示例）
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# 简化的 CNN 模型用于调试
+class SimpleConv(nn.Module):
+    def __init__(self):
+        super(SimpleConv, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.fc1 = nn.Linear(6 * 28 * 28, 10)
+    
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = x.view(-1, 6 * 28 * 28)
+        x = self.fc1(x)
+        return x
+
+model = SimpleConv()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    
+    # 获取目标数据类型
+    target_dtype = None
+    if model_engine.fp16_enabled():
+        target_dtype = torch.half
+    
+    model_engine.train()
+    
+    # 训练步骤
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+    
+    print(f'FP16 enabled: {model_engine.fp16_enabled()}')
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()
+"
+    exit 1
+fi
+
+echo ""
+echo "=========================================="
+echo "所有测试通过！"
+echo "=========================================="
diff --git a/frameworks/deepspeed/0.18.9/Dockerfile b/frameworks/deepspeed/0.18.9/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..1c3c2366146bb50cb9742b3aea3abed71aea3b6a
--- /dev/null
+++ b/frameworks/deepspeed/0.18.9/Dockerfile
@@ -0,0 +1,60 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc <anstarc@vip.qq.com>"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.9 GPU on OpenCloudOS 9"
+
+# System dependencies (Python 3.11, tooling, OpenMPI); package caches are dropped in the same layer
+RUN dnf install -y \
+    git \
+    openmpi \
+    openmpi-devel \
+    python3.11 \
+    python3.11-pip \
+    wget \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf/* /var/cache/yum/*
+
+# MPI / CUDA search paths; any LD_LIBRARY_PATH inherited from the base image is kept, not overwritten
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory
+WORKDIR /app
+
+# Install PyTorch (CUDA 12.8 wheels)
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Install DeepSpeed build/runtime dependencies
+RUN pip3.11 install --no-cache-dir \
+    transformers \
+    datasets \
+    evaluate \
+    ninja \
+    psutil \
+    deepspeed-kernels \
+    py-cpuinfo \
+    pydantic \
+    hjson \
+    mpi4py \
+    "setuptools<82" \
+    wheel
+
+# Select the CUDA accelerator for DeepSpeed
+ENV DS_ACCELERATOR=cuda
+
+# Build and install DeepSpeed against the already-installed torch; no pip cache is left in the layer
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir deepspeed==0.18.9 --no-build-isolation
+
+# GPU runtime environment
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV CUDA_MODULE_LOADING=LAZY
+
+# Default command
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.9/README.md b/frameworks/deepspeed/0.18.9/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0ef05da4acba0f16ebc879ede3d855ee687a46c5
--- /dev/null
+++ b/frameworks/deepspeed/0.18.9/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.9 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库，提供 ZeRO 内存优化、混合精度训练、流水线并行等功能，支持超大规模模型训练。
+
+## 基本信息
+
+- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**: 3.11
+- **CUDA 版本**: 12.8
+- **DeepSpeed 版本**: 0.18.9
+- **PyTorch 版本**: 最新稳定版（CUDA 12.8）
+- **MPI**: OpenMPI（支持多节点训练）
+
+## 构建
+
+```bash
+docker build -t oc9-deepspeed:0.18.9 .
+```
+
+## 测试
+
+```bash
+./test.sh oc9-deepspeed:0.18.9
+```
+
+测试项包括：
+- Python 和 CUDA 环境
+- DeepSpeed 版本验证
+- GPU 可用性检查
+- DeepSpeed 初始化
+- CNN 模型训练（CIFAR-10 风格）
+- ZeRO Stage 1/2 优化
+- FP16 混合精度训练
+
+## 使用示例
+
+### 基本使用
+
+```bash
+# 查看版本信息
+docker run --rm oc9-deepspeed:0.18.9 python3.11 -c "import deepspeed; print(deepspeed.__version__)"
+
+# 查看可用设备
+docker run --rm --gpus all oc9-deepspeed:0.18.9 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+```
+
+### 单 GPU 训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 定义模型
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练循环
+for batch in dataloader:
+    inputs, labels = batch
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+    
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练（单节点）
+
+```bash
+# 使用 deepspeed launcher
+docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.9 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.9 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 micro_batch(4) × 累积步数(2) × GPU 数(此处为 4)；GPU 数不同时需相应调整
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.9 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化，支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+，并安装 NVIDIA Container Toolkit（旧称 nvidia-docker2），以支持 `--gpus` 参数
+- **MPI**: 已包含 OpenMPI（多节点训练需要）
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.9/build.conf b/frameworks/deepspeed/0.18.9/build.conf
new file mode 100644
index 0000000000000000000000000000000000000000..e3f36364ef81ddac9c988fd53bf7de8c34b25814
--- /dev/null
+++ b/frameworks/deepspeed/0.18.9/build.conf
@@ -0,0 +1,4 @@
+# DeepSpeed 0.18.9 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.9
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.9/test.sh b/frameworks/deepspeed/0.18.9/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cd9ee2dc0d1beb4a3ae0cefd4cbeb0b92d005d60
--- /dev/null
+++ b/frameworks/deepspeed/0.18.9/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+set -e
+
+IMAGE="${1:-oc9-deepspeed:0.18.9}"
+
+echo "=========================================="
+echo "测试 DeepSpeed 0.18.9 GPU 镜像"
+echo "镜像: $IMAGE"
+echo "=========================================="
+
+# Test 1: Python runtime
+echo -n "测试 1: Python 环境... "
+# Run inside `if`: under `set -e` a bare failing command would abort before the diagnostics.
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version
+    exit 1
+fi
+
+# 测试 2: DeepSpeed 版本
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)")
+if [ "$VERSION" = "0.18.9" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.9, 实际: $VERSION)"
+    exit 1
+fi
+
+# Test 3: core dependencies
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+# Run inside `if`: under `set -e` a bare failing command would abort before the diagnostics.
+if docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers"
+    exit 1
+fi
+
+# Test 4: CUDA runtime
+echo -n "测试 4: CUDA 环境... "
+# Capture inside `if`: a failing `$(...)` assignment would trip `set -e` before the else branch.
+if GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+"); then
+    echo "✓"
+    echo "         GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: basic DeepSpeed import
+echo -n "测试 5: DeepSpeed 基础导入... "
+# Run inside `if` so `set -e` cannot abort before the fallback diagnostics below.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# 验证核心模块可用
+assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found'
+assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found'
+
+# 验证加速器
+accelerator = get_accelerator()
+assert accelerator is not None, 'Accelerator not available'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+print(f'DeepSpeed version: {deepspeed.__version__}')
+accelerator = get_accelerator()
+print(f'Accelerator: {accelerator}')
+"
+    exit 1
+fi
+
+# Test 6: simple CNN model + DeepSpeed initialization
+echo -n "测试 6: 简单 CNN 模型初始化... "
+# Run inside `if` so `set -e` cannot abort before the fallback diagnostics below.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 创建简单 CNN 模型（类似 CIFAR-10 示例）
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 4,
+    'gradient_accumulation_steps': 1,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+assert model_engine is not None, 'Model engine initialization failed'
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.fc = nn.Linear(10, 2)
+    def forward(self, x):
+        return self.fc(x)
+
+model = SimpleNet()
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('Model initialized successfully')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# 测试 7: 训练步骤（前向+反向+优化）
+echo -n "测试 7: 训练步骤... "
+docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+    
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 模拟训练步骤
+for step in range(3):
+    # 创建假数据 (batch_size=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+    
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    
+    # 反向传播
+    model_engine.backward(loss)
+    
+    # 优化器步骤
+    model_engine.step()
+" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(10, 2)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+inputs = torch.randn(4, 10).to(model_engine.device)
+labels = torch.randint(0, 2, (4,)).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = torch.nn.functional.cross_entropy(outputs, labels)
+model_engine.backward(loss)
+model_engine.step()
+print('Training step completed')
+"
+    exit 1
+fi
+
+# Test 8: ZeRO Stage 1 optimization
+echo -n "测试 8: ZeRO Stage 1 优化... "
+# Run inside `if` so `set -e` cannot abort before the fallback diagnostics below.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 1 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 1,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 简单训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 1}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 1 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# Test 9: ZeRO Stage 2 optimization
+echo -n "测试 9: ZeRO Stage 2 优化... "
+# Run inside `if` so `set -e` cannot abort before the fallback diagnostics below.
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 2 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'allgather_bucket_size': 50000000,
+        'reduce_bucket_size': 50000000,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 2}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 2 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# Test 10: Conv2d + FP16 mixed-precision training.
+# Trains a small CIFAR-10 style CNN for three steps with DeepSpeed fp16
+# enabled, a WarmupLR scheduler and gradient clipping. The container's
+# combined stdout/stderr is captured instead of being discarded, so that on
+# failure the log of the run that actually failed is shown. (A separate,
+# simplified diagnostic program would drop the scheduler, clipping and
+# assertions and therefore might not reproduce the error.)
+# The command is tested directly by `if` rather than through a later `$?`
+# check, which also keeps the failure branch reachable should the script
+# enable `set -e`.
+echo -n "测试 10: Conv2d + FP16 混合精度训练... "
+if test10_output=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# CIFAR-10 style CNN model
+class CifarNet(nn.Module):
+    def __init__(self):
+        super(CifarNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = CifarNet()
+
+# FP16 configuration (modelled on the DeepSpeed CIFAR example)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'loss_scale_window': 500,
+        'hysteresis': 2,
+        'min_loss_scale': 1,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# Determine the dtype the engine expects for inputs (as in the CIFAR example)
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype = torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype = torch.half
+
+# Verify that FP16 is really enabled
+assert model_engine.fp16_enabled(), 'FP16 not enabled'
+assert target_dtype == torch.half, 'target_dtype should be torch.half'
+
+# Training loop
+model_engine.train()
+
+for step in range(3):
+    # CIFAR-10 style input (batch=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+
+    # Manually cast the inputs to the target dtype (as in the CIFAR example)
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+
+    # Forward pass
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+
+    # Backward pass
+    model_engine.backward(loss)
+
+    # Optimizer step
+    model_engine.step()
+" 2>&1); then
+    echo "✓"
+else
+    echo "✗"
+    # Print the captured log of the failing run (including the Python
+    # traceback) so the real cause is visible.
+    printf '%s\n' "$test10_output" >&2
+    exit 1
+fi
+
+# Success banner: a blank line, a rule, the "all tests passed" message and a
+# closing rule. The failing branches of the tests visible above exit with
+# status 1 before execution gets here.
+printf '%s\n' \
+    "" \
+    "==========================================" \
+    "所有测试通过！" \
+    "=========================================="