From ac4ab046f2dd1b2410bfc4694839313fb7386565 Mon Sep 17 00:00:00 2001 From: Anstarc Date: Fri, 8 May 2026 13:59:07 +0800 Subject: [PATCH 1/6] feat: add DeepSpeed container images for OC9 --- frameworks/deepspeed/0.18.4/Dockerfile | 54 +++ frameworks/deepspeed/0.18.4/README.md | 199 ++++++++ frameworks/deepspeed/0.18.4/build.conf | 4 + frameworks/deepspeed/0.18.4/test.sh | 602 +++++++++++++++++++++++++ frameworks/deepspeed/0.18.5/Dockerfile | 54 +++ frameworks/deepspeed/0.18.5/README.md | 199 ++++++++ frameworks/deepspeed/0.18.5/build.conf | 4 + frameworks/deepspeed/0.18.5/test.sh | 602 +++++++++++++++++++++++++ frameworks/deepspeed/0.18.6/Dockerfile | 54 +++ frameworks/deepspeed/0.18.6/README.md | 199 ++++++++ frameworks/deepspeed/0.18.6/build.conf | 4 + frameworks/deepspeed/0.18.6/test.sh | 602 +++++++++++++++++++++++++ frameworks/deepspeed/0.18.7/Dockerfile | 54 +++ frameworks/deepspeed/0.18.7/README.md | 199 ++++++++ frameworks/deepspeed/0.18.7/build.conf | 4 + frameworks/deepspeed/0.18.7/test.sh | 602 +++++++++++++++++++++++++ frameworks/deepspeed/0.18.8/Dockerfile | 54 +++ frameworks/deepspeed/0.18.8/README.md | 199 ++++++++ frameworks/deepspeed/0.18.8/build.conf | 4 + frameworks/deepspeed/0.18.8/test.sh | 602 +++++++++++++++++++++++++ frameworks/deepspeed/0.18.9/Dockerfile | 54 +++ frameworks/deepspeed/0.18.9/README.md | 199 ++++++++ frameworks/deepspeed/0.18.9/build.conf | 4 + frameworks/deepspeed/0.18.9/test.sh | 602 +++++++++++++++++++++++++ 24 files changed, 5154 insertions(+) create mode 100644 frameworks/deepspeed/0.18.4/Dockerfile create mode 100644 frameworks/deepspeed/0.18.4/README.md create mode 100644 frameworks/deepspeed/0.18.4/build.conf create mode 100644 frameworks/deepspeed/0.18.4/test.sh create mode 100644 frameworks/deepspeed/0.18.5/Dockerfile create mode 100644 frameworks/deepspeed/0.18.5/README.md create mode 100644 frameworks/deepspeed/0.18.5/build.conf create mode 100644 frameworks/deepspeed/0.18.5/test.sh create mode 100644 
frameworks/deepspeed/0.18.6/Dockerfile
 create mode 100644 frameworks/deepspeed/0.18.6/README.md
 create mode 100644 frameworks/deepspeed/0.18.6/build.conf
 create mode 100644 frameworks/deepspeed/0.18.6/test.sh
 create mode 100644 frameworks/deepspeed/0.18.7/Dockerfile
 create mode 100644 frameworks/deepspeed/0.18.7/README.md
 create mode 100644 frameworks/deepspeed/0.18.7/build.conf
 create mode 100644 frameworks/deepspeed/0.18.7/test.sh
 create mode 100644 frameworks/deepspeed/0.18.8/Dockerfile
 create mode 100644 frameworks/deepspeed/0.18.8/README.md
 create mode 100644 frameworks/deepspeed/0.18.8/build.conf
 create mode 100644 frameworks/deepspeed/0.18.8/test.sh
 create mode 100644 frameworks/deepspeed/0.18.9/Dockerfile
 create mode 100644 frameworks/deepspeed/0.18.9/README.md
 create mode 100644 frameworks/deepspeed/0.18.9/build.conf
 create mode 100644 frameworks/deepspeed/0.18.9/test.sh

diff --git a/frameworks/deepspeed/0.18.4/Dockerfile b/frameworks/deepspeed/0.18.4/Dockerfile
new file mode 100644
index 0000000..b4d385f
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/Dockerfile
@@ -0,0 +1,54 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc "
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.4 GPU on OpenCloudOS 9"
+
+# Install system dependencies (Python 3.11, git/wget, OpenMPI); dnf caches under /var/cache/dnf, so purge that too
+RUN dnf install -y \
+    python3.11 \
+    python3.11-pip \
+    git \
+    wget \
+    openmpi \
+    openmpi-devel \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf/* /var/cache/yum/*
+
+# Put OpenMPI on the search paths; ':+' appends the old value only when it is set, so no empty (= cwd) entry is created
+ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
+
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory
+WORKDIR /app
+
+# Install PyTorch from the CUDA 12.8 wheel index
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Install DeepSpeed dependencies
+RUN pip3.11 install --no-cache-dir \
+    transformers \
+    datasets \
+    evaluate \
+    ninja \
+    psutil \
+    py-cpuinfo \
+    pydantic \
+    hjson \
+    mpi4py
+
+# Build and install DeepSpeed with its ops pre-compiled; --no-cache-dir keeps pip's download cache out of the layer
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir --no-build-isolation deepspeed==0.18.4
+
+# GPU runtime environment
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV CUDA_MODULE_LOADING=LAZY
+
+# Default command: report the installed DeepSpeed version
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.4/README.md b/frameworks/deepspeed/0.18.4/README.md
new file mode 100644
index 0000000..9c6594f
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.4 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库,提供 ZeRO 内存优化、混合精度训练、流水线并行等功能,支持超大规模模型训练。
+
+## 基本信息
+
+- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**: 3.11
+- **CUDA 版本**: 12.8
+- **DeepSpeed 版本**: 0.18.4
+- **PyTorch 版本**: 最新稳定版(CUDA 12.8)
+- **MPI**: OpenMPI(支持多节点训练)
+
+## 构建
+
+```bash
+docker build -t oc9-deepspeed:0.18.4 .
+```
+
+## 测试
+
+```bash
+./test.sh oc9-deepspeed:0.18.4
+```
+
+测试项包括:
+- Python 和 CUDA 环境
+- DeepSpeed 版本验证
+- GPU 可用性检查
+- DeepSpeed 初始化
+- CNN 模型训练(CIFAR-10 风格)
+- ZeRO Stage 1/2 优化
+- FP16 混合精度训练
+
+## 使用示例
+
+### 基本使用
+
+```bash
+# 查看版本信息
+docker run --rm oc9-deepspeed:0.18.4 python3.11 -c "import deepspeed; print(deepspeed.__version__)"
+
+# 查看可用设备
+docker run --rm --gpus all oc9-deepspeed:0.18.4 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+```
+
+### 单 GPU 训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 定义模型
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练循环
+for batch in dataloader:
+    inputs, labels = batch
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练(单节点)
+
+```bash
+# 使用 deepspeed launcher(镜像默认工作目录为 /app,需用 -w 切换到挂载的 /workspace)
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.4 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.4 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 4 (micro batch) * 2 (梯度累积) * GPU 数,此处对应 4 张 GPU
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.4 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化,支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- 
**GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+ with nvidia-docker2
+- **MPI**: 已包含 OpenMPI(多节点训练需要)
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.4/build.conf b/frameworks/deepspeed/0.18.4/build.conf
new file mode 100644
index 0000000..2c23f5b
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/build.conf
@@ -0,0 +1,4 @@
+# DeepSpeed 0.18.4 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.4
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.4/test.sh b/frameworks/deepspeed/0.18.4/test.sh
new file mode 100755
index 0000000..a6f7c81
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+set +e  # errexit stays off: every test checks its own status so the failure diagnostics below can actually run
+
+IMAGE="${1:-oc9-deepspeed:0.18.4}"
+
+echo "=========================================="
+echo "测试 DeepSpeed 0.18.4 GPU 镜像"
+echo "镜像: $IMAGE"
+echo "=========================================="
+
+# Test 1: the Python 3.11 interpreter starts
+echo -n "测试 1: Python 环境... "
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1
+then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version
+    exit 1
+fi
+
+# Test 2: DeepSpeed version; keep only the last stdout line in case the import logs to stdout (TODO confirm)
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)" | tail -n 1)
+if [ "$VERSION" = "0.18.4" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.4, 实际: $VERSION)"
+    exit 1
+fi
+
+# Test 3: core dependencies import cleanly
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+if docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1
+then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers"
+    exit 1
+fi
+
+# Test 4: CUDA is visible inside the container; the assignment's status is that of the docker run
+echo -n "测试 4: CUDA 环境... "
+if GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+")
+then
+    echo "✓"
+    echo " GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: DeepSpeed basic import and accelerator lookup
+echo -n "测试 5: DeepSpeed 基础导入... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# 验证核心模块可用
+assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found'
+assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found'
+
+# 验证加速器
+accelerator = get_accelerator()
+assert accelerator is not None, 'Accelerator not available'
+" > /dev/null 2>&1
+then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+print(f'DeepSpeed version: {deepspeed.__version__}')
+accelerator = get_accelerator()
+print(f'Accelerator: {accelerator}')
+"
+    exit 1
+fi
+
+# Test 6: deepspeed.initialize() on a small CNN
+echo -n "测试 6: 简单 CNN 模型初始化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+# 创建简单 CNN 模型(类似 CIFAR-10 示例)
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+# DeepSpeed 配置
+ds_config = {
+    'train_batch_size': 4,
+    'gradient_accumulation_steps': 1,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+# 初始化 DeepSpeed
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+assert model_engine is not None, 'Model engine initialization failed'
+" > /dev/null 2>&1
+then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.fc = nn.Linear(10, 2)
+    def forward(self, x):
+        return self.fc(x)
+
+model = SimpleNet()
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('Model initialized successfully')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# Test 7: training steps (forward + backward + optimizer step)
+echo -n "测试 7: 训练步骤... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = SimpleNet()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 模拟训练步骤
+for step in range(3):
+    # 创建假数据 (batch_size=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+
+    # 反向传播
+    model_engine.backward(loss)
+
+    # 优化器步骤
+    model_engine.step()
+" > /dev/null 2>&1
+then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(10, 2)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 0}
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+inputs = torch.randn(4, 10).to(model_engine.device)
+labels = torch.randint(0, 2, (4,)).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = torch.nn.functional.cross_entropy(outputs, labels)
+model_engine.backward(loss)
+model_engine.step()
+print('Training step completed')
+"
+    exit 1
+fi
+
+# Test 8: ZeRO Stage 1
+echo -n "测试 8: ZeRO Stage 1 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 1 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 1,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 简单训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1
+then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 1}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 1 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# Test 9: ZeRO Stage 2
+echo -n "测试 9: ZeRO Stage 2 优化... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Sequential(
+    nn.Linear(100, 50),
+    nn.ReLU(),
+    nn.Linear(50, 10)
+)
+
+# ZeRO Stage 2 配置
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'allgather_bucket_size': 50000000,
+        'reduce_bucket_size': 50000000,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 训练步骤
+inputs = torch.randn(4, 100).to(model_engine.device)
+outputs = model_engine(inputs)
+loss = outputs.sum()
+model_engine.backward(loss)
+model_engine.step()
+" > /dev/null 2>&1
+then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'zero_optimization': {'stage': 2}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+    print('ZeRO Stage 2 initialized')
+except Exception as e:
+    print(f'Error: {e}')
+"
+    exit 1
+fi
+
+# Test 10: Conv2d + FP16 mixed-precision training
+echo -n "测试 10: Conv2d + FP16 混合精度训练... "
+if docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# CIFAR-10 风格的 CNN 模型
+class CifarNet(nn.Module):
+    def __init__(self):
+        super(CifarNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = CifarNet()
+
+# FP16 配置(参考 DeepSpeed CIFAR 示例)
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.8, 0.999],
+            'eps': 1e-8,
+            'weight_decay': 3e-7
+        }
+    },
+    'scheduler': {
+        'type': 'WarmupLR',
+        'params': {
+            'warmup_min_lr': 0,
+            'warmup_max_lr': 0.001,
+            'warmup_num_steps': 100
+        }
+    },
+    'gradient_clipping': 1.0,
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'loss_scale_window': 500,
+        'hysteresis': 2,
+        'min_loss_scale': 1,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {
+        'stage': 0
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型(关键!参考官方 CIFAR 示例)
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype = torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype = torch.half
+
+# 验证 FP16 已启用
+assert model_engine.fp16_enabled(), 'FP16 not enabled'
+assert target_dtype == torch.half, 'target_dtype should be torch.half'
+
+# 训练循环
+model_engine.train()
+
+for step in range(3):
+    # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32)
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = torch.randint(0, 10, (4,)).to(model_engine.device)
+
+    # 关键:手动转换输入数据类型为 FP16(参考官方示例)
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+
+    # 前向传播
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+
+    # 反向传播
+    model_engine.backward(loss)
+
+    # 优化器步骤
+    model_engine.step()
+
+" > /dev/null 2>&1
+then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import deepspeed
+
+# 简化的 CNN 模型用于调试
+class SimpleConv(nn.Module):
+    def __init__(self):
+        super(SimpleConv, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.fc1 = nn.Linear(6 * 28 * 28, 10)
+
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = x.view(-1, 6 * 28 * 28)
+        x = self.fc1(x)
+        return x
+
+model = SimpleConv()
+
+ds_config = {
+    'train_batch_size': 4,
+    'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}},
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    },
+    'zero_optimization': {'stage': 0}
+}
+
+try:
+    model_engine, optimizer, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config
+    )
+
+    # 获取目标数据类型
+    target_dtype = None
+    if model_engine.fp16_enabled():
+        target_dtype = torch.half
+
+    model_engine.train()
+
+    # 训练步骤
+    inputs = torch.randn(4, 3, 32, 32).to(model_engine.device)
+    labels = 
torch.randint(0, 10, (4,)).to(model_engine.device)
+
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+
+    print(f'FP16 enabled: {model_engine.fp16_enabled()}')
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()
+"
+    exit 1
+fi
+
+echo ""
+echo "=========================================="
+echo "所有测试通过!"
+echo "=========================================="
diff --git a/frameworks/deepspeed/0.18.5/Dockerfile b/frameworks/deepspeed/0.18.5/Dockerfile
new file mode 100644
index 0000000..a215d56
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/Dockerfile
@@ -0,0 +1,54 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc "
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.5 GPU on OpenCloudOS 9"
+
+# Install system dependencies (Python 3.11, git/wget, OpenMPI); dnf caches under /var/cache/dnf, so purge that too
+RUN dnf install -y \
+    python3.11 \
+    python3.11-pip \
+    git \
+    wget \
+    openmpi \
+    openmpi-devel \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf/* /var/cache/yum/*
+
+# Put OpenMPI on the search paths; ':+' appends the old value only when it is set, so no empty (= cwd) entry is created
+ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
+
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Working directory
+WORKDIR /app
+
+# Install PyTorch from the CUDA 12.8 wheel index
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Install DeepSpeed dependencies
+RUN pip3.11 install --no-cache-dir \
+    transformers \
+    datasets \
+    evaluate \
+    ninja \
+    psutil \
+    py-cpuinfo \
+    pydantic \
+    hjson \
+    mpi4py
+
+# Build and install DeepSpeed with its ops pre-compiled; --no-cache-dir keeps pip's download cache out of the layer
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir --no-build-isolation deepspeed==0.18.5
+
+# GPU runtime environment
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV CUDA_MODULE_LOADING=LAZY
+
+# Default command: report the installed DeepSpeed version
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git 
a/frameworks/deepspeed/0.18.5/README.md b/frameworks/deepspeed/0.18.5/README.md new file mode 100644 index 0000000..5a4760c --- /dev/null +++ b/frameworks/deepspeed/0.18.5/README.md @@ -0,0 +1,199 @@ +# DeepSpeed 0.18.5 GPU on OpenCloudOS 9 + +DeepSpeed 是微软开源的深度学习优化库,提供 ZeRO 内存优化、混合精度训练、流水线并行等功能,支持超大规模模型训练。 + +## 基本信息 + +- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8 +- **Python 版本**: 3.11 +- **CUDA 版本**: 12.8 +- **DeepSpeed 版本**: 0.18.5 +- **PyTorch 版本**: 最新稳定版(CUDA 12.8) +- **MPI**: OpenMPI(支持多节点训练) + +## 构建 + +```bash +docker build -t oc9-deepspeed:0.18.5 . +``` + +## 测试 + +```bash +./test.sh oc9-deepspeed:0.18.5 +``` + +测试项包括: +- Python 和 CUDA 环境 +- DeepSpeed 版本验证 +- GPU 可用性检查 +- DeepSpeed 初始化 +- CNN 模型训练(CIFAR-10 风格) +- ZeRO Stage 1/2 优化 +- FP16 混合精度训练 + +## 使用示例 + +### 基本使用 + +```bash +# 查看版本信息 +docker run --rm oc9-deepspeed:0.18.5 python3.11 -c "import deepspeed; print(deepspeed.__version__)" + +# 查看可用设备 +docker run --rm --gpus all oc9-deepspeed:0.18.5 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" +``` + +### 单 GPU 训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +# 定义模型 +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练循环 +for batch in dataloader: + inputs, labels = batch + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### FP16 混合精度训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) + +# FP16 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'fp16': 
{
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练(单节点)
+
+```bash
+# 使用 deepspeed launcher(镜像默认工作目录为 /app,需用 -w 切换到挂载的 /workspace)
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.5 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.5 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 4 (micro batch) * 2 (梯度累积) * GPU 数,此处对应 4 张 GPU
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.5 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化,支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+ with nvidia-docker2
+- **MPI**: 已包含 OpenMPI(多节点训练需要)
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed 
Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.5/build.conf b/frameworks/deepspeed/0.18.5/build.conf
new file mode 100644
index 0000000..c52c734
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/build.conf
@@ -0,0 +1,4 @@
+# DeepSpeed 0.18.5 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.5
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.5/test.sh b/frameworks/deepspeed/0.18.5/test.sh
new file mode 100755
index 0000000..b2cde17
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/test.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+set +e  # errexit stays off: every test checks its own status so the failure diagnostics below can actually run
+
+IMAGE="${1:-oc9-deepspeed:0.18.5}"
+
+echo "=========================================="
+echo "测试 DeepSpeed 0.18.5 GPU 镜像"
+echo "镜像: $IMAGE"
+echo "=========================================="
+
+# Test 1: the Python 3.11 interpreter starts
+echo -n "测试 1: Python 环境... "
+if docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1
+then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version
+    exit 1
+fi
+
+# Test 2: DeepSpeed version; keep only the last stdout line in case the import logs to stdout (TODO confirm)
+echo -n "测试 2: DeepSpeed 版本... "
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)" | tail -n 1)
+if [ "$VERSION" = "0.18.5" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.5, 实际: $VERSION)"
+    exit 1
+fi
+
+# Test 3: core dependencies import cleanly
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+if docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1
+then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers"
+    exit 1
+fi
+
+# Test 4: CUDA is visible inside the container
+echo -n "测试 4: CUDA 环境... "
+GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+")
+if [ $? 
-eq 0 ]; then + echo "✓" + echo " GPU: $GPU_INFO" +else + echo "✗" + exit 1 +fi + +# 测试 5: DeepSpeed 基础导入 +echo -n "测试 5: DeepSpeed 基础导入... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator + +# 验证核心模块可用 +assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found' +assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found' + +# 验证加速器 +accelerator = get_accelerator() +assert accelerator is not None, 'Accelerator not available' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator +print(f'DeepSpeed version: {deepspeed.__version__}') +accelerator = get_accelerator() +print(f'Accelerator: {accelerator}') +" + exit 1 +fi + +# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 +echo -n "测试 6: 简单 CNN 模型初始化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +# 创建简单 CNN 模型(类似 CIFAR-10 示例) +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(torch.relu(self.conv1(x))) + x = self.pool(torch.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = torch.relu(self.fc1(x)) + x = torch.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 4, + 'gradient_accumulation_steps': 1, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + 
}, + 'gradient_clipping': 1.0, + 'zero_optimization': { + 'stage': 0 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +assert model_engine is not None, 'Model engine initialization failed' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = nn.Linear(10, 2) + def forward(self, x): + return self.fc(x) + +model = SimpleNet() +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('Model initialized successfully') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 7: 训练步骤(前向+反向+优化) +echo -n "测试 7: 训练步骤... 
" +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 模拟训练步骤 +for step in range(3): + # 创建假数据 (batch_size=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(10, 2) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +inputs = torch.randn(4, 10).to(model_engine.device) +labels = torch.randint(0, 2, (4,)).to(model_engine.device) +outputs = model_engine(inputs) +loss = torch.nn.functional.cross_entropy(outputs, labels) +model_engine.backward(loss) +model_engine.step() +print('Training step completed') +" + exit 1 +fi + +# 测试 8: ZeRO Stage 1 优化 +echo -n "测试 8: ZeRO Stage 1 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 1 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 1, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 简单训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 1} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 1 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 9: ZeRO Stage 2 优化 +echo -n "测试 9: ZeRO Stage 2 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 2 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'allgather_bucket_size': 50000000, + 'reduce_bucket_size': 50000000, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 2} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 2 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 10: Conv2d + FP16 混合精度训练 +echo -n "测试 10: Conv2d + FP16 混合精度训练... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# CIFAR-10 风格的 CNN 模型 +class CifarNet(nn.Module): + def __init__(self): + super(CifarNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = CifarNet() + +# FP16 配置(参考 DeepSpeed CIFAR 示例) +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'loss_scale_window': 500, + 'hysteresis': 2, + 'min_loss_scale': 1, + 'initial_scale_power': 15 + }, + 'zero_optimization': { + 'stage': 0 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + 
model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型(关键!参考官方 CIFAR 示例) +target_dtype = None +if model_engine.bfloat16_enabled(): + target_dtype = torch.bfloat16 +elif model_engine.fp16_enabled(): + target_dtype = torch.half + +# 验证 FP16 已启用 +assert model_engine.fp16_enabled(), 'FP16 not enabled' +assert target_dtype == torch.half, 'target_dtype should be torch.half' + +# 训练循环 +model_engine.train() + +for step in range(3): + # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 关键:手动转换输入数据类型为 FP16(参考官方示例) + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() + +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# 简化的 CNN 模型用于调试 +class SimpleConv(nn.Module): + def __init__(self): + super(SimpleConv, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.fc1 = nn.Linear(6 * 28 * 28, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = x.view(-1, 6 * 28 * 28) + x = self.fc1(x) + return x + +model = SimpleConv() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + }, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + + # 获取目标数据类型 + target_dtype = None + if model_engine.fp16_enabled(): + target_dtype = torch.half + + model_engine.train() + + # 训练步骤 + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = 
torch.randint(0, 10, (4,)).to(model_engine.device) + + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + model_engine.backward(loss) + model_engine.step() + + print(f'FP16 enabled: {model_engine.fp16_enabled()}') +except Exception as e: + print(f'Error: {e}') + import traceback + traceback.print_exc() +" + exit 1 +fi + +echo "" +echo "==========================================" +echo "所有测试通过!" +echo "==========================================" diff --git a/frameworks/deepspeed/0.18.6/Dockerfile b/frameworks/deepspeed/0.18.6/Dockerfile new file mode 100644 index 0000000..8905449 --- /dev/null +++ b/frameworks/deepspeed/0.18.6/Dockerfile @@ -0,0 +1,54 @@ +FROM opencloudos/opencloudos9-cuda-devel:12.8 + +LABEL maintainer="Anstarc " +LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" +LABEL org.opencontainers.image.description="DeepSpeed 0.18.6 GPU on OpenCloudOS 9" + +# Install system dependencies; the dnf cache is removed in the same layer so it never reaches the image +RUN dnf install -y \ + python3.11 \ + python3.11-pip \ + git \ + wget \ + openmpi \ + openmpi-devel \ + && dnf clean all \ + && rm -rf /var/cache/dnf/* + +# Put the OpenMPI binaries and libraries on the search paths +ENV PATH=/usr/lib64/openmpi/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH +# Expose python3.11 as "python"; -f keeps this step from failing if the link already exists +RUN ln -sf /usr/bin/python3.11 /usr/bin/python + +# Set the working directory +WORKDIR /app + +# Install PyTorch (CUDA 12.8 wheels) +RUN pip3.11 install --no-cache-dir \ + torch torchvision \ + --index-url https://download.pytorch.org/whl/cu128 + +# Install DeepSpeed dependencies +RUN pip3.11 install --no-cache-dir \ + transformers \ + datasets \ + evaluate \ + ninja \ + psutil \ + py-cpuinfo \ + pydantic \ + hjson \ + mpi4py + +# Compile and install DeepSpeed; --no-cache-dir matches the other pip steps and keeps pip's cache out of the layer +RUN DS_BUILD_OPS=1 \ + MAX_JOBS=8 \ + pip3.11 install --no-cache-dir deepspeed==0.18.6 --no-build-isolation + +# Set GPU environment variables +ENV NVIDIA_VISIBLE_DEVICES=all +ENV CUDA_MODULE_LOADING=LAZY + +# Default command +CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"] diff --git 
a/frameworks/deepspeed/0.18.6/README.md b/frameworks/deepspeed/0.18.6/README.md new file mode 100644 index 0000000..09b1047 --- /dev/null +++ b/frameworks/deepspeed/0.18.6/README.md @@ -0,0 +1,199 @@ +# DeepSpeed 0.18.6 GPU on OpenCloudOS 9 + +DeepSpeed 是微软开源的深度学习优化库,提供 ZeRO 内存优化、混合精度训练、流水线并行等功能,支持超大规模模型训练。 + +## 基本信息 + +- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8 +- **Python 版本**: 3.11 +- **CUDA 版本**: 12.8 +- **DeepSpeed 版本**: 0.18.6 +- **PyTorch 版本**: 最新稳定版(CUDA 12.8) +- **MPI**: OpenMPI(支持多节点训练) + +## 构建 + +```bash +docker build -t oc9-deepspeed:0.18.6 . +``` + +## 测试 + +```bash +./test.sh oc9-deepspeed:0.18.6 +``` + +测试项包括: +- Python 和 CUDA 环境 +- DeepSpeed 版本验证 +- GPU 可用性检查 +- DeepSpeed 初始化 +- CNN 模型训练(CIFAR-10 风格) +- ZeRO Stage 1/2 优化 +- FP16 混合精度训练 + +## 使用示例 + +### 基本使用 + +```bash +# 查看版本信息 +docker run --rm oc9-deepspeed:0.18.6 python3.11 -c "import deepspeed; print(deepspeed.__version__)" + +# 查看可用设备 +docker run --rm --gpus all oc9-deepspeed:0.18.6 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" +``` + +### 单 GPU 训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +# 定义模型 +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练循环 +for batch in dataloader: + inputs, labels = batch + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### FP16 混合精度训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) + +# FP16 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'fp16': 
{ + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型 +target_dtype = torch.half if model_engine.fp16_enabled() else None + +# 训练时转换输入数据类型 +for batch in dataloader: + inputs, labels = batch + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### 多 GPU 训练(单节点) + +```bash +# 使用 deepspeed launcher(镜像默认工作目录为 /app,需用 -w 切换到挂载的代码目录) +docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.6 \ + deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json + +# 或使用 torchrun +docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.6 \ + torchrun --nproc_per_node=2 train.py +``` + +### ZeRO Stage 2 优化 + +```python +ds_config = { + 'train_batch_size': 32, # 必须等于 micro_batch(4) × 梯度累积(2) × GPU 数,此配置对应 4 张 GPU + 'train_micro_batch_size_per_gpu': 4, + 'gradient_accumulation_steps': 2, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.9, 0.999], + 'eps': 1e-8 + } + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} +``` + +### 交互式使用 + +```bash +docker run --rm -it --gpus all oc9-deepspeed:0.18.6 bash +``` + +## 支持的功能 + +- **ZeRO 优化**: Stage 1/2/3 内存优化,支持超大模型训练 +- **混合精度**: FP16, BF16 自动混合精度训练 +- **梯度累积**: 模拟大 batch size 训练 +- **流水线并行**: 模型层间并行 +- **张量并行**: 层内并行 +- **CPU Offload**: 将优化器状态卸载到 CPU +- **梯度检查点**: 减少显存占用 +- **分布式训练**: 单机多卡、多机多卡 + +## 系统要求 + +- **GPU**: NVIDIA GPU with CUDA 12.x support +- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage) +- **Docker**: 19.03+ with NVIDIA Container Toolkit(原 nvidia-docker2) +- **MPI**: 已包含 OpenMPI(多节点训练需要) + +## 参考资源 + +- [DeepSpeed 官方文档](https://www.deepspeed.ai/) +- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed) +- [DeepSpeed 
Examples](https://github.com/microsoft/DeepSpeedExamples) +- [ZeRO 论文](https://arxiv.org/abs/1910.02054) diff --git a/frameworks/deepspeed/0.18.6/build.conf b/frameworks/deepspeed/0.18.6/build.conf new file mode 100644 index 0000000..205926f --- /dev/null +++ b/frameworks/deepspeed/0.18.6/build.conf @@ -0,0 +1,4 @@ +# DeepSpeed 0.18.6 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-deepspeed +IMAGE_TAG=0.18.6 +GPU_TEST=true diff --git a/frameworks/deepspeed/0.18.6/test.sh b/frameworks/deepspeed/0.18.6/test.sh new file mode 100644 index 0000000..c2a69d6 --- /dev/null +++ b/frameworks/deepspeed/0.18.6/test.sh @@ -0,0 +1,602 @@ +#!/bin/bash +# No 'set -e': each step checks $? itself so a failed command can be re-run with its output visible + +IMAGE="${1:-oc9-deepspeed:0.18.6}" + +echo "==========================================" +echo "测试 DeepSpeed 0.18.6 GPU 镜像" +echo "镜像: $IMAGE" +echo "==========================================" + +# Test 1: Python environment +echo -n "测试 1: Python 环境... " +docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 --version + exit 1 +fi + +# Test 2: DeepSpeed version +echo -n "测试 2: DeepSpeed 版本... " +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") +if [ "$VERSION" = "0.18.6" ]; then + echo "✓ (版本: $VERSION)" +else + echo "✗ (期望: 0.18.6, 实际: $VERSION)" + exit 1 +fi + +# Test 3: core dependencies +echo -n "测试 3: 核心依赖 (torch, transformers)... " +docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" + exit 1 +fi + +# Test 4: CUDA environment +echo -n "测试 4: CUDA 环境... " +GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +assert torch.cuda.is_available(), 'CUDA not available' +print(torch.cuda.get_device_name(0)) +") +if [ $? 
-eq 0 ]; then + echo "✓" + echo " GPU: $GPU_INFO" +else + echo "✗" + exit 1 +fi + +# 测试 5: DeepSpeed 基础导入 +echo -n "测试 5: DeepSpeed 基础导入... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator + +# 验证核心模块可用 +assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found' +assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found' + +# 验证加速器 +accelerator = get_accelerator() +assert accelerator is not None, 'Accelerator not available' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator +print(f'DeepSpeed version: {deepspeed.__version__}') +accelerator = get_accelerator() +print(f'Accelerator: {accelerator}') +" + exit 1 +fi + +# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 +echo -n "测试 6: 简单 CNN 模型初始化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +# 创建简单 CNN 模型(类似 CIFAR-10 示例) +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(torch.relu(self.conv1(x))) + x = self.pool(torch.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = torch.relu(self.fc1(x)) + x = torch.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 4, + 'gradient_accumulation_steps': 1, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + 
}, + 'gradient_clipping': 1.0, + 'zero_optimization': { + 'stage': 0 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +assert model_engine is not None, 'Model engine initialization failed' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = nn.Linear(10, 2) + def forward(self, x): + return self.fc(x) + +model = SimpleNet() +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('Model initialized successfully') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 7: 训练步骤(前向+反向+优化) +echo -n "测试 7: 训练步骤... 
" +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 模拟训练步骤 +for step in range(3): + # 创建假数据 (batch_size=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(10, 2) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +inputs = torch.randn(4, 10).to(model_engine.device) +labels = torch.randint(0, 2, (4,)).to(model_engine.device) +outputs = model_engine(inputs) +loss = torch.nn.functional.cross_entropy(outputs, labels) +model_engine.backward(loss) +model_engine.step() +print('Training step completed') +" + exit 1 +fi + +# 测试 8: ZeRO Stage 1 优化 +echo -n "测试 8: ZeRO Stage 1 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 1 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 1, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 简单训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 1} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 1 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 9: ZeRO Stage 2 优化 +echo -n "测试 9: ZeRO Stage 2 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 2 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'allgather_bucket_size': 50000000, + 'reduce_bucket_size': 50000000, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 2} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 2 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 10: Conv2d + FP16 混合精度训练 +echo -n "测试 10: Conv2d + FP16 混合精度训练... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# CIFAR-10 风格的 CNN 模型 +class CifarNet(nn.Module): + def __init__(self): + super(CifarNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = CifarNet() + +# FP16 配置(参考 DeepSpeed CIFAR 示例) +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'loss_scale_window': 500, + 'hysteresis': 2, + 'min_loss_scale': 1, + 'initial_scale_power': 15 + }, + 'zero_optimization': { + 'stage': 0 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + 
model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型(关键!参考官方 CIFAR 示例) +target_dtype = None +if model_engine.bfloat16_enabled(): + target_dtype = torch.bfloat16 +elif model_engine.fp16_enabled(): + target_dtype = torch.half + +# 验证 FP16 已启用 +assert model_engine.fp16_enabled(), 'FP16 not enabled' +assert target_dtype == torch.half, 'target_dtype should be torch.half' + +# 训练循环 +model_engine.train() + +for step in range(3): + # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 关键:手动转换输入数据类型为 FP16(参考官方示例) + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() + +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# 简化的 CNN 模型用于调试 +class SimpleConv(nn.Module): + def __init__(self): + super(SimpleConv, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.fc1 = nn.Linear(6 * 28 * 28, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = x.view(-1, 6 * 28 * 28) + x = self.fc1(x) + return x + +model = SimpleConv() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + }, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + + # 获取目标数据类型 + target_dtype = None + if model_engine.fp16_enabled(): + target_dtype = torch.half + + model_engine.train() + + # 训练步骤 + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = 
torch.randint(0, 10, (4,)).to(model_engine.device) + + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + model_engine.backward(loss) + model_engine.step() + + print(f'FP16 enabled: {model_engine.fp16_enabled()}') +except Exception as e: + print(f'Error: {e}') + import traceback + traceback.print_exc() +" + exit 1 +fi + +echo "" +echo "==========================================" +echo "所有测试通过!" +echo "==========================================" diff --git a/frameworks/deepspeed/0.18.7/Dockerfile b/frameworks/deepspeed/0.18.7/Dockerfile new file mode 100644 index 0000000..af3462b --- /dev/null +++ b/frameworks/deepspeed/0.18.7/Dockerfile @@ -0,0 +1,54 @@ +FROM opencloudos/opencloudos9-cuda-devel:12.8 + +LABEL maintainer="Anstarc " +LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" +LABEL org.opencontainers.image.description="DeepSpeed 0.18.7 GPU on OpenCloudOS 9" + +# Install system dependencies; the dnf cache is removed in the same layer so it never reaches the image +RUN dnf install -y \ + python3.11 \ + python3.11-pip \ + git \ + wget \ + openmpi \ + openmpi-devel \ + && dnf clean all \ + && rm -rf /var/cache/dnf/* + +# Put the OpenMPI binaries and libraries on the search paths +ENV PATH=/usr/lib64/openmpi/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH +# Expose python3.11 as "python"; -f keeps this step from failing if the link already exists +RUN ln -sf /usr/bin/python3.11 /usr/bin/python + +# Set the working directory +WORKDIR /app + +# Install PyTorch (CUDA 12.8 wheels) +RUN pip3.11 install --no-cache-dir \ + torch torchvision \ + --index-url https://download.pytorch.org/whl/cu128 + +# Install DeepSpeed dependencies +RUN pip3.11 install --no-cache-dir \ + transformers \ + datasets \ + evaluate \ + ninja \ + psutil \ + py-cpuinfo \ + pydantic \ + hjson \ + mpi4py + +# Compile and install DeepSpeed; --no-cache-dir matches the other pip steps and keeps pip's cache out of the layer +RUN DS_BUILD_OPS=1 \ + MAX_JOBS=8 \ + pip3.11 install --no-cache-dir deepspeed==0.18.7 --no-build-isolation + +# Set GPU environment variables +ENV NVIDIA_VISIBLE_DEVICES=all +ENV CUDA_MODULE_LOADING=LAZY + +# Default command +CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"] diff --git 
a/frameworks/deepspeed/0.18.7/README.md b/frameworks/deepspeed/0.18.7/README.md new file mode 100644 index 0000000..060c83a --- /dev/null +++ b/frameworks/deepspeed/0.18.7/README.md @@ -0,0 +1,199 @@ +# DeepSpeed 0.18.7 GPU on OpenCloudOS 9 + +DeepSpeed 是微软开源的深度学习优化库,提供 ZeRO 内存优化、混合精度训练、流水线并行等功能,支持超大规模模型训练。 + +## 基本信息 + +- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8 +- **Python 版本**: 3.11 +- **CUDA 版本**: 12.8 +- **DeepSpeed 版本**: 0.18.7 +- **PyTorch 版本**: 最新稳定版(CUDA 12.8) +- **MPI**: OpenMPI(支持多节点训练) + +## 构建 + +```bash +docker build -t oc9-deepspeed:0.18.7 . +``` + +## 测试 + +```bash +./test.sh oc9-deepspeed:0.18.7 +``` + +测试项包括: +- Python 和 CUDA 环境 +- DeepSpeed 版本验证 +- GPU 可用性检查 +- DeepSpeed 初始化 +- CNN 模型训练(CIFAR-10 风格) +- ZeRO Stage 1/2 优化 +- FP16 混合精度训练 + +## 使用示例 + +### 基本使用 + +```bash +# 查看版本信息 +docker run --rm oc9-deepspeed:0.18.7 python3.11 -c "import deepspeed; print(deepspeed.__version__)" + +# 查看可用设备 +docker run --rm --gpus all oc9-deepspeed:0.18.7 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" +``` + +### 单 GPU 训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +# 定义模型 +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练循环 +for batch in dataloader: + inputs, labels = batch + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### FP16 混合精度训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) + +# FP16 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'fp16': 
{ + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型 +target_dtype = torch.half if model_engine.fp16_enabled() else None + +# 训练时转换输入数据类型 +for batch in dataloader: + inputs, labels = batch + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### 多 GPU 训练(单节点) + +```bash +# 使用 deepspeed launcher(镜像默认工作目录为 /app,需用 -w 切换到挂载的代码目录) +docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.7 \ + deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json + +# 或使用 torchrun +docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.7 \ + torchrun --nproc_per_node=2 train.py +``` + +### ZeRO Stage 2 优化 + +```python +ds_config = { + 'train_batch_size': 32, # 必须等于 micro_batch(4) × 梯度累积(2) × GPU 数,此配置对应 4 张 GPU + 'train_micro_batch_size_per_gpu': 4, + 'gradient_accumulation_steps': 2, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.9, 0.999], + 'eps': 1e-8 + } + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} +``` + +### 交互式使用 + +```bash +docker run --rm -it --gpus all oc9-deepspeed:0.18.7 bash +``` + +## 支持的功能 + +- **ZeRO 优化**: Stage 1/2/3 内存优化,支持超大模型训练 +- **混合精度**: FP16, BF16 自动混合精度训练 +- **梯度累积**: 模拟大 batch size 训练 +- **流水线并行**: 模型层间并行 +- **张量并行**: 层内并行 +- **CPU Offload**: 将优化器状态卸载到 CPU +- **梯度检查点**: 减少显存占用 +- **分布式训练**: 单机多卡、多机多卡 + +## 系统要求 + +- **GPU**: NVIDIA GPU with CUDA 12.x support +- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage) +- **Docker**: 19.03+ with NVIDIA Container Toolkit(原 nvidia-docker2) +- **MPI**: 已包含 OpenMPI(多节点训练需要) + +## 参考资源 + +- [DeepSpeed 官方文档](https://www.deepspeed.ai/) +- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed) +- [DeepSpeed 
Examples](https://github.com/microsoft/DeepSpeedExamples) +- [ZeRO 论文](https://arxiv.org/abs/1910.02054) diff --git a/frameworks/deepspeed/0.18.7/build.conf b/frameworks/deepspeed/0.18.7/build.conf new file mode 100644 index 0000000..6cd5d09 --- /dev/null +++ b/frameworks/deepspeed/0.18.7/build.conf @@ -0,0 +1,4 @@ +# DeepSpeed 0.18.7 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-deepspeed +IMAGE_TAG=0.18.7 +GPU_TEST=true diff --git a/frameworks/deepspeed/0.18.7/test.sh b/frameworks/deepspeed/0.18.7/test.sh new file mode 100644 index 0000000..11a5177 --- /dev/null +++ b/frameworks/deepspeed/0.18.7/test.sh @@ -0,0 +1,602 @@ +#!/bin/bash +set -u # deliberately not "set -e": every test below inspects $? itself, and -e would abort before its failure branch can print diagnostics + +IMAGE="${1:-oc9-deepspeed:0.18.7}" + +echo "==========================================" +echo "测试 DeepSpeed 0.18.7 GPU 镜像" +echo "镜像: $IMAGE" +echo "==========================================" + +# Test 1: Python interpreter is present in the image +echo -n "测试 1: Python 环境... " +docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 --version + exit 1 +fi + +# Test 2: installed DeepSpeed version matches the image tag +echo -n "测试 2: DeepSpeed 版本... " +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") +if [ "$VERSION" = "0.18.7" ]; then + echo "✓ (版本: $VERSION)" +else + echo "✗ (期望: 0.18.7, 实际: $VERSION)" + exit 1 +fi + +# Test 3: core dependencies import cleanly +echo -n "测试 3: 核心依赖 (torch, transformers)... " +docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" + exit 1 +fi + +# Test 4: CUDA is visible inside the container +echo -n "测试 4: CUDA 环境... " +GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +assert torch.cuda.is_available(), 'CUDA not available' +print(torch.cuda.get_device_name(0)) +") +if [ $? 
-eq 0 ]; then + echo "✓" + echo " GPU: $GPU_INFO" +else + echo "✗" + exit 1 +fi + +# 测试 5: DeepSpeed 基础导入 +echo -n "测试 5: DeepSpeed 基础导入... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator + +# 验证核心模块可用 +assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found' +assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found' + +# 验证加速器 +accelerator = get_accelerator() +assert accelerator is not None, 'Accelerator not available' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator +print(f'DeepSpeed version: {deepspeed.__version__}') +accelerator = get_accelerator() +print(f'Accelerator: {accelerator}') +" + exit 1 +fi + +# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 +echo -n "测试 6: 简单 CNN 模型初始化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +# 创建简单 CNN 模型(类似 CIFAR-10 示例) +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(torch.relu(self.conv1(x))) + x = self.pool(torch.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = torch.relu(self.fc1(x)) + x = torch.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 4, + 'gradient_accumulation_steps': 1, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + 
}, + 'gradient_clipping': 1.0, + 'zero_optimization': { + 'stage': 0 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +assert model_engine is not None, 'Model engine initialization failed' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = nn.Linear(10, 2) + def forward(self, x): + return self.fc(x) + +model = SimpleNet() +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('Model initialized successfully') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 7: 训练步骤(前向+反向+优化) +echo -n "测试 7: 训练步骤... 
" +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 模拟训练步骤 +for step in range(3): + # 创建假数据 (batch_size=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(10, 2) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +inputs = torch.randn(4, 10).to(model_engine.device) +labels = torch.randint(0, 2, (4,)).to(model_engine.device) +outputs = model_engine(inputs) +loss = torch.nn.functional.cross_entropy(outputs, labels) +model_engine.backward(loss) +model_engine.step() +print('Training step completed') +" + exit 1 +fi + +# 测试 8: ZeRO Stage 1 优化 +echo -n "测试 8: ZeRO Stage 1 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 1 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 1, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 简单训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 1} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 1 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 9: ZeRO Stage 2 优化 +echo -n "测试 9: ZeRO Stage 2 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 2 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'allgather_bucket_size': 50000000, + 'reduce_bucket_size': 50000000, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 2} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 2 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 10: Conv2d + FP16 混合精度训练 +echo -n "测试 10: Conv2d + FP16 混合精度训练... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# CIFAR-10 风格的 CNN 模型 +class CifarNet(nn.Module): + def __init__(self): + super(CifarNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = CifarNet() + +# FP16 配置(参考 DeepSpeed CIFAR 示例) +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'loss_scale_window': 500, + 'hysteresis': 2, + 'min_loss_scale': 1, + 'initial_scale_power': 15 + }, + 'zero_optimization': { + 'stage': 0 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + 
model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型(关键!参考官方 CIFAR 示例) +target_dtype = None +if model_engine.bfloat16_enabled(): + target_dtype = torch.bfloat16 +elif model_engine.fp16_enabled(): + target_dtype = torch.half + +# 验证 FP16 已启用 +assert model_engine.fp16_enabled(), 'FP16 not enabled' +assert target_dtype == torch.half, 'target_dtype should be torch.half' + +# 训练循环 +model_engine.train() + +for step in range(3): + # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 关键:手动转换输入数据类型为 FP16(参考官方示例) + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() + +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# 简化的 CNN 模型用于调试 +class SimpleConv(nn.Module): + def __init__(self): + super(SimpleConv, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.fc1 = nn.Linear(6 * 28 * 28, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = x.view(-1, 6 * 28 * 28) + x = self.fc1(x) + return x + +model = SimpleConv() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + }, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + + # 获取目标数据类型 + target_dtype = None + if model_engine.fp16_enabled(): + target_dtype = torch.half + + model_engine.train() + + # 训练步骤 + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = 
torch.randint(0, 10, (4,)).to(model_engine.device) + + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + model_engine.backward(loss) + model_engine.step() + + print(f'FP16 enabled: {model_engine.fp16_enabled()}') +except Exception as e: + print(f'Error: {e}') + import traceback + traceback.print_exc() +" + exit 1 +fi + +echo "" +echo "==========================================" +echo "所有测试通过!" +echo "==========================================" diff --git a/frameworks/deepspeed/0.18.8/Dockerfile b/frameworks/deepspeed/0.18.8/Dockerfile new file mode 100644 index 0000000..eae2db7 --- /dev/null +++ b/frameworks/deepspeed/0.18.8/Dockerfile @@ -0,0 +1,54 @@ +FROM opencloudos/opencloudos9-cuda-devel:12.8 + +LABEL maintainer="Anstarc " +LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" +LABEL org.opencontainers.image.description="DeepSpeed 0.18.8 GPU on OpenCloudOS 9" + +# Install system dependencies; the package cache is removed in the same layer (dnf keeps it under /var/cache/dnf) +RUN dnf install -y \ + python3.11 \ + python3.11-pip \ + git \ + wget \ + openmpi \ + openmpi-devel \ + && dnf clean all \ + && rm -rf /var/cache/dnf/* + +# Put OpenMPI on the search paths; the :+ form avoids a trailing ':' (an empty entry, i.e. the current directory) when LD_LIBRARY_PATH is unset +ENV PATH=/usr/lib64/openmpi/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} + +RUN ln -sf /usr/bin/python3.11 /usr/bin/python + +# Set the working directory +WORKDIR /app + +# Install PyTorch (CUDA 12.8 wheels) +RUN pip3.11 install --no-cache-dir \ + torch torchvision \ + --index-url https://download.pytorch.org/whl/cu128 + +# Install DeepSpeed dependencies +RUN pip3.11 install --no-cache-dir \ + transformers \ + datasets \ + evaluate \ + ninja \ + psutil \ + py-cpuinfo \ + pydantic \ + hjson \ + mpi4py + +# Build and install DeepSpeed against the torch installed above (hence --no-build-isolation) +RUN DS_BUILD_OPS=1 \ + MAX_JOBS=8 \ + pip3.11 install --no-cache-dir deepspeed==0.18.8 --no-build-isolation + +# GPU runtime environment variables +ENV NVIDIA_VISIBLE_DEVICES=all +ENV CUDA_MODULE_LOADING=LAZY + +# Default command: report the installed DeepSpeed version +CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"] diff --git 
a/frameworks/deepspeed/0.18.8/README.md b/frameworks/deepspeed/0.18.8/README.md new file mode 100644 index 0000000..bf1f696 --- /dev/null +++ b/frameworks/deepspeed/0.18.8/README.md @@ -0,0 +1,199 @@ +# DeepSpeed 0.18.8 GPU on OpenCloudOS 9 + +DeepSpeed 是微软开源的深度学习优化库,提供 ZeRO 内存优化、混合精度训练、流水线并行等功能,支持超大规模模型训练。 + +## 基本信息 + +- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8 +- **Python 版本**: 3.11 +- **CUDA 版本**: 12.8 +- **DeepSpeed 版本**: 0.18.8 +- **PyTorch 版本**: 最新稳定版(CUDA 12.8) +- **MPI**: OpenMPI(支持多节点训练) + +## 构建 + +```bash +docker build -t oc9-deepspeed:0.18.8 . +``` + +## 测试 + +```bash +./test.sh oc9-deepspeed:0.18.8 +``` + +测试项包括: +- Python 和 CUDA 环境 +- DeepSpeed 版本验证 +- GPU 可用性检查 +- DeepSpeed 初始化 +- CNN 模型训练(CIFAR-10 风格) +- ZeRO Stage 1/2 优化 +- FP16 混合精度训练 + +## 使用示例 + +### 基本使用 + +```bash +# 查看版本信息 +docker run --rm oc9-deepspeed:0.18.8 python3.11 -c "import deepspeed; print(deepspeed.__version__)" + +# 查看可用设备 +docker run --rm --gpus all oc9-deepspeed:0.18.8 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" +``` + +### 单 GPU 训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +# 定义模型 +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练循环 +for batch in dataloader: + inputs, labels = batch + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### FP16 混合精度训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) + +# FP16 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'fp16': 
{ + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型 +target_dtype = torch.half if model_engine.fp16_enabled() else None + +# 训练时转换输入数据类型 +for batch in dataloader: + inputs, labels = batch + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### 多 GPU 训练(单节点) + +```bash +# 使用 deepspeed launcher +docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.8 \ + deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json + +# 或使用 torchrun +docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.8 \ + torchrun --nproc_per_node=2 train.py +``` + +### ZeRO Stage 2 优化(以 4 卡为例:train_batch_size 32 = 4 × 2 × 4 卡) + +```python +ds_config = { + 'train_batch_size': 32, + 'train_micro_batch_size_per_gpu': 4, + 'gradient_accumulation_steps': 2, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.9, 0.999], + 'eps': 1e-8 + } + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} +``` + +### 交互式使用 + +```bash +docker run --rm -it --gpus all oc9-deepspeed:0.18.8 bash +``` + +## 支持的功能 + +- **ZeRO 优化**: Stage 1/2/3 内存优化,支持超大模型训练 +- **混合精度**: FP16, BF16 自动混合精度训练 +- **梯度累积**: 模拟大 batch size 训练 +- **流水线并行**: 模型层间并行 +- **张量并行**: 层内并行 +- **CPU Offload**: 将优化器状态卸载到 CPU +- **梯度检查点**: 减少显存占用 +- **分布式训练**: 单机多卡、多机多卡 + +## 系统要求 + +- **GPU**: NVIDIA GPU with CUDA 12.x support +- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage) +- **Docker**: 19.03+,并安装 NVIDIA Container Toolkit(nvidia-docker2 已弃用) +- **MPI**: 已包含 OpenMPI(多节点训练需要) + +## 参考资源 + +- [DeepSpeed 官方文档](https://www.deepspeed.ai/) +- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed) +- [DeepSpeed 
Examples](https://github.com/microsoft/DeepSpeedExamples) +- [ZeRO 论文](https://arxiv.org/abs/1910.02054) diff --git a/frameworks/deepspeed/0.18.8/build.conf b/frameworks/deepspeed/0.18.8/build.conf new file mode 100644 index 0000000..2426ade --- /dev/null +++ b/frameworks/deepspeed/0.18.8/build.conf @@ -0,0 +1,4 @@ +# DeepSpeed 0.18.8 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-deepspeed +IMAGE_TAG=0.18.8 +GPU_TEST=true diff --git a/frameworks/deepspeed/0.18.8/test.sh b/frameworks/deepspeed/0.18.8/test.sh new file mode 100644 index 0000000..ffce0af --- /dev/null +++ b/frameworks/deepspeed/0.18.8/test.sh @@ -0,0 +1,602 @@ +#!/bin/bash +set -u # deliberately not "set -e": every test below inspects $? itself, and -e would abort before its failure branch can print diagnostics + +IMAGE="${1:-oc9-deepspeed:0.18.8}" + +echo "==========================================" +echo "测试 DeepSpeed 0.18.8 GPU 镜像" +echo "镜像: $IMAGE" +echo "==========================================" + +# Test 1: Python interpreter is present in the image +echo -n "测试 1: Python 环境... " +docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 --version + exit 1 +fi + +# Test 2: installed DeepSpeed version matches the image tag +echo -n "测试 2: DeepSpeed 版本... " +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") +if [ "$VERSION" = "0.18.8" ]; then + echo "✓ (版本: $VERSION)" +else + echo "✗ (期望: 0.18.8, 实际: $VERSION)" + exit 1 +fi + +# Test 3: core dependencies import cleanly +echo -n "测试 3: 核心依赖 (torch, transformers)... " +docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" + exit 1 +fi + +# Test 4: CUDA is visible inside the container +echo -n "测试 4: CUDA 环境... " +GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +assert torch.cuda.is_available(), 'CUDA not available' +print(torch.cuda.get_device_name(0)) +") +if [ $? 
-eq 0 ]; then + echo "✓" + echo " GPU: $GPU_INFO" +else + echo "✗" + exit 1 +fi + +# 测试 5: DeepSpeed 基础导入 +echo -n "测试 5: DeepSpeed 基础导入... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator + +# 验证核心模块可用 +assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found' +assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found' + +# 验证加速器 +accelerator = get_accelerator() +assert accelerator is not None, 'Accelerator not available' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator +print(f'DeepSpeed version: {deepspeed.__version__}') +accelerator = get_accelerator() +print(f'Accelerator: {accelerator}') +" + exit 1 +fi + +# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 +echo -n "测试 6: 简单 CNN 模型初始化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +# 创建简单 CNN 模型(类似 CIFAR-10 示例) +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(torch.relu(self.conv1(x))) + x = self.pool(torch.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = torch.relu(self.fc1(x)) + x = torch.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 4, + 'gradient_accumulation_steps': 1, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + 
}, + 'gradient_clipping': 1.0, + 'zero_optimization': { + 'stage': 0 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +assert model_engine is not None, 'Model engine initialization failed' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = nn.Linear(10, 2) + def forward(self, x): + return self.fc(x) + +model = SimpleNet() +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('Model initialized successfully') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 7: 训练步骤(前向+反向+优化) +echo -n "测试 7: 训练步骤... 
" +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 模拟训练步骤 +for step in range(3): + # 创建假数据 (batch_size=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(10, 2) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +inputs = torch.randn(4, 10).to(model_engine.device) +labels = torch.randint(0, 2, (4,)).to(model_engine.device) +outputs = model_engine(inputs) +loss = torch.nn.functional.cross_entropy(outputs, labels) +model_engine.backward(loss) +model_engine.step() +print('Training step completed') +" + exit 1 +fi + +# 测试 8: ZeRO Stage 1 优化 +echo -n "测试 8: ZeRO Stage 1 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 1 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 1, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 简单训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 1} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 1 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 9: ZeRO Stage 2 优化 +echo -n "测试 9: ZeRO Stage 2 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 2 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'allgather_bucket_size': 50000000, + 'reduce_bucket_size': 50000000, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 2} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 2 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 10: Conv2d + FP16 混合精度训练 +echo -n "测试 10: Conv2d + FP16 混合精度训练... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# CIFAR-10 风格的 CNN 模型 +class CifarNet(nn.Module): + def __init__(self): + super(CifarNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = CifarNet() + +# FP16 配置(参考 DeepSpeed CIFAR 示例) +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'loss_scale_window': 500, + 'hysteresis': 2, + 'min_loss_scale': 1, + 'initial_scale_power': 15 + }, + 'zero_optimization': { + 'stage': 0 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + 
model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型(关键!参考官方 CIFAR 示例) +target_dtype = None +if model_engine.bfloat16_enabled(): + target_dtype = torch.bfloat16 +elif model_engine.fp16_enabled(): + target_dtype = torch.half + +# 验证 FP16 已启用 +assert model_engine.fp16_enabled(), 'FP16 not enabled' +assert target_dtype == torch.half, 'target_dtype should be torch.half' + +# 训练循环 +model_engine.train() + +for step in range(3): + # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 关键:手动转换输入数据类型为 FP16(参考官方示例) + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() + +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# 简化的 CNN 模型用于调试 +class SimpleConv(nn.Module): + def __init__(self): + super(SimpleConv, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.fc1 = nn.Linear(6 * 28 * 28, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = x.view(-1, 6 * 28 * 28) + x = self.fc1(x) + return x + +model = SimpleConv() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + }, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + + # 获取目标数据类型 + target_dtype = None + if model_engine.fp16_enabled(): + target_dtype = torch.half + + model_engine.train() + + # 训练步骤 + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = 
torch.randint(0, 10, (4,)).to(model_engine.device) + + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + model_engine.backward(loss) + model_engine.step() + + print(f'FP16 enabled: {model_engine.fp16_enabled()}') +except Exception as e: + print(f'Error: {e}') + import traceback + traceback.print_exc() +" + exit 1 +fi + +echo "" +echo "==========================================" +echo "所有测试通过!" +echo "==========================================" diff --git a/frameworks/deepspeed/0.18.9/Dockerfile b/frameworks/deepspeed/0.18.9/Dockerfile new file mode 100644 index 0000000..2b1b7aa --- /dev/null +++ b/frameworks/deepspeed/0.18.9/Dockerfile @@ -0,0 +1,54 @@ +FROM opencloudos/opencloudos9-cuda-devel:12.8 + +LABEL maintainer="Anstarc " +LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" +LABEL org.opencontainers.image.description="DeepSpeed 0.18.9 GPU on OpenCloudOS 9" + +# 安装系统依赖 +RUN dnf install -y \ + python3.11 \ + python3.11-pip \ + git \ + wget \ + openmpi \ + openmpi-devel \ + && dnf clean all \ + && rm -rf /var/cache/yum/* + +# 设置 MPI 环境变量 +ENV PATH=/usr/lib64/openmpi/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH + +RUN ln -s /usr/bin/python3.11 /usr/bin/python + +# 设置工作目录 +WORKDIR /app + +# 安装 PyTorch(CUDA 12.8) +RUN pip3.11 install --no-cache-dir \ + torch torchvision \ + --index-url https://download.pytorch.org/whl/cu128 + +# 安装 DeepSpeed 依赖 +RUN pip3.11 install --no-cache-dir \ + transformers \ + datasets \ + evaluate \ + ninja \ + psutil \ + py-cpuinfo \ + pydantic \ + hjson \ + mpi4py + +# 编译安装 DeepSpeed +RUN DS_BUILD_OPS=1 \ + MAX_JOBS=8 \ + pip3.11 install deepspeed==0.18.9 --no-build-isolation + +# 设置 GPU 环境变量 +ENV NVIDIA_VISIBLE_DEVICES=all +ENV CUDA_MODULE_LOADING=LAZY + +# 默认命令 +CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"] diff --git 
a/frameworks/deepspeed/0.18.9/README.md b/frameworks/deepspeed/0.18.9/README.md new file mode 100644 index 0000000..0ef05da --- /dev/null +++ b/frameworks/deepspeed/0.18.9/README.md @@ -0,0 +1,199 @@ +# DeepSpeed 0.18.9 GPU on OpenCloudOS 9 + +DeepSpeed 是微软开源的深度学习优化库,提供 ZeRO 内存优化、混合精度训练、流水线并行等功能,支持超大规模模型训练。 + +## 基本信息 + +- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8 +- **Python 版本**: 3.11 +- **CUDA 版本**: 12.8 +- **DeepSpeed 版本**: 0.18.9 +- **PyTorch 版本**: 最新稳定版(CUDA 12.8) +- **MPI**: OpenMPI(支持多节点训练) + +## 构建 + +```bash +docker build -t oc9-deepspeed:0.18.9 . +``` + +## 测试 + +```bash +./test.sh oc9-deepspeed:0.18.9 +``` + +测试项包括: +- Python 和 CUDA 环境 +- DeepSpeed 版本验证 +- GPU 可用性检查 +- DeepSpeed 初始化 +- CNN 模型训练(CIFAR-10 风格) +- ZeRO Stage 1/2 优化 +- FP16 混合精度训练 + +## 使用示例 + +### 基本使用 + +```bash +# 查看版本信息 +docker run --rm oc9-deepspeed:0.18.9 python3.11 -c "import deepspeed; print(deepspeed.__version__)" + +# 查看可用设备 +docker run --rm --gpus all oc9-deepspeed:0.18.9 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" +``` + +### 单 GPU 训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +# 定义模型 +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练循环 +for batch in dataloader: + inputs, labels = batch + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### FP16 混合精度训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) + +# FP16 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'fp16': 
{ + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型 +target_dtype = torch.half if model_engine.fp16_enabled() else None + +# 训练时转换输入数据类型 +for batch in dataloader: + inputs, labels = batch + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### 多 GPU 训练(单节点) + +```bash +# 使用 deepspeed launcher +docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.9 \ + deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json + +# 或使用 torchrun +docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.9 \ + torchrun --nproc_per_node=2 train.py +``` + +### ZeRO Stage 2 优化 + +```python +ds_config = { + 'train_batch_size': 32, + 'train_micro_batch_size_per_gpu': 4, + 'gradient_accumulation_steps': 2, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.9, 0.999], + 'eps': 1e-8 + } + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} +``` + +### 交互式使用 + +```bash +docker run --rm -it --gpus all oc9-deepspeed:0.18.9 bash +``` + +## 支持的功能 + +- **ZeRO 优化**: Stage 1/2/3 内存优化,支持超大模型训练 +- **混合精度**: FP16, BF16 自动混合精度训练 +- **梯度累积**: 模拟大 batch size 训练 +- **流水线并行**: 模型层间并行 +- **张量并行**: 层内并行 +- **CPU Offload**: 将优化器状态卸载到 CPU +- **梯度检查点**: 减少显存占用 +- **分布式训练**: 单机多卡、多机多卡 + +## 系统要求 + +- **GPU**: NVIDIA GPU with CUDA 12.x support +- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage) +- **Docker**: 19.03+ with nvidia-docker2 +- **MPI**: 已包含 OpenMPI(多节点训练需要) + +## 参考资源 + +- [DeepSpeed 官方文档](https://www.deepspeed.ai/) +- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed) +- [DeepSpeed 
Examples](https://github.com/microsoft/DeepSpeedExamples) +- [ZeRO 论文](https://arxiv.org/abs/1910.02054) diff --git a/frameworks/deepspeed/0.18.9/build.conf b/frameworks/deepspeed/0.18.9/build.conf new file mode 100644 index 0000000..e3f3636 --- /dev/null +++ b/frameworks/deepspeed/0.18.9/build.conf @@ -0,0 +1,4 @@ +# DeepSpeed 0.18.9 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-deepspeed +IMAGE_TAG=0.18.9 +GPU_TEST=true diff --git a/frameworks/deepspeed/0.18.9/test.sh b/frameworks/deepspeed/0.18.9/test.sh new file mode 100644 index 0000000..cd9ee2d --- /dev/null +++ b/frameworks/deepspeed/0.18.9/test.sh @@ -0,0 +1,602 @@ +#!/bin/bash +set -e + +IMAGE="${1:-oc9-deepspeed:0.18.9}" + +echo "==========================================" +echo "测试 DeepSpeed 0.18.9 GPU 镜像" +echo "镜像: $IMAGE" +echo "==========================================" + +# 测试 1: Python 环境 +echo -n "测试 1: Python 环境... " +docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 --version + exit 1 +fi + +# 测试 2: DeepSpeed 版本 +echo -n "测试 2: DeepSpeed 版本... " +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") +if [ "$VERSION" = "0.18.9" ]; then + echo "✓ (版本: $VERSION)" +else + echo "✗ (期望: 0.18.9, 实际: $VERSION)" + exit 1 +fi + +# 测试 3: 核心依赖 +echo -n "测试 3: 核心依赖 (torch, transformers)... " +docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" + exit 1 +fi + +# 测试 4: CUDA 环境 +echo -n "测试 4: CUDA 环境... " +GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +assert torch.cuda.is_available(), 'CUDA not available' +print(torch.cuda.get_device_name(0)) +") +if [ $? 
-eq 0 ]; then + echo "✓" + echo " GPU: $GPU_INFO" +else + echo "✗" + exit 1 +fi + +# 测试 5: DeepSpeed 基础导入 +echo -n "测试 5: DeepSpeed 基础导入... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator + +# 验证核心模块可用 +assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found' +assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found' + +# 验证加速器 +accelerator = get_accelerator() +assert accelerator is not None, 'Accelerator not available' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator +print(f'DeepSpeed version: {deepspeed.__version__}') +accelerator = get_accelerator() +print(f'Accelerator: {accelerator}') +" + exit 1 +fi + +# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 +echo -n "测试 6: 简单 CNN 模型初始化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +# 创建简单 CNN 模型(类似 CIFAR-10 示例) +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(torch.relu(self.conv1(x))) + x = self.pool(torch.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = torch.relu(self.fc1(x)) + x = torch.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 4, + 'gradient_accumulation_steps': 1, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + 
}, + 'gradient_clipping': 1.0, + 'zero_optimization': { + 'stage': 0 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +assert model_engine is not None, 'Model engine initialization failed' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = nn.Linear(10, 2) + def forward(self, x): + return self.fc(x) + +model = SimpleNet() +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('Model initialized successfully') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 7: 训练步骤(前向+反向+优化) +echo -n "测试 7: 训练步骤... 
" +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 模拟训练步骤 +for step in range(3): + # 创建假数据 (batch_size=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(10, 2) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +inputs = torch.randn(4, 10).to(model_engine.device) +labels = torch.randint(0, 2, (4,)).to(model_engine.device) +outputs = model_engine(inputs) +loss = torch.nn.functional.cross_entropy(outputs, labels) +model_engine.backward(loss) +model_engine.step() +print('Training step completed') +" + exit 1 +fi + +# 测试 8: ZeRO Stage 1 优化 +echo -n "测试 8: ZeRO Stage 1 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 1 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 1, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 简单训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 1} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 1 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 9: ZeRO Stage 2 优化 +echo -n "测试 9: ZeRO Stage 2 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 2 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'allgather_bucket_size': 50000000, + 'reduce_bucket_size': 50000000, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 2} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 2 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 10: Conv2d + FP16 混合精度训练 +echo -n "测试 10: Conv2d + FP16 混合精度训练... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# CIFAR-10 风格的 CNN 模型 +class CifarNet(nn.Module): + def __init__(self): + super(CifarNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = CifarNet() + +# FP16 配置(参考 DeepSpeed CIFAR 示例) +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'loss_scale_window': 500, + 'hysteresis': 2, + 'min_loss_scale': 1, + 'initial_scale_power': 15 + }, + 'zero_optimization': { + 'stage': 0 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + 
model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型(关键!参考官方 CIFAR 示例) +target_dtype = None +if model_engine.bfloat16_enabled(): + target_dtype = torch.bfloat16 +elif model_engine.fp16_enabled(): + target_dtype = torch.half + +# 验证 FP16 已启用 +assert model_engine.fp16_enabled(), 'FP16 not enabled' +assert target_dtype == torch.half, 'target_dtype should be torch.half' + +# 训练循环 +model_engine.train() + +for step in range(3): + # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 关键:手动转换输入数据类型为 FP16(参考官方示例) + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() + +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# 简化的 CNN 模型用于调试 +class SimpleConv(nn.Module): + def __init__(self): + super(SimpleConv, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.fc1 = nn.Linear(6 * 28 * 28, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = x.view(-1, 6 * 28 * 28) + x = self.fc1(x) + return x + +model = SimpleConv() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + }, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + + # 获取目标数据类型 + target_dtype = None + if model_engine.fp16_enabled(): + target_dtype = torch.half + + model_engine.train() + + # 训练步骤 + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = 
torch.randint(0, 10, (4,)).to(model_engine.device) + + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + model_engine.backward(loss) + model_engine.step() + + print(f'FP16 enabled: {model_engine.fp16_enabled()}') +except Exception as e: + print(f'Error: {e}') + import traceback + traceback.print_exc() +" + exit 1 +fi + +echo "" +echo "==========================================" +echo "所有测试通过!" +echo "==========================================" -- Gitee From 61507b067825a781da20467a28ec875eaae324dd Mon Sep 17 00:00:00 2001 From: Anstarc Date: Fri, 8 May 2026 14:46:25 +0800 Subject: [PATCH 2/6] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=20Dockerfile=20?= =?UTF-8?q?=E4=B8=AD=20LD=5FLIBRARY=5FPATH=20=E6=9C=AA=E5=AE=9A=E4=B9=89?= =?UTF-8?q?=E5=8F=98=E9=87=8F=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frameworks/deepspeed/0.18.4/Dockerfile | 2 +- frameworks/deepspeed/0.18.5/Dockerfile | 2 +- frameworks/deepspeed/0.18.6/Dockerfile | 2 +- frameworks/deepspeed/0.18.7/Dockerfile | 2 +- frameworks/deepspeed/0.18.8/Dockerfile | 2 +- frameworks/deepspeed/0.18.9/Dockerfile | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/frameworks/deepspeed/0.18.4/Dockerfile b/frameworks/deepspeed/0.18.4/Dockerfile index b4d385f..4ab736d 100644 --- a/frameworks/deepspeed/0.18.4/Dockerfile +++ b/frameworks/deepspeed/0.18.4/Dockerfile @@ -17,7 +17,7 @@ RUN dnf install -y \ # 设置 MPI 环境变量 ENV PATH=/usr/lib64/openmpi/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib RUN ln -s /usr/bin/python3.11 /usr/bin/python diff --git a/frameworks/deepspeed/0.18.5/Dockerfile b/frameworks/deepspeed/0.18.5/Dockerfile index a215d56..87c8d20 100644 --- a/frameworks/deepspeed/0.18.5/Dockerfile +++ b/frameworks/deepspeed/0.18.5/Dockerfile @@ -17,7 +17,7 @@ RUN dnf install -y \ # 
设置 MPI 环境变量 ENV PATH=/usr/lib64/openmpi/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib RUN ln -s /usr/bin/python3.11 /usr/bin/python diff --git a/frameworks/deepspeed/0.18.6/Dockerfile b/frameworks/deepspeed/0.18.6/Dockerfile index 8905449..636a8ba 100644 --- a/frameworks/deepspeed/0.18.6/Dockerfile +++ b/frameworks/deepspeed/0.18.6/Dockerfile @@ -17,7 +17,7 @@ RUN dnf install -y \ # 设置 MPI 环境变量 ENV PATH=/usr/lib64/openmpi/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib RUN ln -s /usr/bin/python3.11 /usr/bin/python diff --git a/frameworks/deepspeed/0.18.7/Dockerfile b/frameworks/deepspeed/0.18.7/Dockerfile index af3462b..ac3ce98 100644 --- a/frameworks/deepspeed/0.18.7/Dockerfile +++ b/frameworks/deepspeed/0.18.7/Dockerfile @@ -17,7 +17,7 @@ RUN dnf install -y \ # 设置 MPI 环境变量 ENV PATH=/usr/lib64/openmpi/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib RUN ln -s /usr/bin/python3.11 /usr/bin/python diff --git a/frameworks/deepspeed/0.18.8/Dockerfile b/frameworks/deepspeed/0.18.8/Dockerfile index eae2db7..5965047 100644 --- a/frameworks/deepspeed/0.18.8/Dockerfile +++ b/frameworks/deepspeed/0.18.8/Dockerfile @@ -17,7 +17,7 @@ RUN dnf install -y \ # 设置 MPI 环境变量 ENV PATH=/usr/lib64/openmpi/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib RUN ln -s /usr/bin/python3.11 /usr/bin/python diff --git a/frameworks/deepspeed/0.18.9/Dockerfile b/frameworks/deepspeed/0.18.9/Dockerfile index 2b1b7aa..b7d354e 100644 --- a/frameworks/deepspeed/0.18.9/Dockerfile +++ b/frameworks/deepspeed/0.18.9/Dockerfile @@ -17,7 +17,7 @@ RUN dnf install -y \ # 设置 MPI 环境变量 ENV PATH=/usr/lib64/openmpi/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib RUN ln -s 
/usr/bin/python3.11 /usr/bin/python -- Gitee From 37f978182c9c8d9eb1eea9396a9b6b117f9bcafc Mon Sep 17 00:00:00 2001 From: Anstarc Date: Fri, 8 May 2026 15:12:32 +0800 Subject: [PATCH 3/6] =?UTF-8?q?=E4=BF=AE=E6=94=B9dockerfile=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frameworks/deepspeed/0.18.4/Dockerfile | 6 ++++-- frameworks/deepspeed/0.18.5/Dockerfile | 6 ++++-- frameworks/deepspeed/0.18.6/Dockerfile | 6 ++++-- frameworks/deepspeed/0.18.7/Dockerfile | 6 ++++-- frameworks/deepspeed/0.18.8/Dockerfile | 6 ++++-- frameworks/deepspeed/0.18.9/Dockerfile | 6 ++++-- 6 files changed, 24 insertions(+), 12 deletions(-) diff --git a/frameworks/deepspeed/0.18.4/Dockerfile b/frameworks/deepspeed/0.18.4/Dockerfile index 4ab736d..ccd4607 100644 --- a/frameworks/deepspeed/0.18.4/Dockerfile +++ b/frameworks/deepspeed/0.18.4/Dockerfile @@ -16,7 +16,7 @@ RUN dnf install -y \ && rm -rf /var/cache/yum/* # 设置 MPI 环境变量 -ENV PATH=/usr/lib64/openmpi/bin:$PATH +ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib RUN ln -s /usr/bin/python3.11 /usr/bin/python @@ -39,7 +39,9 @@ RUN pip3.11 install --no-cache-dir \ py-cpuinfo \ pydantic \ hjson \ - mpi4py + mpi4py \ + "setuptools<82" \ + wheel # 编译安装 DeepSpeed RUN DS_BUILD_OPS=1 \ diff --git a/frameworks/deepspeed/0.18.5/Dockerfile b/frameworks/deepspeed/0.18.5/Dockerfile index 87c8d20..97e8917 100644 --- a/frameworks/deepspeed/0.18.5/Dockerfile +++ b/frameworks/deepspeed/0.18.5/Dockerfile @@ -16,7 +16,7 @@ RUN dnf install -y \ && rm -rf /var/cache/yum/* # 设置 MPI 环境变量 -ENV PATH=/usr/lib64/openmpi/bin:$PATH +ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib RUN ln -s /usr/bin/python3.11 /usr/bin/python @@ -39,7 +39,9 @@ RUN pip3.11 install --no-cache-dir \ py-cpuinfo \ pydantic \ hjson \ - mpi4py + mpi4py \ + "setuptools<82" \ + wheel # 编译安装 DeepSpeed RUN 
DS_BUILD_OPS=1 \ diff --git a/frameworks/deepspeed/0.18.6/Dockerfile b/frameworks/deepspeed/0.18.6/Dockerfile index 636a8ba..fafa95a 100644 --- a/frameworks/deepspeed/0.18.6/Dockerfile +++ b/frameworks/deepspeed/0.18.6/Dockerfile @@ -16,7 +16,7 @@ RUN dnf install -y \ && rm -rf /var/cache/yum/* # 设置 MPI 环境变量 -ENV PATH=/usr/lib64/openmpi/bin:$PATH +ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib RUN ln -s /usr/bin/python3.11 /usr/bin/python @@ -39,7 +39,9 @@ RUN pip3.11 install --no-cache-dir \ py-cpuinfo \ pydantic \ hjson \ - mpi4py + mpi4py \ + "setuptools<82" \ + wheel # 编译安装 DeepSpeed RUN DS_BUILD_OPS=1 \ diff --git a/frameworks/deepspeed/0.18.7/Dockerfile b/frameworks/deepspeed/0.18.7/Dockerfile index ac3ce98..edfe910 100644 --- a/frameworks/deepspeed/0.18.7/Dockerfile +++ b/frameworks/deepspeed/0.18.7/Dockerfile @@ -16,7 +16,7 @@ RUN dnf install -y \ && rm -rf /var/cache/yum/* # 设置 MPI 环境变量 -ENV PATH=/usr/lib64/openmpi/bin:$PATH +ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib RUN ln -s /usr/bin/python3.11 /usr/bin/python @@ -39,7 +39,9 @@ RUN pip3.11 install --no-cache-dir \ py-cpuinfo \ pydantic \ hjson \ - mpi4py + mpi4py \ + "setuptools<82" \ + wheel # 编译安装 DeepSpeed RUN DS_BUILD_OPS=1 \ diff --git a/frameworks/deepspeed/0.18.8/Dockerfile b/frameworks/deepspeed/0.18.8/Dockerfile index 5965047..8875191 100644 --- a/frameworks/deepspeed/0.18.8/Dockerfile +++ b/frameworks/deepspeed/0.18.8/Dockerfile @@ -16,7 +16,7 @@ RUN dnf install -y \ && rm -rf /var/cache/yum/* # 设置 MPI 环境变量 -ENV PATH=/usr/lib64/openmpi/bin:$PATH +ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib RUN ln -s /usr/bin/python3.11 /usr/bin/python @@ -39,7 +39,9 @@ RUN pip3.11 install --no-cache-dir \ py-cpuinfo \ pydantic \ hjson \ - mpi4py + mpi4py \ + "setuptools<82" \ + wheel # 编译安装 DeepSpeed RUN DS_BUILD_OPS=1 \ diff --git 
a/frameworks/deepspeed/0.18.9/Dockerfile b/frameworks/deepspeed/0.18.9/Dockerfile index b7d354e..7aa8ce9 100644 --- a/frameworks/deepspeed/0.18.9/Dockerfile +++ b/frameworks/deepspeed/0.18.9/Dockerfile @@ -16,7 +16,7 @@ RUN dnf install -y \ && rm -rf /var/cache/yum/* # 设置 MPI 环境变量 -ENV PATH=/usr/lib64/openmpi/bin:$PATH +ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib RUN ln -s /usr/bin/python3.11 /usr/bin/python @@ -39,7 +39,9 @@ RUN pip3.11 install --no-cache-dir \ py-cpuinfo \ pydantic \ hjson \ - mpi4py + mpi4py \ + "setuptools<82" \ + wheel # 编译安装 DeepSpeed RUN DS_BUILD_OPS=1 \ -- Gitee From bdd703cbc2eae46121e66683140b0c2f67d7084d Mon Sep 17 00:00:00 2001 From: Anstarc Date: Fri, 8 May 2026 15:51:05 +0800 Subject: [PATCH 4/6] =?UTF-8?q?test:=20=E5=9C=A8=200.18.4=20=E4=B8=AD?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20DS=5FACCELERATOR=20=E7=8E=AF=E5=A2=83?= =?UTF-8?q?=E5=8F=98=E9=87=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frameworks/deepspeed/0.18.4/Dockerfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frameworks/deepspeed/0.18.4/Dockerfile b/frameworks/deepspeed/0.18.4/Dockerfile index ccd4607..4b35106 100644 --- a/frameworks/deepspeed/0.18.4/Dockerfile +++ b/frameworks/deepspeed/0.18.4/Dockerfile @@ -36,6 +36,7 @@ RUN pip3.11 install --no-cache-dir \ evaluate \ ninja \ psutil \ + deepspeed-kernels \ py-cpuinfo \ pydantic \ hjson \ @@ -43,6 +44,9 @@ RUN pip3.11 install --no-cache-dir \ "setuptools<82" \ wheel +# 设置 DeepSpeed 加速器环境变量 +ENV DS_ACCELERATOR=cuda + # 编译安装 DeepSpeed RUN DS_BUILD_OPS=1 \ MAX_JOBS=8 \ -- Gitee From f26f6ecc93789354f32637886d6acbb0cd02bc73 Mon Sep 17 00:00:00 2001 From: Anstarc Date: Fri, 8 May 2026 17:21:24 +0800 Subject: [PATCH 5/6] =?UTF-8?q?=E6=B7=BB=E5=8A=A0DS=5FACCELERATOR=E7=8E=AF?= =?UTF-8?q?=E5=A2=83=E5=8F=98=E9=87=8F=E5=B9=B6=E4=BF=AE=E5=A4=8DCUDA?= =?UTF-8?q?=E8=B7=AF=E5=BE=84?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frameworks/deepspeed/0.18.4/test.sh | 26 +++++++++++++------------- frameworks/deepspeed/0.18.5/Dockerfile | 4 ++++ frameworks/deepspeed/0.18.5/test.sh | 26 +++++++++++++------------- frameworks/deepspeed/0.18.6/Dockerfile | 4 ++++ frameworks/deepspeed/0.18.6/test.sh | 26 +++++++++++++------------- frameworks/deepspeed/0.18.7/Dockerfile | 4 ++++ frameworks/deepspeed/0.18.7/test.sh | 26 +++++++++++++------------- frameworks/deepspeed/0.18.8/Dockerfile | 4 ++++ frameworks/deepspeed/0.18.8/test.sh | 26 +++++++++++++------------- frameworks/deepspeed/0.18.9/Dockerfile | 4 ++++ frameworks/deepspeed/0.18.9/test.sh | 26 +++++++++++++------------- 11 files changed, 98 insertions(+), 78 deletions(-) diff --git a/frameworks/deepspeed/0.18.4/test.sh b/frameworks/deepspeed/0.18.4/test.sh index a6f7c81..14516aa 100644 --- a/frameworks/deepspeed/0.18.4/test.sh +++ b/frameworks/deepspeed/0.18.4/test.sh @@ -21,7 +21,7 @@ fi # 测试 2: DeepSpeed 版本 echo -n "测试 2: DeepSpeed 版本... " -VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") +VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'") if [ "$VERSION" = "0.18.4" ]; then echo "✓ (版本: $VERSION)" else @@ -57,7 +57,7 @@ fi # 测试 5: DeepSpeed 基础导入 echo -n "测试 5: DeepSpeed 基础导入... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "$1"' _ " import deepspeed from deepspeed.accelerator import get_accelerator @@ -73,7 +73,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "$1"' _ " import deepspeed from deepspeed.accelerator import get_accelerator print(f'DeepSpeed version: {deepspeed.__version__}') @@ -85,7 +85,7 @@ fi # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 echo -n "测试 6: 简单 CNN 模型初始化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "$1"' _ " import torch import torch.nn as nn import deepspeed @@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "$1"' _ " import torch import torch.nn as nn import deepspeed @@ -186,7 +186,7 @@ fi # 测试 7: 训练步骤(前向+反向+优化) echo -n "测试 7: 训练步骤... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "$1"' _ " import torch import torch.nn as nn import torch.nn.functional as F @@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "$1"' _ " import torch import torch.nn as nn import deepspeed @@ -279,7 +279,7 @@ fi # 测试 8: ZeRO Stage 1 优化 echo -n "测试 8: ZeRO Stage 1 优化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "$1"' _ " import torch import torch.nn as nn import deepspeed @@ -323,7 +323,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "$1"' _ " import torch import torch.nn as nn import deepspeed @@ -350,7 +350,7 @@ fi # 测试 9: ZeRO Stage 2 优化 echo -n "测试 9: ZeRO Stage 2 优化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "$1"' _ " import torch import torch.nn as nn import deepspeed @@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "$1"' _ " import torch import torch.nn as nn import deepspeed @@ -423,7 +423,7 @@ fi # 测试 10: Conv2d + FP16 混合精度训练 echo -n "测试 10: Conv2d + FP16 混合精度训练... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "$1"' _ " import torch import torch.nn as nn import torch.nn.functional as F @@ -529,7 +529,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "$1"' _ " import torch import torch.nn as nn import torch.nn.functional as F diff --git a/frameworks/deepspeed/0.18.5/Dockerfile b/frameworks/deepspeed/0.18.5/Dockerfile index 97e8917..07a5091 100644 --- a/frameworks/deepspeed/0.18.5/Dockerfile +++ b/frameworks/deepspeed/0.18.5/Dockerfile @@ -36,6 +36,7 @@ RUN pip3.11 install --no-cache-dir \ evaluate \ ninja \ psutil \ + deepspeed-kernels \ py-cpuinfo \ pydantic \ hjson \ @@ -43,6 +44,9 @@ RUN pip3.11 install --no-cache-dir \ "setuptools<82" \ wheel +# 设置 DeepSpeed 加速器环境变量 +ENV DS_ACCELERATOR=cuda + # 编译安装 DeepSpeed RUN DS_BUILD_OPS=1 \ MAX_JOBS=8 \ diff --git a/frameworks/deepspeed/0.18.5/test.sh b/frameworks/deepspeed/0.18.5/test.sh index b2cde17..bc757f7 100644 --- a/frameworks/deepspeed/0.18.5/test.sh +++ b/frameworks/deepspeed/0.18.5/test.sh @@ -21,7 +21,7 @@ fi # 测试 2: DeepSpeed 版本 echo -n "测试 2: DeepSpeed 版本... " -VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") +VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'") if [ "$VERSION" = "0.18.5" ]; then echo "✓ (版本: $VERSION)" else @@ -57,7 +57,7 @@ fi # 测试 5: DeepSpeed 基础导入 echo -n "测试 5: DeepSpeed 基础导入... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c 'export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c "$1"' _ " import deepspeed from deepspeed.accelerator import get_accelerator @@ -73,7 +73,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator print(f'DeepSpeed version: {deepspeed.__version__}') @@ -85,7 +85,7 @@ fi # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 echo -n "测试 6: 简单 CNN 模型初始化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -186,7 +186,7 @@ fi # 测试 7: 训练步骤(前向+反向+优化) echo -n "测试 7: 训练步骤... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -279,7 +279,7 @@ fi # 测试 8: ZeRO Stage 1 优化 echo -n "测试 8: ZeRO Stage 1 优化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -323,7 +323,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -350,7 +350,7 @@ fi # 测试 9: ZeRO Stage 2 优化 echo -n "测试 9: ZeRO Stage 2 优化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -423,7 +423,7 @@ fi # 测试 10: Conv2d + FP16 混合精度训练 echo -n "测试 10: Conv2d + FP16 混合精度训练... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -529,7 +529,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F diff --git a/frameworks/deepspeed/0.18.6/Dockerfile b/frameworks/deepspeed/0.18.6/Dockerfile index fafa95a..d76a4d0 100644 --- a/frameworks/deepspeed/0.18.6/Dockerfile +++ b/frameworks/deepspeed/0.18.6/Dockerfile @@ -36,6 +36,7 @@ RUN pip3.11 install --no-cache-dir \ evaluate \ ninja \ psutil \ + deepspeed-kernels \ py-cpuinfo \ pydantic \ hjson \ @@ -43,6 +44,9 @@ RUN pip3.11 install --no-cache-dir \ "setuptools<82" \ wheel +# 设置 DeepSpeed 加速器环境变量 +ENV DS_ACCELERATOR=cuda + # 编译安装 DeepSpeed RUN DS_BUILD_OPS=1 \ MAX_JOBS=8 \ diff --git a/frameworks/deepspeed/0.18.6/test.sh b/frameworks/deepspeed/0.18.6/test.sh index c2a69d6..92e48ca 100644 --- a/frameworks/deepspeed/0.18.6/test.sh +++ b/frameworks/deepspeed/0.18.6/test.sh @@ -21,7 +21,7 @@ fi # 测试 2: DeepSpeed 版本 echo -n "测试 2: DeepSpeed 版本... " -VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") +VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'") if [ "$VERSION" = "0.18.6" ]; then echo "✓ (版本: $VERSION)" else @@ -57,7 +57,7 @@ fi # 测试 5: DeepSpeed 基础导入 echo -n "测试 5: DeepSpeed 基础导入... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator @@ -73,7 +73,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator print(f'DeepSpeed version: {deepspeed.__version__}') @@ -85,7 +85,7 @@ fi # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 echo -n "测试 6: 简单 CNN 模型初始化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -186,7 +186,7 @@ fi # 测试 7: 训练步骤(前向+反向+优化) echo -n "测试 7: 训练步骤... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -279,7 +279,7 @@ fi # 测试 8: ZeRO Stage 1 优化 echo -n "测试 8: ZeRO Stage 1 优化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -323,7 +323,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -350,7 +350,7 @@ fi # 测试 9: ZeRO Stage 2 优化 echo -n "测试 9: ZeRO Stage 2 优化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -423,7 +423,7 @@ fi # 测试 10: Conv2d + FP16 混合精度训练 echo -n "测试 10: Conv2d + FP16 混合精度训练... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -529,7 +529,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F diff --git a/frameworks/deepspeed/0.18.7/Dockerfile b/frameworks/deepspeed/0.18.7/Dockerfile index edfe910..8bfea71 100644 --- a/frameworks/deepspeed/0.18.7/Dockerfile +++ b/frameworks/deepspeed/0.18.7/Dockerfile @@ -36,6 +36,7 @@ RUN pip3.11 install --no-cache-dir \ evaluate \ ninja \ psutil \ + deepspeed-kernels \ py-cpuinfo \ pydantic \ hjson \ @@ -43,6 +44,9 @@ RUN pip3.11 install --no-cache-dir \ "setuptools<82" \ wheel +# 设置 DeepSpeed 加速器环境变量 +ENV DS_ACCELERATOR=cuda + # 编译安装 DeepSpeed RUN DS_BUILD_OPS=1 \ MAX_JOBS=8 \ diff --git a/frameworks/deepspeed/0.18.7/test.sh b/frameworks/deepspeed/0.18.7/test.sh index 11a5177..16c9b65 100644 --- a/frameworks/deepspeed/0.18.7/test.sh +++ b/frameworks/deepspeed/0.18.7/test.sh @@ -21,7 +21,7 @@ fi # 测试 2: DeepSpeed 版本 echo -n "测试 2: DeepSpeed 版本... " -VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") +VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'") if [ "$VERSION" = "0.18.7" ]; then echo "✓ (版本: $VERSION)" else @@ -57,7 +57,7 @@ fi # 测试 5: DeepSpeed 基础导入 echo -n "测试 5: DeepSpeed 基础导入... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator @@ -73,7 +73,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator print(f'DeepSpeed version: {deepspeed.__version__}') @@ -85,7 +85,7 @@ fi # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 echo -n "测试 6: 简单 CNN 模型初始化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -186,7 +186,7 @@ fi # 测试 7: 训练步骤(前向+反向+优化) echo -n "测试 7: 训练步骤... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -279,7 +279,7 @@ fi # 测试 8: ZeRO Stage 1 优化 echo -n "测试 8: ZeRO Stage 1 优化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -323,7 +323,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -350,7 +350,7 @@ fi # 测试 9: ZeRO Stage 2 优化 echo -n "测试 9: ZeRO Stage 2 优化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -423,7 +423,7 @@ fi # 测试 10: Conv2d + FP16 混合精度训练 echo -n "测试 10: Conv2d + FP16 混合精度训练... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -529,7 +529,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F diff --git a/frameworks/deepspeed/0.18.8/Dockerfile b/frameworks/deepspeed/0.18.8/Dockerfile index 8875191..a46adb2 100644 --- a/frameworks/deepspeed/0.18.8/Dockerfile +++ b/frameworks/deepspeed/0.18.8/Dockerfile @@ -36,6 +36,7 @@ RUN pip3.11 install --no-cache-dir \ evaluate \ ninja \ psutil \ + deepspeed-kernels \ py-cpuinfo \ pydantic \ hjson \ @@ -43,6 +44,9 @@ RUN pip3.11 install --no-cache-dir \ "setuptools<82" \ wheel +# 设置 DeepSpeed 加速器环境变量 +ENV DS_ACCELERATOR=cuda + # 编译安装 DeepSpeed RUN DS_BUILD_OPS=1 \ MAX_JOBS=8 \ diff --git a/frameworks/deepspeed/0.18.8/test.sh b/frameworks/deepspeed/0.18.8/test.sh index ffce0af..8c8cc23 100644 --- a/frameworks/deepspeed/0.18.8/test.sh +++ b/frameworks/deepspeed/0.18.8/test.sh @@ -21,7 +21,7 @@ fi # 测试 2: DeepSpeed 版本 echo -n "测试 2: DeepSpeed 版本... " -VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") +VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'") if [ "$VERSION" = "0.18.8" ]; then echo "✓ (版本: $VERSION)" else @@ -57,7 +57,7 @@ fi # 测试 5: DeepSpeed 基础导入 echo -n "测试 5: DeepSpeed 基础导入... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator @@ -73,7 +73,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator print(f'DeepSpeed version: {deepspeed.__version__}') @@ -85,7 +85,7 @@ fi # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 echo -n "测试 6: 简单 CNN 模型初始化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -186,7 +186,7 @@ fi # 测试 7: 训练步骤(前向+反向+优化) echo -n "测试 7: 训练步骤... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -279,7 +279,7 @@ fi # 测试 8: ZeRO Stage 1 优化 echo -n "测试 8: ZeRO Stage 1 优化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -323,7 +323,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -350,7 +350,7 @@ fi # 测试 9: ZeRO Stage 2 优化 echo -n "测试 9: ZeRO Stage 2 优化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -423,7 +423,7 @@ fi # 测试 10: Conv2d + FP16 混合精度训练 echo -n "测试 10: Conv2d + FP16 混合精度训练... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -529,7 +529,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F diff --git a/frameworks/deepspeed/0.18.9/Dockerfile b/frameworks/deepspeed/0.18.9/Dockerfile index 7aa8ce9..75aad1c 100644 --- a/frameworks/deepspeed/0.18.9/Dockerfile +++ b/frameworks/deepspeed/0.18.9/Dockerfile @@ -36,6 +36,7 @@ RUN pip3.11 install --no-cache-dir \ evaluate \ ninja \ psutil \ + deepspeed-kernels \ py-cpuinfo \ pydantic \ hjson \ @@ -43,6 +44,9 @@ RUN pip3.11 install --no-cache-dir \ "setuptools<82" \ wheel +# 设置 DeepSpeed 加速器环境变量 +ENV DS_ACCELERATOR=cuda + # 编译安装 DeepSpeed RUN DS_BUILD_OPS=1 \ MAX_JOBS=8 \ diff --git a/frameworks/deepspeed/0.18.9/test.sh b/frameworks/deepspeed/0.18.9/test.sh index cd9ee2d..bd3aae0 100644 --- a/frameworks/deepspeed/0.18.9/test.sh +++ b/frameworks/deepspeed/0.18.9/test.sh @@ -21,7 +21,7 @@ fi # 测试 2: DeepSpeed 版本 echo -n "测试 2: DeepSpeed 版本... " -VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") +VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'") if [ "$VERSION" = "0.18.9" ]; then echo "✓ (版本: $VERSION)" else @@ -57,7 +57,7 @@ fi # 测试 5: DeepSpeed 基础导入 echo -n "测试 5: DeepSpeed 基础导入... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator @@ -73,7 +73,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator print(f'DeepSpeed version: {deepspeed.__version__}') @@ -85,7 +85,7 @@ fi # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 echo -n "测试 6: 简单 CNN 模型初始化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -186,7 +186,7 @@ fi # 测试 7: 训练步骤(前向+反向+优化) echo -n "测试 7: 训练步骤... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -279,7 +279,7 @@ fi # 测试 8: ZeRO Stage 1 优化 echo -n "测试 8: ZeRO Stage 1 优化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -323,7 +323,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -350,7 +350,7 @@ fi # 测试 9: ZeRO Stage 2 优化 echo -n "测试 9: ZeRO Stage 2 优化... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -423,7 +423,7 @@ fi # 测试 10: Conv2d + FP16 混合精度训练 echo -n "测试 10: Conv2d + FP16 混合精度训练... " -docker run --rm --gpus all "$IMAGE" python3.11 -c " +docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -529,7 +529,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" python3.11 -c " + docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F -- Gitee From 29dedaee645df9727f7c34ed4befd30e37c0d0ce Mon Sep 17 00:00:00 2001 From: Anstarc Date: Fri, 8 May 2026 17:57:22 +0800 Subject: [PATCH 6/6] =?UTF-8?q?=E6=B7=BB=E5=8A=A0CUDA=E5=BA=93=E8=B7=AF?= =?UTF-8?q?=E5=BE=84=E5=88=B0LD=5FLIBRARY=5FPATH?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frameworks/deepspeed/0.18.4/Dockerfile | 2 +- frameworks/deepspeed/0.18.4/test.sh | 26 +++++++++++++------------- frameworks/deepspeed/0.18.5/Dockerfile | 2 +- frameworks/deepspeed/0.18.5/test.sh | 26 +++++++++++++------------- frameworks/deepspeed/0.18.6/Dockerfile | 2 +- frameworks/deepspeed/0.18.6/test.sh | 26 +++++++++++++------------- frameworks/deepspeed/0.18.7/Dockerfile | 2 +- frameworks/deepspeed/0.18.7/test.sh | 26 +++++++++++++------------- frameworks/deepspeed/0.18.8/Dockerfile | 2 +- frameworks/deepspeed/0.18.8/test.sh | 26 +++++++++++++------------- frameworks/deepspeed/0.18.9/Dockerfile | 2 +- frameworks/deepspeed/0.18.9/test.sh | 26 +++++++++++++------------- 12 files changed, 84 insertions(+), 84 deletions(-) diff --git a/frameworks/deepspeed/0.18.4/Dockerfile b/frameworks/deepspeed/0.18.4/Dockerfile index 4b35106..3f958b4 100644 --- a/frameworks/deepspeed/0.18.4/Dockerfile +++ b/frameworks/deepspeed/0.18.4/Dockerfile @@ -17,7 +17,7 @@ RUN dnf install -y \ # 设置 MPI 环境变量 ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64 RUN ln -s /usr/bin/python3.11 /usr/bin/python diff --git a/frameworks/deepspeed/0.18.4/test.sh b/frameworks/deepspeed/0.18.4/test.sh index 14516aa..a6f7c81 100644 --- a/frameworks/deepspeed/0.18.4/test.sh +++ 
b/frameworks/deepspeed/0.18.4/test.sh @@ -21,7 +21,7 @@ fi # 测试 2: DeepSpeed 版本 echo -n "测试 2: DeepSpeed 版本... " -VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'") +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") if [ "$VERSION" = "0.18.4" ]; then echo "✓ (版本: $VERSION)" else @@ -57,7 +57,7 @@ fi # 测试 5: DeepSpeed 基础导入 echo -n "测试 5: DeepSpeed 基础导入... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator @@ -73,7 +73,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator print(f'DeepSpeed version: {deepspeed.__version__}') @@ -85,7 +85,7 @@ fi # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 echo -n "测试 6: 简单 CNN 模型初始化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -186,7 +186,7 @@ fi # 测试 7: 训练步骤(前向+反向+优化) echo -n "测试 7: 训练步骤... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -248,7 +248,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -279,7 +279,7 @@ fi # 测试 8: ZeRO Stage 1 优化 echo -n "测试 8: ZeRO Stage 1 优化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -323,7 +323,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -350,7 +350,7 @@ fi # 测试 9: ZeRO Stage 2 优化 echo -n "测试 9: ZeRO Stage 2 优化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -423,7 +423,7 @@ fi # 测试 10: Conv2d + FP16 混合精度训练 echo -n "测试 10: Conv2d + FP16 混合精度训练... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -529,7 +529,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F diff --git a/frameworks/deepspeed/0.18.5/Dockerfile b/frameworks/deepspeed/0.18.5/Dockerfile index 07a5091..bfc4238 100644 --- a/frameworks/deepspeed/0.18.5/Dockerfile +++ b/frameworks/deepspeed/0.18.5/Dockerfile @@ -17,7 +17,7 @@ RUN dnf install -y \ # 设置 MPI 环境变量 ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64 RUN ln -s /usr/bin/python3.11 /usr/bin/python diff --git a/frameworks/deepspeed/0.18.5/test.sh b/frameworks/deepspeed/0.18.5/test.sh index bc757f7..b2cde17 100644 --- a/frameworks/deepspeed/0.18.5/test.sh +++ b/frameworks/deepspeed/0.18.5/test.sh @@ -21,7 +21,7 @@ fi # 测试 2: DeepSpeed 版本 echo -n "测试 2: DeepSpeed 版本... " -VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'") +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") if [ "$VERSION" = "0.18.5" ]; then echo "✓ (版本: $VERSION)" else @@ -57,7 +57,7 @@ fi # 测试 5: DeepSpeed 基础导入 echo -n "测试 5: DeepSpeed 基础导入... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator @@ -73,7 +73,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator print(f'DeepSpeed version: {deepspeed.__version__}') @@ -85,7 +85,7 @@ fi # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 echo -n "测试 6: 简单 CNN 模型初始化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -186,7 +186,7 @@ fi # 测试 7: 训练步骤(前向+反向+优化) echo -n "测试 7: 训练步骤... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -279,7 +279,7 @@ fi # 测试 8: ZeRO Stage 1 优化 echo -n "测试 8: ZeRO Stage 1 优化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -323,7 +323,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -350,7 +350,7 @@ fi # 测试 9: ZeRO Stage 2 优化 echo -n "测试 9: ZeRO Stage 2 优化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -423,7 +423,7 @@ fi # 测试 10: Conv2d + FP16 混合精度训练 echo -n "测试 10: Conv2d + FP16 混合精度训练... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -529,7 +529,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F diff --git a/frameworks/deepspeed/0.18.6/Dockerfile b/frameworks/deepspeed/0.18.6/Dockerfile index d76a4d0..b93a2bf 100644 --- a/frameworks/deepspeed/0.18.6/Dockerfile +++ b/frameworks/deepspeed/0.18.6/Dockerfile @@ -17,7 +17,7 @@ RUN dnf install -y \ # 设置 MPI 环境变量 ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64 RUN ln -s /usr/bin/python3.11 /usr/bin/python diff --git a/frameworks/deepspeed/0.18.6/test.sh b/frameworks/deepspeed/0.18.6/test.sh index 92e48ca..c2a69d6 100644 --- a/frameworks/deepspeed/0.18.6/test.sh +++ b/frameworks/deepspeed/0.18.6/test.sh @@ -21,7 +21,7 @@ fi # 测试 2: DeepSpeed 版本 echo -n "测试 2: DeepSpeed 版本... " -VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'") +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") if [ "$VERSION" = "0.18.6" ]; then echo "✓ (版本: $VERSION)" else @@ -57,7 +57,7 @@ fi # 测试 5: DeepSpeed 基础导入 echo -n "测试 5: DeepSpeed 基础导入... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator @@ -73,7 +73,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator print(f'DeepSpeed version: {deepspeed.__version__}') @@ -85,7 +85,7 @@ fi # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 echo -n "测试 6: 简单 CNN 模型初始化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -186,7 +186,7 @@ fi # 测试 7: 训练步骤(前向+反向+优化) echo -n "测试 7: 训练步骤... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -279,7 +279,7 @@ fi # 测试 8: ZeRO Stage 1 优化 echo -n "测试 8: ZeRO Stage 1 优化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -323,7 +323,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -350,7 +350,7 @@ fi # 测试 9: ZeRO Stage 2 优化 echo -n "测试 9: ZeRO Stage 2 优化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -423,7 +423,7 @@ fi # 测试 10: Conv2d + FP16 混合精度训练 echo -n "测试 10: Conv2d + FP16 混合精度训练... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -529,7 +529,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F diff --git a/frameworks/deepspeed/0.18.7/Dockerfile b/frameworks/deepspeed/0.18.7/Dockerfile index 8bfea71..feb3dac 100644 --- a/frameworks/deepspeed/0.18.7/Dockerfile +++ b/frameworks/deepspeed/0.18.7/Dockerfile @@ -17,7 +17,7 @@ RUN dnf install -y \ # 设置 MPI 环境变量 ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64 RUN ln -s /usr/bin/python3.11 /usr/bin/python diff --git a/frameworks/deepspeed/0.18.7/test.sh b/frameworks/deepspeed/0.18.7/test.sh index 16c9b65..11a5177 100644 --- a/frameworks/deepspeed/0.18.7/test.sh +++ b/frameworks/deepspeed/0.18.7/test.sh @@ -21,7 +21,7 @@ fi # 测试 2: DeepSpeed 版本 echo -n "测试 2: DeepSpeed 版本... " -VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'") +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") if [ "$VERSION" = "0.18.7" ]; then echo "✓ (版本: $VERSION)" else @@ -57,7 +57,7 @@ fi # 测试 5: DeepSpeed 基础导入 echo -n "测试 5: DeepSpeed 基础导入... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator @@ -73,7 +73,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator print(f'DeepSpeed version: {deepspeed.__version__}') @@ -85,7 +85,7 @@ fi # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 echo -n "测试 6: 简单 CNN 模型初始化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -186,7 +186,7 @@ fi # 测试 7: 训练步骤(前向+反向+优化) echo -n "测试 7: 训练步骤... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -279,7 +279,7 @@ fi # 测试 8: ZeRO Stage 1 优化 echo -n "测试 8: ZeRO Stage 1 优化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -323,7 +323,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -350,7 +350,7 @@ fi # 测试 9: ZeRO Stage 2 优化 echo -n "测试 9: ZeRO Stage 2 优化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -423,7 +423,7 @@ fi # 测试 10: Conv2d + FP16 混合精度训练 echo -n "测试 10: Conv2d + FP16 混合精度训练... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -529,7 +529,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F diff --git a/frameworks/deepspeed/0.18.8/Dockerfile b/frameworks/deepspeed/0.18.8/Dockerfile index a46adb2..78c36cb 100644 --- a/frameworks/deepspeed/0.18.8/Dockerfile +++ b/frameworks/deepspeed/0.18.8/Dockerfile @@ -17,7 +17,7 @@ RUN dnf install -y \ # 设置 MPI 环境变量 ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64 RUN ln -s /usr/bin/python3.11 /usr/bin/python diff --git a/frameworks/deepspeed/0.18.8/test.sh b/frameworks/deepspeed/0.18.8/test.sh index 8c8cc23..ffce0af 100644 --- a/frameworks/deepspeed/0.18.8/test.sh +++ b/frameworks/deepspeed/0.18.8/test.sh @@ -21,7 +21,7 @@ fi # 测试 2: DeepSpeed 版本 echo -n "测试 2: DeepSpeed 版本... " -VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'") +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") if [ "$VERSION" = "0.18.8" ]; then echo "✓ (版本: $VERSION)" else @@ -57,7 +57,7 @@ fi # 测试 5: DeepSpeed 基础导入 echo -n "测试 5: DeepSpeed 基础导入... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator @@ -73,7 +73,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator print(f'DeepSpeed version: {deepspeed.__version__}') @@ -85,7 +85,7 @@ fi # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 echo -n "测试 6: 简单 CNN 模型初始化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -186,7 +186,7 @@ fi # 测试 7: 训练步骤(前向+反向+优化) echo -n "测试 7: 训练步骤... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -279,7 +279,7 @@ fi # 测试 8: ZeRO Stage 1 优化 echo -n "测试 8: ZeRO Stage 1 优化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -323,7 +323,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -350,7 +350,7 @@ fi # 测试 9: ZeRO Stage 2 优化 echo -n "测试 9: ZeRO Stage 2 优化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -423,7 +423,7 @@ fi # 测试 10: Conv2d + FP16 混合精度训练 echo -n "测试 10: Conv2d + FP16 混合精度训练... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -529,7 +529,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F diff --git a/frameworks/deepspeed/0.18.9/Dockerfile b/frameworks/deepspeed/0.18.9/Dockerfile index 75aad1c..1c3c236 100644 --- a/frameworks/deepspeed/0.18.9/Dockerfile +++ b/frameworks/deepspeed/0.18.9/Dockerfile @@ -17,7 +17,7 @@ RUN dnf install -y \ # 设置 MPI 环境变量 ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64 RUN ln -s /usr/bin/python3.11 /usr/bin/python diff --git a/frameworks/deepspeed/0.18.9/test.sh b/frameworks/deepspeed/0.18.9/test.sh index bd3aae0..cd9ee2d 100644 --- a/frameworks/deepspeed/0.18.9/test.sh +++ b/frameworks/deepspeed/0.18.9/test.sh @@ -21,7 +21,7 @@ fi # 测试 2: DeepSpeed 版本 echo -n "测试 2: DeepSpeed 版本... " -VERSION=$(docker run --rm "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:\$PATH && python3.11 -c 'import deepspeed; print(deepspeed.__version__)'") +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") if [ "$VERSION" = "0.18.9" ]; then echo "✓ (版本: $VERSION)" else @@ -57,7 +57,7 @@ fi # 测试 5: DeepSpeed 基础导入 echo -n "测试 5: DeepSpeed 基础导入... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator @@ -73,7 +73,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import deepspeed from deepspeed.accelerator import get_accelerator print(f'DeepSpeed version: {deepspeed.__version__}') @@ -85,7 +85,7 @@ fi # 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 echo -n "测试 6: 简单 CNN 模型初始化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -152,7 +152,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -186,7 +186,7 @@ fi # 测试 7: 训练步骤(前向+反向+优化) echo -n "测试 7: 训练步骤... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -248,7 +248,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -279,7 +279,7 @@ fi # 测试 8: ZeRO Stage 1 优化 echo -n "测试 8: ZeRO Stage 1 优化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -323,7 +323,7 @@ if [ $? 
-eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -350,7 +350,7 @@ fi # 测试 9: ZeRO Stage 2 优化 echo -n "测试 9: ZeRO Stage 2 优化... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -396,7 +396,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import deepspeed @@ -423,7 +423,7 @@ fi # 测试 10: Conv2d + FP16 混合精度训练 echo -n "测试 10: Conv2d + FP16 混合精度训练... " -docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " +docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F @@ -529,7 +529,7 @@ if [ $? -eq 0 ]; then echo "✓" else echo "✗" - docker run --rm --gpus all "$IMAGE" bash -c "export PATH=/usr/local/cuda/bin:$PATH && python3.11 -c " + docker run --rm --gpus all "$IMAGE" python3.11 -c " import torch import torch.nn as nn import torch.nn.functional as F -- Gitee