diff --git a/frameworks/deepspeed/0.18.4/Dockerfile b/frameworks/deepspeed/0.18.4/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..3f958b4d6dbaf8f3299051b5c252ab97dd360530
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/Dockerfile
@@ -0,0 +1,60 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc "
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.4 GPU on OpenCloudOS 9"
+
+# Install system dependencies (dnf keeps its cache under /var/cache/dnf, so remove that too)
+RUN dnf install -y \
+    python3.11 \
+    python3.11-pip \
+    git \
+    wget \
+    openmpi \
+    openmpi-devel \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf /var/cache/yum
+
+# Put OpenMPI and CUDA on the search paths; keep any LD_LIBRARY_PATH inherited from the base image
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Set the working directory
+WORKDIR /app
+
+# Install PyTorch (CUDA 12.8 wheels)
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Install DeepSpeed dependencies
+RUN pip3.11 install --no-cache-dir \
+    transformers \
+    datasets \
+    evaluate \
+    ninja \
+    psutil \
+    deepspeed-kernels \
+    py-cpuinfo \
+    pydantic \
+    hjson \
+    mpi4py \
+    "setuptools<82" \
+    wheel
+
+# Select the DeepSpeed accelerator
+ENV DS_ACCELERATOR=cuda
+
+# Build and install DeepSpeed (--no-cache-dir keeps the pip cache out of the layer, like the pip layers above)
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir deepspeed==0.18.4 --no-build-isolation
+
+# GPU runtime environment variables
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV CUDA_MODULE_LOADING=LAZY
+
+# Default command
+CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"]
diff --git a/frameworks/deepspeed/0.18.4/README.md b/frameworks/deepspeed/0.18.4/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9c6594f26b109fe526802a9665677aeb9d9a5b3c
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/README.md
@@ -0,0 +1,199 @@
+# DeepSpeed 0.18.4 GPU on OpenCloudOS 9
+
+DeepSpeed 是微软开源的深度学习优化库,提供 ZeRO
内存优化、混合精度训练、流水线并行等功能,支持超大规模模型训练。 + +## 基本信息 + +- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8 +- **Python 版本**: 3.11 +- **CUDA 版本**: 12.8 +- **DeepSpeed 版本**: 0.18.4 +- **PyTorch 版本**: 最新稳定版(CUDA 12.8) +- **MPI**: OpenMPI(支持多节点训练) + +## 构建 + +```bash +docker build -t oc9-deepspeed:0.18.4 . +``` + +## 测试 + +```bash +./test.sh oc9-deepspeed:0.18.4 +``` + +测试项包括: +- Python 和 CUDA 环境 +- DeepSpeed 版本验证 +- GPU 可用性检查 +- DeepSpeed 初始化 +- CNN 模型训练(CIFAR-10 风格) +- ZeRO Stage 1/2 优化 +- FP16 混合精度训练 + +## 使用示例 + +### 基本使用 + +```bash +# 查看版本信息 +docker run --rm oc9-deepspeed:0.18.4 python3.11 -c "import deepspeed; print(deepspeed.__version__)" + +# 查看可用设备 +docker run --rm --gpus all oc9-deepspeed:0.18.4 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" +``` + +### 单 GPU 训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +# 定义模型 +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练循环 +for batch in dataloader: + inputs, labels = batch + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### FP16 混合精度训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) + +# FP16 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型 +target_dtype = torch.half if 
model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练(单节点)
+
+```bash
+# 使用 deepspeed launcher(将当前目录挂载到镜像的工作目录 /app,这样才能找到 train.py)
+docker run --rm --gpus all -v $(pwd):/app oc9-deepspeed:0.18.4 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v $(pwd):/app oc9-deepspeed:0.18.4 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 micro_batch(4) × grad_accum(2) × GPU 数,此处对应 4 张 GPU
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.4 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化,支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 19.03+,并安装 NVIDIA Container Toolkit(原 nvidia-docker2)
+- **MPI**: 已包含 OpenMPI(多节点训练需要)
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.4/build.conf b/frameworks/deepspeed/0.18.4/build.conf
new file mode 100644
index
0000000000000000000000000000000000000000..2c23f5bc039712daab91c3e7ec3d71067485881d
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/build.conf
@@ -0,0 +1,4 @@
+# DeepSpeed 0.18.4 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.4
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.4/test.sh b/frameworks/deepspeed/0.18.4/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a6f7c81e1f51f99ad5bf7248e54d9c8e71863c1e
--- /dev/null
+++ b/frameworks/deepspeed/0.18.4/test.sh
@@ -0,0 +1,606 @@
+#!/bin/bash
+# NOTE: errexit (`set -e`) is intentionally not enabled. Every check below
+# inspects `$?` itself and prints diagnostics before `exit 1`; under errexit
+# the script would abort on the failing `docker run` and never reach them.
+set -u
+
+IMAGE="${1:-oc9-deepspeed:0.18.4}"
+
+echo "=========================================="
+echo "测试 DeepSpeed 0.18.4 GPU 镜像"
+echo "镜像: $IMAGE"
+echo "=========================================="
+
+# Test 1: Python environment
+echo -n "测试 1: Python 环境... "
+docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version
+    exit 1
+fi
+
+# Test 2: DeepSpeed version
+echo -n "测试 2: DeepSpeed 版本... "
+# deepspeed may emit log lines on stdout while importing; keep only the last line (the version)
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)" | tail -n 1)
+if [ "$VERSION" = "0.18.4" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.4, 实际: $VERSION)"
+    exit 1
+fi
+
+# Test 3: core dependencies
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers"
+    exit 1
+fi
+
+# Test 4: CUDA environment
+echo -n "测试 4: CUDA 环境... "
+GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print(torch.cuda.get_device_name(0))
+")
+if [ $? -eq 0 ]; then
+    echo "✓"
+    echo " GPU: $GPU_INFO"
+else
+    echo "✗"
+    exit 1
+fi
+
+# Test 5: basic DeepSpeed import
+echo -n "测试 5: DeepSpeed 基础导入... 
" +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator + +# 验证核心模块可用 +assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found' +assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found' + +# 验证加速器 +accelerator = get_accelerator() +assert accelerator is not None, 'Accelerator not available' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator +print(f'DeepSpeed version: {deepspeed.__version__}') +accelerator = get_accelerator() +print(f'Accelerator: {accelerator}') +" + exit 1 +fi + +# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 +echo -n "测试 6: 简单 CNN 模型初始化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +# 创建简单 CNN 模型(类似 CIFAR-10 示例) +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(torch.relu(self.conv1(x))) + x = self.pool(torch.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = torch.relu(self.fc1(x)) + x = torch.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 4, + 'gradient_accumulation_steps': 1, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'zero_optimization': { + 'stage': 0 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = 
deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +assert model_engine is not None, 'Model engine initialization failed' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = nn.Linear(10, 2) + def forward(self, x): + return self.fc(x) + +model = SimpleNet() +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('Model initialized successfully') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 7: 训练步骤(前向+反向+优化) +echo -n "测试 7: 训练步骤... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 模拟训练步骤 +for step in range(3): + # 创建假数据 (batch_size=4, channels=3, 
height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(10, 2) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +inputs = torch.randn(4, 10).to(model_engine.device) +labels = torch.randint(0, 2, (4,)).to(model_engine.device) +outputs = model_engine(inputs) +loss = torch.nn.functional.cross_entropy(outputs, labels) +model_engine.backward(loss) +model_engine.step() +print('Training step completed') +" + exit 1 +fi + +# 测试 8: ZeRO Stage 1 优化 +echo -n "测试 8: ZeRO Stage 1 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 1 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 1, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 简单训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 1} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 1 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 9: ZeRO Stage 2 优化 +echo -n "测试 9: ZeRO Stage 2 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 2 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'allgather_bucket_size': 50000000, + 'reduce_bucket_size': 50000000, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 2} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 2 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 10: Conv2d + FP16 混合精度训练 +echo -n "测试 10: Conv2d + FP16 混合精度训练... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# CIFAR-10 风格的 CNN 模型 +class CifarNet(nn.Module): + def __init__(self): + super(CifarNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = CifarNet() + +# FP16 配置(参考 DeepSpeed CIFAR 示例) +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'loss_scale_window': 500, + 'hysteresis': 2, + 'min_loss_scale': 1, + 'initial_scale_power': 15 + }, + 'zero_optimization': { + 'stage': 0 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + 
model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型(关键!参考官方 CIFAR 示例) +target_dtype = None +if model_engine.bfloat16_enabled(): + target_dtype = torch.bfloat16 +elif model_engine.fp16_enabled(): + target_dtype = torch.half + +# 验证 FP16 已启用 +assert model_engine.fp16_enabled(), 'FP16 not enabled' +assert target_dtype == torch.half, 'target_dtype should be torch.half' + +# 训练循环 +model_engine.train() + +for step in range(3): + # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 关键:手动转换输入数据类型为 FP16(参考官方示例) + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() + +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# 简化的 CNN 模型用于调试 +class SimpleConv(nn.Module): + def __init__(self): + super(SimpleConv, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.fc1 = nn.Linear(6 * 28 * 28, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = x.view(-1, 6 * 28 * 28) + x = self.fc1(x) + return x + +model = SimpleConv() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + }, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + + # 获取目标数据类型 + target_dtype = None + if model_engine.fp16_enabled(): + target_dtype = torch.half + + model_engine.train() + + # 训练步骤 + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = 
torch.randint(0, 10, (4,)).to(model_engine.device)
+
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+
+    outputs = model_engine(inputs)
+    loss = F.cross_entropy(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+
+    print(f'FP16 enabled: {model_engine.fp16_enabled()}')
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()
+"
+    exit 1
+fi
+
+echo ""
+echo "=========================================="
+echo "所有测试通过!"
+echo "=========================================="
diff --git a/frameworks/deepspeed/0.18.5/Dockerfile b/frameworks/deepspeed/0.18.5/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..bfc4238e3526244d6b50157888d9f2406e12d858
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/Dockerfile
@@ -0,0 +1,60 @@
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="Anstarc "
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="DeepSpeed 0.18.5 GPU on OpenCloudOS 9"
+
+# Install system dependencies (dnf keeps its cache under /var/cache/dnf, so remove that too)
+RUN dnf install -y \
+    python3.11 \
+    python3.11-pip \
+    git \
+    wget \
+    openmpi \
+    openmpi-devel \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf /var/cache/yum
+
+# Put OpenMPI and CUDA on the search paths; keep any LD_LIBRARY_PATH inherited from the base image
+ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Set the working directory
+WORKDIR /app
+
+# Install PyTorch (CUDA 12.8 wheels)
+RUN pip3.11 install --no-cache-dir \
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Install DeepSpeed dependencies
+RUN pip3.11 install --no-cache-dir \
+    transformers \
+    datasets \
+    evaluate \
+    ninja \
+    psutil \
+    deepspeed-kernels \
+    py-cpuinfo \
+    pydantic \
+    hjson \
+    mpi4py \
+    "setuptools<82" \
+    wheel
+
+# Select the DeepSpeed accelerator
+ENV DS_ACCELERATOR=cuda
+
+# Build and install DeepSpeed (--no-cache-dir keeps the pip cache out of the layer, like the pip layers above)
+RUN DS_BUILD_OPS=1 \
+    MAX_JOBS=8 \
+    pip3.11 install --no-cache-dir deepspeed==0.18.5 --no-build-isolation
+
+# GPU runtime environment variables
+ENV NVIDIA_VISIBLE_DEVICES=all +ENV CUDA_MODULE_LOADING=LAZY + +# 默认命令 +CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"] diff --git a/frameworks/deepspeed/0.18.5/README.md b/frameworks/deepspeed/0.18.5/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5a4760c96dcbb00e1100821b5928332739dea605 --- /dev/null +++ b/frameworks/deepspeed/0.18.5/README.md @@ -0,0 +1,199 @@ +# DeepSpeed 0.18.5 GPU on OpenCloudOS 9 + +DeepSpeed 是微软开源的深度学习优化库,提供 ZeRO 内存优化、混合精度训练、流水线并行等功能,支持超大规模模型训练。 + +## 基本信息 + +- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8 +- **Python 版本**: 3.11 +- **CUDA 版本**: 12.8 +- **DeepSpeed 版本**: 0.18.5 +- **PyTorch 版本**: 最新稳定版(CUDA 12.8) +- **MPI**: OpenMPI(支持多节点训练) + +## 构建 + +```bash +docker build -t oc9-deepspeed:0.18.5 . +``` + +## 测试 + +```bash +./test.sh oc9-deepspeed:0.18.5 +``` + +测试项包括: +- Python 和 CUDA 环境 +- DeepSpeed 版本验证 +- GPU 可用性检查 +- DeepSpeed 初始化 +- CNN 模型训练(CIFAR-10 风格) +- ZeRO Stage 1/2 优化 +- FP16 混合精度训练 + +## 使用示例 + +### 基本使用 + +```bash +# 查看版本信息 +docker run --rm oc9-deepspeed:0.18.5 python3.11 -c "import deepspeed; print(deepspeed.__version__)" + +# 查看可用设备 +docker run --rm --gpus all oc9-deepspeed:0.18.5 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" +``` + +### 单 GPU 训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +# 定义模型 +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练循环 +for batch in dataloader: + inputs, labels = batch + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + 
+### FP16 混合精度训练
+
+```python
+import torch
+import torch.nn as nn
+import deepspeed
+
+model = nn.Linear(100, 10)
+
+# FP16 配置
+ds_config = {
+    'train_batch_size': 16,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {'lr': 0.001}
+    },
+    'fp16': {
+        'enabled': True,
+        'loss_scale': 0,
+        'initial_scale_power': 15
+    }
+}
+
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    model=model,
+    model_parameters=model.parameters(),
+    config=ds_config
+)
+
+# 获取目标数据类型
+target_dtype = torch.half if model_engine.fp16_enabled() else None
+
+# 训练时转换输入数据类型
+for batch in dataloader:
+    inputs, labels = batch
+    if target_dtype is not None:
+        inputs = inputs.to(target_dtype)
+
+    outputs = model_engine(inputs)
+    loss = criterion(outputs, labels)
+    model_engine.backward(loss)
+    model_engine.step()
+```
+
+### 多 GPU 训练(单节点)
+
+```bash
+# 使用 deepspeed launcher(将当前目录挂载到镜像的工作目录 /app,这样才能找到 train.py)
+docker run --rm --gpus all -v $(pwd):/app oc9-deepspeed:0.18.5 \
+    deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json
+
+# 或使用 torchrun
+docker run --rm --gpus all -v $(pwd):/app oc9-deepspeed:0.18.5 \
+    torchrun --nproc_per_node=2 train.py
+```
+
+### ZeRO Stage 2 优化
+
+```python
+ds_config = {
+    'train_batch_size': 32,  # 必须等于 micro_batch(4) × grad_accum(2) × GPU 数,此处对应 4 张 GPU
+    'train_micro_batch_size_per_gpu': 4,
+    'gradient_accumulation_steps': 2,
+    'optimizer': {
+        'type': 'Adam',
+        'params': {
+            'lr': 0.001,
+            'betas': [0.9, 0.999],
+            'eps': 1e-8
+        }
+    },
+    'zero_optimization': {
+        'stage': 2,
+        'allgather_partitions': True,
+        'reduce_scatter': True,
+        'overlap_comm': True,
+        'contiguous_gradients': True
+    }
+}
+```
+
+### 交互式使用
+
+```bash
+docker run --rm -it --gpus all oc9-deepspeed:0.18.5 bash
+```
+
+## 支持的功能
+
+- **ZeRO 优化**: Stage 1/2/3 内存优化,支持超大模型训练
+- **混合精度**: FP16, BF16 自动混合精度训练
+- **梯度累积**: 模拟大 batch size 训练
+- **流水线并行**: 模型层间并行
+- **张量并行**: 层内并行
+- **CPU Offload**: 将优化器状态卸载到 CPU
+- **梯度检查点**: 减少显存占用
+- **分布式训练**: 单机多卡、多机多卡
+
+## 系统要求
+
+- **GPU**: NVIDIA GPU with CUDA 12.x support
+- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage)
+- **Docker**: 
19.03+,并安装 NVIDIA Container Toolkit(原 nvidia-docker2)
+- **MPI**: 已包含 OpenMPI(多节点训练需要)
+
+## 参考资源
+
+- [DeepSpeed 官方文档](https://www.deepspeed.ai/)
+- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed)
+- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples)
+- [ZeRO 论文](https://arxiv.org/abs/1910.02054)
diff --git a/frameworks/deepspeed/0.18.5/build.conf b/frameworks/deepspeed/0.18.5/build.conf
new file mode 100644
index 0000000000000000000000000000000000000000..c52c73493f31969074f9ed0a95de3b3aa86a59ed
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/build.conf
@@ -0,0 +1,4 @@
+# DeepSpeed 0.18.5 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-deepspeed
+IMAGE_TAG=0.18.5
+GPU_TEST=true
diff --git a/frameworks/deepspeed/0.18.5/test.sh b/frameworks/deepspeed/0.18.5/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b2cde173face9763e2fc2dfe69e3026826f6380d
--- /dev/null
+++ b/frameworks/deepspeed/0.18.5/test.sh
@@ -0,0 +1,606 @@
+#!/bin/bash
+# NOTE: errexit (`set -e`) is intentionally not enabled. Every check below
+# inspects `$?` itself and prints diagnostics before `exit 1`; under errexit
+# the script would abort on the failing `docker run` and never reach them.
+set -u
+
+IMAGE="${1:-oc9-deepspeed:0.18.5}"
+
+echo "=========================================="
+echo "测试 DeepSpeed 0.18.5 GPU 镜像"
+echo "镜像: $IMAGE"
+echo "=========================================="
+
+# Test 1: Python environment
+echo -n "测试 1: Python 环境... "
+docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓"
+else
+    echo "✗"
+    docker run --rm "$IMAGE" python3.11 --version
+    exit 1
+fi
+
+# Test 2: DeepSpeed version
+echo -n "测试 2: DeepSpeed 版本... "
+# deepspeed may emit log lines on stdout while importing; keep only the last line (the version)
+VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)" | tail -n 1)
+if [ "$VERSION" = "0.18.5" ]; then
+    echo "✓ (版本: $VERSION)"
+else
+    echo "✗ (期望: 0.18.5, 实际: $VERSION)"
+    exit 1
+fi
+
+# Test 3: core dependencies
+echo -n "测试 3: 核心依赖 (torch, transformers)... "
+docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1
+if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" + exit 1 +fi + +# 测试 4: CUDA 环境 +echo -n "测试 4: CUDA 环境... " +GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +assert torch.cuda.is_available(), 'CUDA not available' +print(torch.cuda.get_device_name(0)) +") +if [ $? -eq 0 ]; then + echo "✓" + echo " GPU: $GPU_INFO" +else + echo "✗" + exit 1 +fi + +# 测试 5: DeepSpeed 基础导入 +echo -n "测试 5: DeepSpeed 基础导入... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator + +# 验证核心模块可用 +assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found' +assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found' + +# 验证加速器 +accelerator = get_accelerator() +assert accelerator is not None, 'Accelerator not available' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator +print(f'DeepSpeed version: {deepspeed.__version__}') +accelerator = get_accelerator() +print(f'Accelerator: {accelerator}') +" + exit 1 +fi + +# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 +echo -n "测试 6: 简单 CNN 模型初始化... 
" +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +# 创建简单 CNN 模型(类似 CIFAR-10 示例) +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(torch.relu(self.conv1(x))) + x = self.pool(torch.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = torch.relu(self.fc1(x)) + x = torch.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 4, + 'gradient_accumulation_steps': 1, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'zero_optimization': { + 'stage': 0 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +assert model_engine is not None, 'Model engine initialization failed' +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = nn.Linear(10, 2) + def forward(self, x): + return self.fc(x) + +model = SimpleNet() +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('Model initialized successfully') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 7: 训练步骤(前向+反向+优化) +echo -n "测试 7: 训练步骤... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 模拟训练步骤 +for step in range(3): + # 创建假数据 (batch_size=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 前向传播 + outputs = model_engine(inputs) + loss = 
F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(10, 2) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +inputs = torch.randn(4, 10).to(model_engine.device) +labels = torch.randint(0, 2, (4,)).to(model_engine.device) +outputs = model_engine(inputs) +loss = torch.nn.functional.cross_entropy(outputs, labels) +model_engine.backward(loss) +model_engine.step() +print('Training step completed') +" + exit 1 +fi + +# 测试 8: ZeRO Stage 1 优化 +echo -n "测试 8: ZeRO Stage 1 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 1 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 1, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 简单训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 1} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 1 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 9: ZeRO Stage 2 优化 +echo -n "测试 9: ZeRO Stage 2 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 2 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'allgather_bucket_size': 50000000, + 'reduce_bucket_size': 50000000, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 2} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 2 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 10: Conv2d + FP16 混合精度训练 +echo -n "测试 10: Conv2d + FP16 混合精度训练... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# CIFAR-10 风格的 CNN 模型 +class CifarNet(nn.Module): + def __init__(self): + super(CifarNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = CifarNet() + +# FP16 配置(参考 DeepSpeed CIFAR 示例) +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'loss_scale_window': 500, + 'hysteresis': 2, + 'min_loss_scale': 1, + 'initial_scale_power': 15 + }, + 'zero_optimization': { + 'stage': 0 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + 
model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型(关键!参考官方 CIFAR 示例) +target_dtype = None +if model_engine.bfloat16_enabled(): + target_dtype = torch.bfloat16 +elif model_engine.fp16_enabled(): + target_dtype = torch.half + +# 验证 FP16 已启用 +assert model_engine.fp16_enabled(), 'FP16 not enabled' +assert target_dtype == torch.half, 'target_dtype should be torch.half' + +# 训练循环 +model_engine.train() + +for step in range(3): + # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 关键:手动转换输入数据类型为 FP16(参考官方示例) + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() + +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# 简化的 CNN 模型用于调试 +class SimpleConv(nn.Module): + def __init__(self): + super(SimpleConv, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.fc1 = nn.Linear(6 * 28 * 28, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = x.view(-1, 6 * 28 * 28) + x = self.fc1(x) + return x + +model = SimpleConv() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + }, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + + # 获取目标数据类型 + target_dtype = None + if model_engine.fp16_enabled(): + target_dtype = torch.half + + model_engine.train() + + # 训练步骤 + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = 
torch.randint(0, 10, (4,)).to(model_engine.device) + + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + model_engine.backward(loss) + model_engine.step() + + print(f'FP16 enabled: {model_engine.fp16_enabled()}') +except Exception as e: + print(f'Error: {e}') + import traceback + traceback.print_exc() +" + exit 1 +fi + +echo "" +echo "==========================================" +echo "所有测试通过!" +echo "==========================================" diff --git a/frameworks/deepspeed/0.18.6/Dockerfile b/frameworks/deepspeed/0.18.6/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..b93a2bfc6fc4f1f33ae53974f9140e1424c914a9 --- /dev/null +++ b/frameworks/deepspeed/0.18.6/Dockerfile @@ -0,0 +1,60 @@ +FROM opencloudos/opencloudos9-cuda-devel:12.8 + +LABEL maintainer="Anstarc " +LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" +LABEL org.opencontainers.image.description="DeepSpeed 0.18.6 GPU on OpenCloudOS 9" + +# Install system dependencies; the package manager is dnf, so its cache lives under /var/cache/dnf +RUN dnf install -y \ + python3.11 \ + python3.11-pip \ + git \ + wget \ + openmpi \ + openmpi-devel \ + && dnf clean all \ + && rm -rf /var/cache/dnf/* + +# Put the OpenMPI and CUDA toolchains on the search paths, keeping any library path inherited from the base image +ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} + +RUN ln -s /usr/bin/python3.11 /usr/bin/python + +# Set the working directory +WORKDIR /app + +# Install PyTorch (CUDA 12.8 wheels) +RUN pip3.11 install --no-cache-dir \ + torch torchvision \ + --index-url https://download.pytorch.org/whl/cu128 + +# Install DeepSpeed's Python dependencies +RUN pip3.11 install --no-cache-dir \ + transformers \ + datasets \ + evaluate \ + ninja \ + psutil \ + deepspeed-kernels \ + py-cpuinfo \ + pydantic \ + hjson \ + mpi4py \ + "setuptools<82" \ + wheel + +# Select the CUDA accelerator for DeepSpeed +ENV DS_ACCELERATOR=cuda + +# Build and install DeepSpeed with its ops pre-compiled (DS_BUILD_OPS=1); --no-cache-dir matches the other pip layers +RUN DS_BUILD_OPS=1 \ + MAX_JOBS=8 \ + pip3.11 install --no-cache-dir deepspeed==0.18.6 --no-build-isolation + +# GPU runtime environment variables 
+ENV NVIDIA_VISIBLE_DEVICES=all +ENV CUDA_MODULE_LOADING=LAZY + +# 默认命令 +CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"] diff --git a/frameworks/deepspeed/0.18.6/README.md b/frameworks/deepspeed/0.18.6/README.md new file mode 100644 index 0000000000000000000000000000000000000000..09b10474155cd4da2b8e8a3c6a436dacc8594046 --- /dev/null +++ b/frameworks/deepspeed/0.18.6/README.md @@ -0,0 +1,199 @@ +# DeepSpeed 0.18.6 GPU on OpenCloudOS 9 + +DeepSpeed 是微软开源的深度学习优化库,提供 ZeRO 内存优化、混合精度训练、流水线并行等功能,支持超大规模模型训练。 + +## 基本信息 + +- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8 +- **Python 版本**: 3.11 +- **CUDA 版本**: 12.8 +- **DeepSpeed 版本**: 0.18.6 +- **PyTorch 版本**: 最新稳定版(CUDA 12.8) +- **MPI**: OpenMPI(支持多节点训练) + +## 构建 + +```bash +docker build -t oc9-deepspeed:0.18.6 . +``` + +## 测试 + +```bash +./test.sh oc9-deepspeed:0.18.6 +``` + +测试项包括: +- Python 和 CUDA 环境 +- DeepSpeed 版本验证 +- GPU 可用性检查 +- DeepSpeed 初始化 +- CNN 模型训练(CIFAR-10 风格) +- ZeRO Stage 1/2 优化 +- FP16 混合精度训练 + +## 使用示例 + +### 基本使用 + +```bash +# 查看版本信息 +docker run --rm oc9-deepspeed:0.18.6 python3.11 -c "import deepspeed; print(deepspeed.__version__)" + +# 查看可用设备 +docker run --rm --gpus all oc9-deepspeed:0.18.6 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" +``` + +### 单 GPU 训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +# 定义模型 +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练循环 +for batch in dataloader: + inputs, labels = batch + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + 
+### FP16 混合精度训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) + +# FP16 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型 +target_dtype = torch.half if model_engine.fp16_enabled() else None + +# 训练时转换输入数据类型 +for batch in dataloader: + inputs, labels = batch + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### 多 GPU 训练(单节点) + +```bash +# 使用 deepspeed launcher(镜像默认工作目录为 /app,需用 -w 切换到挂载目录才能找到 train.py) +docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.6 \ + deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json + +# 或使用 torchrun +docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.6 \ + torchrun --nproc_per_node=2 train.py +``` + +### ZeRO Stage 2 优化 + +```python +ds_config = { + 'train_batch_size': 32, # 必须等于 micro batch(4) × 梯度累积步数(2) × GPU 数量(此处对应 4 卡) + 'train_micro_batch_size_per_gpu': 4, + 'gradient_accumulation_steps': 2, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.9, 0.999], + 'eps': 1e-8 + } + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} +``` + +### 交互式使用 + +```bash +docker run --rm -it --gpus all oc9-deepspeed:0.18.6 bash +``` + +## 支持的功能 + +- **ZeRO 优化**: Stage 1/2/3 内存优化,支持超大模型训练 +- **混合精度**: FP16, BF16 自动混合精度训练 +- **梯度累积**: 模拟大 batch size 训练 +- **流水线并行**: 模型层间并行 +- **张量并行**: 层内并行 +- **CPU Offload**: 将优化器状态卸载到 CPU +- **梯度检查点**: 减少显存占用 +- **分布式训练**: 单机多卡、多机多卡 + +## 系统要求 + +- **GPU**: NVIDIA GPU with CUDA 12.x support +- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage) +- **Docker**: 
19.03+ with nvidia-docker2 +- **MPI**: 已包含 OpenMPI(多节点训练需要) + +## 参考资源 + +- [DeepSpeed 官方文档](https://www.deepspeed.ai/) +- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed) +- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples) +- [ZeRO 论文](https://arxiv.org/abs/1910.02054) diff --git a/frameworks/deepspeed/0.18.6/build.conf b/frameworks/deepspeed/0.18.6/build.conf new file mode 100644 index 0000000000000000000000000000000000000000..205926fb2bfdf39b9a3df1db7886ec9e79dd4623 --- /dev/null +++ b/frameworks/deepspeed/0.18.6/build.conf @@ -0,0 +1,4 @@ +# DeepSpeed 0.18.6 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-deepspeed +IMAGE_TAG=0.18.6 +GPU_TEST=true diff --git a/frameworks/deepspeed/0.18.6/test.sh b/frameworks/deepspeed/0.18.6/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..c2a69d60e0dc1853497fadaba8c8802ed8d9f4a3 --- /dev/null +++ b/frameworks/deepspeed/0.18.6/test.sh @@ -0,0 +1,602 @@ +#!/bin/bash +set -uo pipefail # errexit (-e) is deliberately off: each test inspects $? itself so the failure branch and its diagnostic re-run can execute + +IMAGE="${1:-oc9-deepspeed:0.18.6}" + +echo "==========================================" +echo "测试 DeepSpeed 0.18.6 GPU 镜像" +echo "镜像: $IMAGE" +echo "==========================================" + +# Test 1: Python interpreter is present +echo -n "测试 1: Python 环境... " +docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 --version + exit 1 +fi + +# Test 2: DeepSpeed version; keep only the last stdout line so any import-time log output cannot pollute the comparison +echo -n "测试 2: DeepSpeed 版本... " +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)" | tail -n 1) +if [ "$VERSION" = "0.18.6" ]; then + echo "✓ (版本: $VERSION)" +else + echo "✗ (期望: 0.18.6, 实际: $VERSION)" + exit 1 +fi + +# Test 3: core dependencies import cleanly +echo -n "测试 3: 核心依赖 (torch, transformers)... " +docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" + exit 1 +fi + +# 测试 4: CUDA 环境 +echo -n "测试 4: CUDA 环境... " +GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +assert torch.cuda.is_available(), 'CUDA not available' +print(torch.cuda.get_device_name(0)) +") +if [ $? -eq 0 ]; then + echo "✓" + echo " GPU: $GPU_INFO" +else + echo "✗" + exit 1 +fi + +# 测试 5: DeepSpeed 基础导入 +echo -n "测试 5: DeepSpeed 基础导入... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator + +# 验证核心模块可用 +assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found' +assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found' + +# 验证加速器 +accelerator = get_accelerator() +assert accelerator is not None, 'Accelerator not available' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator +print(f'DeepSpeed version: {deepspeed.__version__}') +accelerator = get_accelerator() +print(f'Accelerator: {accelerator}') +" + exit 1 +fi + +# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 +echo -n "测试 6: 简单 CNN 模型初始化... 
" +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +# 创建简单 CNN 模型(类似 CIFAR-10 示例) +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(torch.relu(self.conv1(x))) + x = self.pool(torch.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = torch.relu(self.fc1(x)) + x = torch.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 4, + 'gradient_accumulation_steps': 1, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'zero_optimization': { + 'stage': 0 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +assert model_engine is not None, 'Model engine initialization failed' +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = nn.Linear(10, 2) + def forward(self, x): + return self.fc(x) + +model = SimpleNet() +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('Model initialized successfully') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 7: 训练步骤(前向+反向+优化) +echo -n "测试 7: 训练步骤... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 模拟训练步骤 +for step in range(3): + # 创建假数据 (batch_size=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 前向传播 + outputs = model_engine(inputs) + loss = 
F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(10, 2) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +inputs = torch.randn(4, 10).to(model_engine.device) +labels = torch.randint(0, 2, (4,)).to(model_engine.device) +outputs = model_engine(inputs) +loss = torch.nn.functional.cross_entropy(outputs, labels) +model_engine.backward(loss) +model_engine.step() +print('Training step completed') +" + exit 1 +fi + +# 测试 8: ZeRO Stage 1 优化 +echo -n "测试 8: ZeRO Stage 1 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 1 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 1, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 简单训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 1} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 1 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 9: ZeRO Stage 2 优化 +echo -n "测试 9: ZeRO Stage 2 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 2 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'allgather_bucket_size': 50000000, + 'reduce_bucket_size': 50000000, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 2} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 2 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 10: Conv2d + FP16 混合精度训练 +echo -n "测试 10: Conv2d + FP16 混合精度训练... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# CIFAR-10 风格的 CNN 模型 +class CifarNet(nn.Module): + def __init__(self): + super(CifarNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = CifarNet() + +# FP16 配置(参考 DeepSpeed CIFAR 示例) +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'loss_scale_window': 500, + 'hysteresis': 2, + 'min_loss_scale': 1, + 'initial_scale_power': 15 + }, + 'zero_optimization': { + 'stage': 0 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + 
model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型(关键!参考官方 CIFAR 示例) +target_dtype = None +if model_engine.bfloat16_enabled(): + target_dtype = torch.bfloat16 +elif model_engine.fp16_enabled(): + target_dtype = torch.half + +# 验证 FP16 已启用 +assert model_engine.fp16_enabled(), 'FP16 not enabled' +assert target_dtype == torch.half, 'target_dtype should be torch.half' + +# 训练循环 +model_engine.train() + +for step in range(3): + # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 关键:手动转换输入数据类型为 FP16(参考官方示例) + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() + +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# 简化的 CNN 模型用于调试 +class SimpleConv(nn.Module): + def __init__(self): + super(SimpleConv, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.fc1 = nn.Linear(6 * 28 * 28, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = x.view(-1, 6 * 28 * 28) + x = self.fc1(x) + return x + +model = SimpleConv() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + }, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + + # 获取目标数据类型 + target_dtype = None + if model_engine.fp16_enabled(): + target_dtype = torch.half + + model_engine.train() + + # 训练步骤 + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = 
torch.randint(0, 10, (4,)).to(model_engine.device) + + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + model_engine.backward(loss) + model_engine.step() + + print(f'FP16 enabled: {model_engine.fp16_enabled()}') +except Exception as e: + print(f'Error: {e}') + import traceback + traceback.print_exc() +" + exit 1 +fi + +echo "" +echo "==========================================" +echo "所有测试通过!" +echo "==========================================" diff --git a/frameworks/deepspeed/0.18.7/Dockerfile b/frameworks/deepspeed/0.18.7/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..feb3dac45e7ee8d918e06c3f6056d94a5a88f57d --- /dev/null +++ b/frameworks/deepspeed/0.18.7/Dockerfile @@ -0,0 +1,60 @@ +FROM opencloudos/opencloudos9-cuda-devel:12.8 + +LABEL maintainer="Anstarc " +LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" +LABEL org.opencontainers.image.description="DeepSpeed 0.18.7 GPU on OpenCloudOS 9" + +# Install system dependencies; the package manager is dnf, so its cache lives under /var/cache/dnf +RUN dnf install -y \ + python3.11 \ + python3.11-pip \ + git \ + wget \ + openmpi \ + openmpi-devel \ + && dnf clean all \ + && rm -rf /var/cache/dnf/* + +# Put the OpenMPI and CUDA toolchains on the search paths, keeping any library path inherited from the base image +ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} + +RUN ln -s /usr/bin/python3.11 /usr/bin/python + +# Set the working directory +WORKDIR /app + +# Install PyTorch (CUDA 12.8 wheels) +RUN pip3.11 install --no-cache-dir \ + torch torchvision \ + --index-url https://download.pytorch.org/whl/cu128 + +# Install DeepSpeed's Python dependencies +RUN pip3.11 install --no-cache-dir \ + transformers \ + datasets \ + evaluate \ + ninja \ + psutil \ + deepspeed-kernels \ + py-cpuinfo \ + pydantic \ + hjson \ + mpi4py \ + "setuptools<82" \ + wheel + +# Select the CUDA accelerator for DeepSpeed +ENV DS_ACCELERATOR=cuda + +# Build and install DeepSpeed with its ops pre-compiled (DS_BUILD_OPS=1); --no-cache-dir matches the other pip layers +RUN DS_BUILD_OPS=1 \ + MAX_JOBS=8 \ + pip3.11 install --no-cache-dir deepspeed==0.18.7 --no-build-isolation + +# GPU runtime environment variables 
+ENV NVIDIA_VISIBLE_DEVICES=all +ENV CUDA_MODULE_LOADING=LAZY + +# 默认命令 +CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"] diff --git a/frameworks/deepspeed/0.18.7/README.md b/frameworks/deepspeed/0.18.7/README.md new file mode 100644 index 0000000000000000000000000000000000000000..060c83a0115b677465fd85714b091edb8bde646d --- /dev/null +++ b/frameworks/deepspeed/0.18.7/README.md @@ -0,0 +1,199 @@ +# DeepSpeed 0.18.7 GPU on OpenCloudOS 9 + +DeepSpeed 是微软开源的深度学习优化库,提供 ZeRO 内存优化、混合精度训练、流水线并行等功能,支持超大规模模型训练。 + +## 基本信息 + +- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8 +- **Python 版本**: 3.11 +- **CUDA 版本**: 12.8 +- **DeepSpeed 版本**: 0.18.7 +- **PyTorch 版本**: 最新稳定版(CUDA 12.8) +- **MPI**: OpenMPI(支持多节点训练) + +## 构建 + +```bash +docker build -t oc9-deepspeed:0.18.7 . +``` + +## 测试 + +```bash +./test.sh oc9-deepspeed:0.18.7 +``` + +测试项包括: +- Python 和 CUDA 环境 +- DeepSpeed 版本验证 +- GPU 可用性检查 +- DeepSpeed 初始化 +- CNN 模型训练(CIFAR-10 风格) +- ZeRO Stage 1/2 优化 +- FP16 混合精度训练 + +## 使用示例 + +### 基本使用 + +```bash +# 查看版本信息 +docker run --rm oc9-deepspeed:0.18.7 python3.11 -c "import deepspeed; print(deepspeed.__version__)" + +# 查看可用设备 +docker run --rm --gpus all oc9-deepspeed:0.18.7 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" +``` + +### 单 GPU 训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +# 定义模型 +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练循环 +for batch in dataloader: + inputs, labels = batch + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + 
+### FP16 混合精度训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) + +# FP16 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型 +target_dtype = torch.half if model_engine.fp16_enabled() else None + +# 训练时转换输入数据类型 +for batch in dataloader: + inputs, labels = batch + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### 多 GPU 训练(单节点) + +```bash +# 使用 deepspeed launcher(镜像默认工作目录为 /app,需用 -w 切换到挂载目录才能找到 train.py) +docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.7 \ + deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json + +# 或使用 torchrun +docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.7 \ + torchrun --nproc_per_node=2 train.py +``` + +### ZeRO Stage 2 优化 + +```python +ds_config = { + 'train_batch_size': 32, # 必须等于 micro batch(4) × 梯度累积步数(2) × GPU 数量(此处对应 4 卡) + 'train_micro_batch_size_per_gpu': 4, + 'gradient_accumulation_steps': 2, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.9, 0.999], + 'eps': 1e-8 + } + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} +``` + +### 交互式使用 + +```bash +docker run --rm -it --gpus all oc9-deepspeed:0.18.7 bash +``` + +## 支持的功能 + +- **ZeRO 优化**: Stage 1/2/3 内存优化,支持超大模型训练 +- **混合精度**: FP16, BF16 自动混合精度训练 +- **梯度累积**: 模拟大 batch size 训练 +- **流水线并行**: 模型层间并行 +- **张量并行**: 层内并行 +- **CPU Offload**: 将优化器状态卸载到 CPU +- **梯度检查点**: 减少显存占用 +- **分布式训练**: 单机多卡、多机多卡 + +## 系统要求 + +- **GPU**: NVIDIA GPU with CUDA 12.x support +- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage) +- **Docker**: 
19.03+ with nvidia-docker2 +- **MPI**: 已包含 OpenMPI(多节点训练需要) + +## 参考资源 + +- [DeepSpeed 官方文档](https://www.deepspeed.ai/) +- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed) +- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples) +- [ZeRO 论文](https://arxiv.org/abs/1910.02054) diff --git a/frameworks/deepspeed/0.18.7/build.conf b/frameworks/deepspeed/0.18.7/build.conf new file mode 100644 index 0000000000000000000000000000000000000000..6cd5d09839deb85afa1a4e055223cb16ff7d5147 --- /dev/null +++ b/frameworks/deepspeed/0.18.7/build.conf @@ -0,0 +1,4 @@ +# DeepSpeed 0.18.7 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-deepspeed +IMAGE_TAG=0.18.7 +GPU_TEST=true diff --git a/frameworks/deepspeed/0.18.7/test.sh b/frameworks/deepspeed/0.18.7/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..11a51778afc2118986952827044cf45a40110267 --- /dev/null +++ b/frameworks/deepspeed/0.18.7/test.sh @@ -0,0 +1,602 @@ +#!/bin/bash +set -uo pipefail # errexit (-e) is deliberately off: each test inspects $? itself so the failure branch and its diagnostic re-run can execute + +IMAGE="${1:-oc9-deepspeed:0.18.7}" + +echo "==========================================" +echo "测试 DeepSpeed 0.18.7 GPU 镜像" +echo "镜像: $IMAGE" +echo "==========================================" + +# Test 1: Python interpreter is present +echo -n "测试 1: Python 环境... " +docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 --version + exit 1 +fi + +# Test 2: DeepSpeed version; keep only the last stdout line so any import-time log output cannot pollute the comparison +echo -n "测试 2: DeepSpeed 版本... " +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)" | tail -n 1) +if [ "$VERSION" = "0.18.7" ]; then + echo "✓ (版本: $VERSION)" +else + echo "✗ (期望: 0.18.7, 实际: $VERSION)" + exit 1 +fi + +# Test 3: core dependencies import cleanly +echo -n "测试 3: 核心依赖 (torch, transformers)... " +docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" + exit 1 +fi + +# 测试 4: CUDA 环境 +echo -n "测试 4: CUDA 环境... " +GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +assert torch.cuda.is_available(), 'CUDA not available' +print(torch.cuda.get_device_name(0)) +") +if [ $? -eq 0 ]; then + echo "✓" + echo " GPU: $GPU_INFO" +else + echo "✗" + exit 1 +fi + +# 测试 5: DeepSpeed 基础导入 +echo -n "测试 5: DeepSpeed 基础导入... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator + +# 验证核心模块可用 +assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found' +assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found' + +# 验证加速器 +accelerator = get_accelerator() +assert accelerator is not None, 'Accelerator not available' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator +print(f'DeepSpeed version: {deepspeed.__version__}') +accelerator = get_accelerator() +print(f'Accelerator: {accelerator}') +" + exit 1 +fi + +# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 +echo -n "测试 6: 简单 CNN 模型初始化... 
" +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +# 创建简单 CNN 模型(类似 CIFAR-10 示例) +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(torch.relu(self.conv1(x))) + x = self.pool(torch.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = torch.relu(self.fc1(x)) + x = torch.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 4, + 'gradient_accumulation_steps': 1, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'zero_optimization': { + 'stage': 0 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +assert model_engine is not None, 'Model engine initialization failed' +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = nn.Linear(10, 2) + def forward(self, x): + return self.fc(x) + +model = SimpleNet() +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('Model initialized successfully') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 7: 训练步骤(前向+反向+优化) +echo -n "测试 7: 训练步骤... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 模拟训练步骤 +for step in range(3): + # 创建假数据 (batch_size=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 前向传播 + outputs = model_engine(inputs) + loss = 
F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(10, 2) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +inputs = torch.randn(4, 10).to(model_engine.device) +labels = torch.randint(0, 2, (4,)).to(model_engine.device) +outputs = model_engine(inputs) +loss = torch.nn.functional.cross_entropy(outputs, labels) +model_engine.backward(loss) +model_engine.step() +print('Training step completed') +" + exit 1 +fi + +# 测试 8: ZeRO Stage 1 优化 +echo -n "测试 8: ZeRO Stage 1 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 1 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 1, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 简单训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 1} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 1 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 9: ZeRO Stage 2 优化 +echo -n "测试 9: ZeRO Stage 2 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 2 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'allgather_bucket_size': 50000000, + 'reduce_bucket_size': 50000000, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 2} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 2 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 10: Conv2d + FP16 混合精度训练 +echo -n "测试 10: Conv2d + FP16 混合精度训练... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# CIFAR-10 风格的 CNN 模型 +class CifarNet(nn.Module): + def __init__(self): + super(CifarNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = CifarNet() + +# FP16 配置(参考 DeepSpeed CIFAR 示例) +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'loss_scale_window': 500, + 'hysteresis': 2, + 'min_loss_scale': 1, + 'initial_scale_power': 15 + }, + 'zero_optimization': { + 'stage': 0 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + 
model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型(关键!参考官方 CIFAR 示例) +target_dtype = None +if model_engine.bfloat16_enabled(): + target_dtype = torch.bfloat16 +elif model_engine.fp16_enabled(): + target_dtype = torch.half + +# 验证 FP16 已启用 +assert model_engine.fp16_enabled(), 'FP16 not enabled' +assert target_dtype == torch.half, 'target_dtype should be torch.half' + +# 训练循环 +model_engine.train() + +for step in range(3): + # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 关键:手动转换输入数据类型为 FP16(参考官方示例) + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() + +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# 简化的 CNN 模型用于调试 +class SimpleConv(nn.Module): + def __init__(self): + super(SimpleConv, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.fc1 = nn.Linear(6 * 28 * 28, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = x.view(-1, 6 * 28 * 28) + x = self.fc1(x) + return x + +model = SimpleConv() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + }, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + + # 获取目标数据类型 + target_dtype = None + if model_engine.fp16_enabled(): + target_dtype = torch.half + + model_engine.train() + + # 训练步骤 + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = 
torch.randint(0, 10, (4,)).to(model_engine.device) + + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + model_engine.backward(loss) + model_engine.step() + + print(f'FP16 enabled: {model_engine.fp16_enabled()}') +except Exception as e: + print(f'Error: {e}') + import traceback + traceback.print_exc() +" + exit 1 +fi + +echo "" +echo "==========================================" +echo "所有测试通过!" +echo "==========================================" diff --git a/frameworks/deepspeed/0.18.8/Dockerfile b/frameworks/deepspeed/0.18.8/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..78c36cb07f0904d4e567d8d0f48ca543785efb57 --- /dev/null +++ b/frameworks/deepspeed/0.18.8/Dockerfile @@ -0,0 +1,60 @@ +FROM opencloudos/opencloudos9-cuda-devel:12.8 + +LABEL maintainer="Anstarc " +LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" +LABEL org.opencontainers.image.description="DeepSpeed 0.18.8 GPU on OpenCloudOS 9" + +# Install system dependencies, then drop package-manager caches (dnf keeps its cache under /var/cache/dnf; the yum path is kept for older layouts) +RUN dnf install -y \ + python3.11 \ + python3.11-pip \ + git \ + wget \ + openmpi \ + openmpi-devel \ + && dnf clean all \ + && rm -rf /var/cache/dnf/* /var/cache/yum/* + +# Put the OpenMPI and CUDA binaries and libraries on the search paths +ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64 + +RUN ln -s /usr/bin/python3.11 /usr/bin/python + +# Set the working directory +WORKDIR /app + +# Install PyTorch (CUDA 12.8 wheels) +RUN pip3.11 install --no-cache-dir \ + torch torchvision \ + --index-url https://download.pytorch.org/whl/cu128 + +# Install DeepSpeed dependencies +RUN pip3.11 install --no-cache-dir \ + transformers \ + datasets \ + evaluate \ + ninja \ + psutil \ + deepspeed-kernels \ + py-cpuinfo \ + pydantic \ + hjson \ + mpi4py \ + "setuptools<82" \ + wheel + +# Select the DeepSpeed accelerator backend +ENV DS_ACCELERATOR=cuda + +# Build and install DeepSpeed; --no-cache-dir (as in the pip steps above) keeps pip's cache out of the image layer +RUN DS_BUILD_OPS=1 \ + MAX_JOBS=8 \ + pip3.11 install --no-cache-dir deepspeed==0.18.8 --no-build-isolation + +# Set GPU environment variables 
+ENV NVIDIA_VISIBLE_DEVICES=all +ENV CUDA_MODULE_LOADING=LAZY + +# 默认命令 +CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"] diff --git a/frameworks/deepspeed/0.18.8/README.md b/frameworks/deepspeed/0.18.8/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bf1f696cfb5b5628bf5c5e6cf6990690ae8f272b --- /dev/null +++ b/frameworks/deepspeed/0.18.8/README.md @@ -0,0 +1,199 @@ +# DeepSpeed 0.18.8 GPU on OpenCloudOS 9 + +DeepSpeed 是微软开源的深度学习优化库,提供 ZeRO 内存优化、混合精度训练、流水线并行等功能,支持超大规模模型训练。 + +## 基本信息 + +- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8 +- **Python 版本**: 3.11 +- **CUDA 版本**: 12.8 +- **DeepSpeed 版本**: 0.18.8 +- **PyTorch 版本**: 最新稳定版(CUDA 12.8) +- **MPI**: OpenMPI(支持多节点训练) + +## 构建 + +```bash +docker build -t oc9-deepspeed:0.18.8 . +``` + +## 测试 + +```bash +./test.sh oc9-deepspeed:0.18.8 +``` + +测试项包括: +- Python 和 CUDA 环境 +- DeepSpeed 版本验证 +- GPU 可用性检查 +- DeepSpeed 初始化 +- CNN 模型训练(CIFAR-10 风格) +- ZeRO Stage 1/2 优化 +- FP16 混合精度训练 + +## 使用示例 + +### 基本使用 + +```bash +# 查看版本信息 +docker run --rm oc9-deepspeed:0.18.8 python3.11 -c "import deepspeed; print(deepspeed.__version__)" + +# 查看可用设备 +docker run --rm --gpus all oc9-deepspeed:0.18.8 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" +``` + +### 单 GPU 训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +# 定义模型 +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练循环 +for batch in dataloader: + inputs, labels = batch + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + 
+### FP16 混合精度训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) + +# FP16 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型 +target_dtype = torch.half if model_engine.fp16_enabled() else None + +# 训练时转换输入数据类型 +for batch in dataloader: + inputs, labels = batch + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### 多 GPU 训练(单节点) + +```bash +# 使用 deepspeed launcher(镜像默认 WORKDIR 为 /app,需用 -w 切换到挂载目录) +docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.8 \ + deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json + +# 或使用 torchrun +docker run --rm --gpus all -v $(pwd):/workspace -w /workspace oc9-deepspeed:0.18.8 \ + torchrun --nproc_per_node=2 train.py +``` + +### ZeRO Stage 2 优化 + +```python +ds_config = { + 'train_batch_size': 32, + 'train_micro_batch_size_per_gpu': 4, + 'gradient_accumulation_steps': 2, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.9, 0.999], + 'eps': 1e-8 + } + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} +``` + +### 交互式使用 + +```bash +docker run --rm -it --gpus all oc9-deepspeed:0.18.8 bash +``` + +## 支持的功能 + +- **ZeRO 优化**: Stage 1/2/3 内存优化,支持超大模型训练 +- **混合精度**: FP16, BF16 自动混合精度训练 +- **梯度累积**: 模拟大 batch size 训练 +- **流水线并行**: 模型层间并行 +- **张量并行**: 层内并行 +- **CPU Offload**: 将优化器状态卸载到 CPU +- **梯度检查点**: 减少显存占用 +- **分布式训练**: 单机多卡、多机多卡 + +## 系统要求 + +- **GPU**: NVIDIA GPU with CUDA 12.x support +- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage) +- **Docker**: 
19.03+ with nvidia-docker2 +- **MPI**: 已包含 OpenMPI(多节点训练需要) + +## 参考资源 + +- [DeepSpeed 官方文档](https://www.deepspeed.ai/) +- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed) +- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples) +- [ZeRO 论文](https://arxiv.org/abs/1910.02054) diff --git a/frameworks/deepspeed/0.18.8/build.conf b/frameworks/deepspeed/0.18.8/build.conf new file mode 100644 index 0000000000000000000000000000000000000000..2426ade14acc147d3b5c4fa031aeb60cb9fb6baf --- /dev/null +++ b/frameworks/deepspeed/0.18.8/build.conf @@ -0,0 +1,4 @@ +# DeepSpeed 0.18.8 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-deepspeed +IMAGE_TAG=0.18.8 +GPU_TEST=true diff --git a/frameworks/deepspeed/0.18.8/test.sh b/frameworks/deepspeed/0.18.8/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..ffce0af546c76445ef44053ab1ed45675ad20543 --- /dev/null +++ b/frameworks/deepspeed/0.18.8/test.sh @@ -0,0 +1,602 @@ +#!/bin/bash +set -u # no -e on purpose: every step checks its own exit status, and -e would abort before the failure branch prints its diagnostics + +IMAGE="${1:-oc9-deepspeed:0.18.8}" + +echo "==========================================" +echo "测试 DeepSpeed 0.18.8 GPU 镜像" +echo "镜像: $IMAGE" +echo "==========================================" + +# Test 1: Python interpreter is present in the image +echo -n "测试 1: Python 环境... " +docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 --version + exit 1 +fi + +# Test 2: installed DeepSpeed version matches the image tag +echo -n "测试 2: DeepSpeed 版本... " +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)") +if [ "$VERSION" = "0.18.8" ]; then + echo "✓ (版本: $VERSION)" +else + echo "✗ (期望: 0.18.8, 实际: $VERSION)" + exit 1 +fi + +# Test 3: core dependencies import cleanly +echo -n "测试 3: 核心依赖 (torch, transformers)... " +docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" + exit 1 +fi + +# 测试 4: CUDA 环境 +echo -n "测试 4: CUDA 环境... " +GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +assert torch.cuda.is_available(), 'CUDA not available' +print(torch.cuda.get_device_name(0)) +") +if [ $? -eq 0 ]; then + echo "✓" + echo " GPU: $GPU_INFO" +else + echo "✗" + exit 1 +fi + +# 测试 5: DeepSpeed 基础导入 +echo -n "测试 5: DeepSpeed 基础导入... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator + +# 验证核心模块可用 +assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found' +assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found' + +# 验证加速器 +accelerator = get_accelerator() +assert accelerator is not None, 'Accelerator not available' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator +print(f'DeepSpeed version: {deepspeed.__version__}') +accelerator = get_accelerator() +print(f'Accelerator: {accelerator}') +" + exit 1 +fi + +# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 +echo -n "测试 6: 简单 CNN 模型初始化... 
" +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +# 创建简单 CNN 模型(类似 CIFAR-10 示例) +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(torch.relu(self.conv1(x))) + x = self.pool(torch.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = torch.relu(self.fc1(x)) + x = torch.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 4, + 'gradient_accumulation_steps': 1, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'zero_optimization': { + 'stage': 0 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +assert model_engine is not None, 'Model engine initialization failed' +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = nn.Linear(10, 2) + def forward(self, x): + return self.fc(x) + +model = SimpleNet() +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('Model initialized successfully') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 7: 训练步骤(前向+反向+优化) +echo -n "测试 7: 训练步骤... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 模拟训练步骤 +for step in range(3): + # 创建假数据 (batch_size=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 前向传播 + outputs = model_engine(inputs) + loss = 
F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(10, 2) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +inputs = torch.randn(4, 10).to(model_engine.device) +labels = torch.randint(0, 2, (4,)).to(model_engine.device) +outputs = model_engine(inputs) +loss = torch.nn.functional.cross_entropy(outputs, labels) +model_engine.backward(loss) +model_engine.step() +print('Training step completed') +" + exit 1 +fi + +# 测试 8: ZeRO Stage 1 优化 +echo -n "测试 8: ZeRO Stage 1 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 1 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 1, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 简单训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 1} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 1 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 9: ZeRO Stage 2 优化 +echo -n "测试 9: ZeRO Stage 2 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 2 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'allgather_bucket_size': 50000000, + 'reduce_bucket_size': 50000000, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 2} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 2 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 10: Conv2d + FP16 混合精度训练 +echo -n "测试 10: Conv2d + FP16 混合精度训练... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# CIFAR-10 风格的 CNN 模型 +class CifarNet(nn.Module): + def __init__(self): + super(CifarNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = CifarNet() + +# FP16 配置(参考 DeepSpeed CIFAR 示例) +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'loss_scale_window': 500, + 'hysteresis': 2, + 'min_loss_scale': 1, + 'initial_scale_power': 15 + }, + 'zero_optimization': { + 'stage': 0 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + 
model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型(关键!参考官方 CIFAR 示例) +target_dtype = None +if model_engine.bfloat16_enabled(): + target_dtype = torch.bfloat16 +elif model_engine.fp16_enabled(): + target_dtype = torch.half + +# 验证 FP16 已启用 +assert model_engine.fp16_enabled(), 'FP16 not enabled' +assert target_dtype == torch.half, 'target_dtype should be torch.half' + +# 训练循环 +model_engine.train() + +for step in range(3): + # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 关键:手动转换输入数据类型为 FP16(参考官方示例) + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() + +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# 简化的 CNN 模型用于调试 +class SimpleConv(nn.Module): + def __init__(self): + super(SimpleConv, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.fc1 = nn.Linear(6 * 28 * 28, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = x.view(-1, 6 * 28 * 28) + x = self.fc1(x) + return x + +model = SimpleConv() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + }, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + + # 获取目标数据类型 + target_dtype = None + if model_engine.fp16_enabled(): + target_dtype = torch.half + + model_engine.train() + + # 训练步骤 + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = 
torch.randint(0, 10, (4,)).to(model_engine.device) + + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + model_engine.backward(loss) + model_engine.step() + + print(f'FP16 enabled: {model_engine.fp16_enabled()}') +except Exception as e: + print(f'Error: {e}') + import traceback + traceback.print_exc() +" + exit 1 +fi + +echo "" +echo "==========================================" +echo "所有测试通过!" +echo "==========================================" diff --git a/frameworks/deepspeed/0.18.9/Dockerfile b/frameworks/deepspeed/0.18.9/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..1c3c2366146bb50cb9742b3aea3abed71aea3b6a --- /dev/null +++ b/frameworks/deepspeed/0.18.9/Dockerfile @@ -0,0 +1,60 @@ +FROM opencloudos/opencloudos9-cuda-devel:12.8 + +LABEL maintainer="Anstarc " +LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" +LABEL org.opencontainers.image.description="DeepSpeed 0.18.9 GPU on OpenCloudOS 9" + +# Install system dependencies, then drop package-manager caches (dnf keeps its cache under /var/cache/dnf; the yum path is kept for older layouts) +RUN dnf install -y \ + python3.11 \ + python3.11-pip \ + git \ + wget \ + openmpi \ + openmpi-devel \ + && dnf clean all \ + && rm -rf /var/cache/dnf/* /var/cache/yum/* + +# Put the OpenMPI and CUDA binaries and libraries on the search paths +ENV PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda/lib64 + +RUN ln -s /usr/bin/python3.11 /usr/bin/python + +# Set the working directory +WORKDIR /app + +# Install PyTorch (CUDA 12.8 wheels) +RUN pip3.11 install --no-cache-dir \ + torch torchvision \ + --index-url https://download.pytorch.org/whl/cu128 + +# Install DeepSpeed dependencies +RUN pip3.11 install --no-cache-dir \ + transformers \ + datasets \ + evaluate \ + ninja \ + psutil \ + deepspeed-kernels \ + py-cpuinfo \ + pydantic \ + hjson \ + mpi4py \ + "setuptools<82" \ + wheel + +# Select the DeepSpeed accelerator backend +ENV DS_ACCELERATOR=cuda + +# Build and install DeepSpeed; --no-cache-dir (as in the pip steps above) keeps pip's cache out of the image layer +RUN DS_BUILD_OPS=1 \ + MAX_JOBS=8 \ + pip3.11 install --no-cache-dir deepspeed==0.18.9 --no-build-isolation + +# Set GPU environment variables 
+ENV NVIDIA_VISIBLE_DEVICES=all +ENV CUDA_MODULE_LOADING=LAZY + +# 默认命令 +CMD ["python3.11", "-c", "import deepspeed; print(f'DeepSpeed {deepspeed.__version__} ready')"] diff --git a/frameworks/deepspeed/0.18.9/README.md b/frameworks/deepspeed/0.18.9/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0ef05da4acba0f16ebc879ede3d855ee687a46c5 --- /dev/null +++ b/frameworks/deepspeed/0.18.9/README.md @@ -0,0 +1,199 @@ +# DeepSpeed 0.18.9 GPU on OpenCloudOS 9 + +DeepSpeed 是微软开源的深度学习优化库,提供 ZeRO 内存优化、混合精度训练、流水线并行等功能,支持超大规模模型训练。 + +## 基本信息 + +- **基础镜像**: opencloudos/opencloudos9-cuda-devel:12.8 +- **Python 版本**: 3.11 +- **CUDA 版本**: 12.8 +- **DeepSpeed 版本**: 0.18.9 +- **PyTorch 版本**: 最新稳定版(CUDA 12.8) +- **MPI**: OpenMPI(支持多节点训练) + +## 构建 + +```bash +docker build -t oc9-deepspeed:0.18.9 . +``` + +## 测试 + +```bash +./test.sh oc9-deepspeed:0.18.9 +``` + +测试项包括: +- Python 和 CUDA 环境 +- DeepSpeed 版本验证 +- GPU 可用性检查 +- DeepSpeed 初始化 +- CNN 模型训练(CIFAR-10 风格) +- ZeRO Stage 1/2 优化 +- FP16 混合精度训练 + +## 使用示例 + +### 基本使用 + +```bash +# 查看版本信息 +docker run --rm oc9-deepspeed:0.18.9 python3.11 -c "import deepspeed; print(deepspeed.__version__)" + +# 查看可用设备 +docker run --rm --gpus all oc9-deepspeed:0.18.9 python3.11 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" +``` + +### 单 GPU 训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +# 定义模型 +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练循环 +for batch in dataloader: + inputs, labels = batch + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + 
+### FP16 混合精度训练 + +```python +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) + +# FP16 配置 +ds_config = { + 'train_batch_size': 16, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型 +target_dtype = torch.half if model_engine.fp16_enabled() else None + +# 训练时转换输入数据类型 +for batch in dataloader: + inputs, labels = batch + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + model_engine.backward(loss) + model_engine.step() +``` + +### 多 GPU 训练(单节点) + +```bash +# 使用 deepspeed launcher +docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.9 \ + deepspeed --num_gpus=2 train.py --deepspeed_config ds_config.json + +# 或使用 torchrun +docker run --rm --gpus all -v "$(pwd)":/workspace -w /workspace oc9-deepspeed:0.18.9 \ + torchrun --nproc_per_node=2 train.py +``` + +### ZeRO Stage 2 优化 + +```python +ds_config = { + 'train_batch_size': 32, + 'train_micro_batch_size_per_gpu': 4, + 'gradient_accumulation_steps': 2, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.9, 0.999], + 'eps': 1e-8 + } + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} +``` + +### 交互式使用 + +```bash +docker run --rm -it --gpus all oc9-deepspeed:0.18.9 bash +``` + +## 支持的功能 + +- **ZeRO 优化**: Stage 1/2/3 内存优化,支持超大模型训练 +- **混合精度**: FP16, BF16 自动混合精度训练 +- **梯度累积**: 模拟大 batch size 训练 +- **流水线并行**: 模型层间并行 +- **张量并行**: 层内并行 +- **CPU Offload**: 将优化器状态卸载到 CPU +- **梯度检查点**: 减少显存占用 +- **分布式训练**: 单机多卡、多机多卡 + +## 系统要求 + +- **GPU**: NVIDIA GPU with CUDA 12.x support +- **显存**: 建议 16GB+ (取决于模型大小和 ZeRO stage) +- **Docker**: 
19.03+ with nvidia-docker2 +- **MPI**: 已包含 OpenMPI(多节点训练需要) + +## 参考资源 + +- [DeepSpeed 官方文档](https://www.deepspeed.ai/) +- [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed) +- [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples) +- [ZeRO 论文](https://arxiv.org/abs/1910.02054) diff --git a/frameworks/deepspeed/0.18.9/build.conf b/frameworks/deepspeed/0.18.9/build.conf new file mode 100644 index 0000000000000000000000000000000000000000..e3f36364ef81ddac9c988fd53bf7de8c34b25814 --- /dev/null +++ b/frameworks/deepspeed/0.18.9/build.conf @@ -0,0 +1,4 @@ +# DeepSpeed 0.18.9 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-deepspeed +IMAGE_TAG=0.18.9 +GPU_TEST=true diff --git a/frameworks/deepspeed/0.18.9/test.sh b/frameworks/deepspeed/0.18.9/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..cd9ee2dc0d1beb4a3ae0cefd4cbeb0b92d005d60 --- /dev/null +++ b/frameworks/deepspeed/0.18.9/test.sh @@ -0,0 +1,602 @@ +#!/bin/bash +set -u # no errexit (-e): every test inspects $? itself so a failure can print its diagnostics before exit 1 + +IMAGE="${1:-oc9-deepspeed:0.18.9}" + +echo "==========================================" +echo "测试 DeepSpeed 0.18.9 GPU 镜像" +echo "镜像: $IMAGE" +echo "==========================================" + +# Test 1: Python environment +echo -n "测试 1: Python 环境... " +docker run --rm "$IMAGE" python3.11 --version > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 --version + exit 1 +fi + +# Test 2: DeepSpeed version (only the last stdout line is compared, in case the import logs to stdout) +echo -n "测试 2: DeepSpeed 版本... " +VERSION=$(docker run --rm "$IMAGE" python3.11 -c "import deepspeed; print(deepspeed.__version__)" | tail -n 1) +if [ "$VERSION" = "0.18.9" ]; then + echo "✓ (版本: $VERSION)" +else + echo "✗ (期望: 0.18.9, 实际: $VERSION)" + exit 1 +fi + +# Test 3: core dependencies +echo -n "测试 3: 核心依赖 (torch, transformers)... " +docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm "$IMAGE" python3.11 -c "import torch; import transformers" + exit 1 +fi + +# 测试 4: CUDA 环境 +echo -n "测试 4: CUDA 环境... " +GPU_INFO=$(docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +assert torch.cuda.is_available(), 'CUDA not available' +print(torch.cuda.get_device_name(0)) +") +if [ $? -eq 0 ]; then + echo "✓" + echo " GPU: $GPU_INFO" +else + echo "✗" + exit 1 +fi + +# 测试 5: DeepSpeed 基础导入 +echo -n "测试 5: DeepSpeed 基础导入... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator + +# 验证核心模块可用 +assert hasattr(deepspeed, 'initialize'), 'DeepSpeed initialize not found' +assert hasattr(deepspeed, 'init_distributed'), 'DeepSpeed init_distributed not found' + +# 验证加速器 +accelerator = get_accelerator() +assert accelerator is not None, 'Accelerator not available' +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import deepspeed +from deepspeed.accelerator import get_accelerator +print(f'DeepSpeed version: {deepspeed.__version__}') +accelerator = get_accelerator() +print(f'Accelerator: {accelerator}') +" + exit 1 +fi + +# 测试 6: 简单 CNN 模型 + DeepSpeed 初始化 +echo -n "测试 6: 简单 CNN 模型初始化... 
" +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +# 创建简单 CNN 模型(类似 CIFAR-10 示例) +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(torch.relu(self.conv1(x))) + x = self.pool(torch.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = torch.relu(self.fc1(x)) + x = torch.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +# DeepSpeed 配置 +ds_config = { + 'train_batch_size': 4, + 'gradient_accumulation_steps': 1, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'zero_optimization': { + 'stage': 0 + } +} + +# 初始化 DeepSpeed +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +assert model_engine is not None, 'Model engine initialization failed' +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = nn.Linear(10, 2) + def forward(self, x): + return self.fc(x) + +model = SimpleNet() +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('Model initialized successfully') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 7: 训练步骤(前向+反向+优化) +echo -n "测试 7: 训练步骤... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +class SimpleNet(nn.Module): + def __init__(self): + super(SimpleNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = SimpleNet() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 模拟训练步骤 +for step in range(3): + # 创建假数据 (batch_size=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 前向传播 + outputs = model_engine(inputs) + loss = 
F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(10, 2) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 0} +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +inputs = torch.randn(4, 10).to(model_engine.device) +labels = torch.randint(0, 2, (4,)).to(model_engine.device) +outputs = model_engine(inputs) +loss = torch.nn.functional.cross_entropy(outputs, labels) +model_engine.backward(loss) +model_engine.step() +print('Training step completed') +" + exit 1 +fi + +# 测试 8: ZeRO Stage 1 优化 +echo -n "测试 8: ZeRO Stage 1 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 1 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 1, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 简单训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 1} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 1 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 9: ZeRO Stage 2 优化 +echo -n "测试 9: ZeRO Stage 2 优化... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 10) +) + +# ZeRO Stage 2 配置 +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': {'lr': 0.001} + }, + 'zero_optimization': { + 'stage': 2, + 'allgather_partitions': True, + 'reduce_scatter': True, + 'allgather_bucket_size': 50000000, + 'reduce_bucket_size': 50000000, + 'overlap_comm': True, + 'contiguous_gradients': True + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config +) + +# 训练步骤 +inputs = torch.randn(4, 100).to(model_engine.device) +outputs = model_engine(inputs) +loss = outputs.sum() +model_engine.backward(loss) +model_engine.step() +" > /dev/null 2>&1 +if [ $? 
-eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import deepspeed + +model = nn.Linear(100, 10) +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'zero_optimization': {'stage': 2} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + print('ZeRO Stage 2 initialized') +except Exception as e: + print(f'Error: {e}') +" + exit 1 +fi + +# 测试 10: Conv2d + FP16 混合精度训练 +echo -n "测试 10: Conv2d + FP16 混合精度训练... " +docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# CIFAR-10 风格的 CNN 模型 +class CifarNet(nn.Module): + def __init__(self): + super(CifarNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +model = CifarNet() + +# FP16 配置(参考 DeepSpeed CIFAR 示例) +ds_config = { + 'train_batch_size': 4, + 'optimizer': { + 'type': 'Adam', + 'params': { + 'lr': 0.001, + 'betas': [0.8, 0.999], + 'eps': 1e-8, + 'weight_decay': 3e-7 + } + }, + 'scheduler': { + 'type': 'WarmupLR', + 'params': { + 'warmup_min_lr': 0, + 'warmup_max_lr': 0.001, + 'warmup_num_steps': 100 + } + }, + 'gradient_clipping': 1.0, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'loss_scale_window': 500, + 'hysteresis': 2, + 'min_loss_scale': 1, + 'initial_scale_power': 15 + }, + 'zero_optimization': { + 'stage': 0 + } +} + +model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + 
model_parameters=model.parameters(), + config=ds_config +) + +# 获取目标数据类型(关键!参考官方 CIFAR 示例) +target_dtype = None +if model_engine.bfloat16_enabled(): + target_dtype = torch.bfloat16 +elif model_engine.fp16_enabled(): + target_dtype = torch.half + +# 验证 FP16 已启用 +assert model_engine.fp16_enabled(), 'FP16 not enabled' +assert target_dtype == torch.half, 'target_dtype should be torch.half' + +# 训练循环 +model_engine.train() + +for step in range(3): + # 创建 CIFAR-10 风格的输入 (batch=4, channels=3, height=32, width=32) + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = torch.randint(0, 10, (4,)).to(model_engine.device) + + # 关键:手动转换输入数据类型为 FP16(参考官方示例) + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + # 前向传播 + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + + # 反向传播 + model_engine.backward(loss) + + # 优化器步骤 + model_engine.step() + +" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓" +else + echo "✗" + docker run --rm --gpus all "$IMAGE" python3.11 -c " +import torch +import torch.nn as nn +import torch.nn.functional as F +import deepspeed + +# 简化的 CNN 模型用于调试 +class SimpleConv(nn.Module): + def __init__(self): + super(SimpleConv, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.fc1 = nn.Linear(6 * 28 * 28, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = x.view(-1, 6 * 28 * 28) + x = self.fc1(x) + return x + +model = SimpleConv() + +ds_config = { + 'train_batch_size': 4, + 'optimizer': {'type': 'Adam', 'params': {'lr': 0.001}}, + 'fp16': { + 'enabled': True, + 'loss_scale': 0, + 'initial_scale_power': 15 + }, + 'zero_optimization': {'stage': 0} +} + +try: + model_engine, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=model.parameters(), + config=ds_config + ) + + # 获取目标数据类型 + target_dtype = None + if model_engine.fp16_enabled(): + target_dtype = torch.half + + model_engine.train() + + # 训练步骤 + inputs = torch.randn(4, 3, 32, 32).to(model_engine.device) + labels = 
torch.randint(0, 10, (4,)).to(model_engine.device) + + if target_dtype is not None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = F.cross_entropy(outputs, labels) + model_engine.backward(loss) + model_engine.step() + + print(f'FP16 enabled: {model_engine.fp16_enabled()}') +except Exception as e: + print(f'Error: {e}') + import traceback + traceback.print_exc() +" + exit 1 +fi + +echo "" +echo "==========================================" +echo "所有测试通过!" +echo "=========================================="