diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..4d70f070b7a9bbcc528b2ad811d83412765de22a --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.vscode +.idea +build +output +__pycache__ +*.log diff --git a/CMakeLists.txt b/CMakeLists.txt index c45e7ae54859247e2cfd1b5fe90bfa8f0df662d1..e1fe63f618e1a608d63220f1ed14c4f66ab15a26 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,8 @@ project(hccl) option(BUILD_OPEN_PROJECT "Build open hccl project." ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + if(BUILD_OPEN_PROJECT) include(cmake/config.cmake) add_subdirectory(src/domain/collective_communication) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..f853e97d7b90c9468c71bd48c99c1950b7821892 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,141 @@ +# NOTE: Building this image requires docker version >= 18.0 + +ARG BASE_IMAGE=ubuntu:22.04 + +FROM ${BASE_IMAGE} AS official + +ARG TARGETPLATFORM=linux/arm64 + +ENV USER_PASSWD=change_me + +SHELL [ "/bin/bash", "-c" ] + +RUN cp /etc/apt/sources.list /etc/apt/sources.list.backup && \ + case ${TARGETPLATFORM} in \ + "linux/arm64") sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list ;; \ + *) sed -i 's|archive.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list ;; \ + esac + +RUN apt-get update \ + && apt-get install --no-install-recommends -y \ + apt-transport-https \ + ca-certificates \ + build-essential \ + bash \ + curl \ + git \ + wget \ + gcc \ + g++ \ + make \ + cmake \ + python3 \ + python3-pip \ + gdb \ + vim \ + file \ + man \ + sudo \ + zlib1g \ + openssl \ + unzip \ + pciutils \ + net-tools \ + gfortran \ + patchelf \ + libblas3 \ + libblas-dev \ + libssl-dev \ + zlib1g-dev \ + libncurses5-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + libffi-dev \ + libnss3-dev \ + libgdbm-dev \ + liblzma-dev \ + libev-dev \ + openssh-server \ + && apt-get clean \ + && rm 
-rf /var/lib/apt/lists/* \ + && rm -rf /var/tmp/* \ + && rm -rf /tmp/* + +# 创建 hccl 用户 +RUN groupadd -g 1000 hcclgroup && \ + useradd -u 1000 -g hcclgroup -ms /bin/bash hccluser && \ + usermod -aG sudo hccluser && \ + echo "hccluser ALL=(ALL) NOPASSWD:/usr/bin/apt-get,/usr/bin/apt" >> /etc/sudoers + +USER hccluser + +RUN pip install --no-cache-dir -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple \ + attrs cython numpy==1.24.0 decorator sympy cffi pyyaml pathlib2 \ + psutil protobuf==3.20 scipy requests absl-py + +# 安装 CANN 8.2.RC1.alpha003 +RUN case ${TARGETPLATFORM} in \ + "linux/arm64") ARCH=aarch64 ;; \ + *) ARCH=x86_64 ;; \ + esac && \ + CANN_TOOLKIT_URL="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-toolkit_8.2.RC1.alpha003_linux-${ARCH}.run" && \ + curl -fsSL -o /tmp/Ascend-cann-toolkit.run -O "${CANN_TOOLKIT_URL}" && \ + CANN_COMMUNITY_SDK_URL="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-communitysdk_8.2.RC1.alpha003_linux-${ARCH}.run" && \ + curl -fsSL -o /tmp/Ascend-cann-communitysdk.run -O "${CANN_COMMUNITY_SDK_URL}" + +RUN chmod +x /tmp/Ascend-cann-toolkit.run && \ + /tmp/Ascend-cann-toolkit.run --quiet --install && \ + rm /tmp/Ascend-cann-toolkit.run + +RUN chmod +x /tmp/Ascend-cann-communitysdk.run && \ + /tmp/Ascend-cann-communitysdk.run --quiet --full && \ + rm /tmp/Ascend-cann-communitysdk.run + +# 安装 HCCL 依赖 +RUN curl -fsSL -o /tmp/include.zip -O https://github.com/nlohmann/json/releases/download/v3.11.2/include.zip && \ + unzip -d ${HOME}/nlohmann_json /tmp/include.zip && \ + rm /tmp/include.zip + +RUN curl -fsSL -o /tmp/mpich.tar.gz -O https://www.mpich.org/static/downloads/3.2.1/mpich-3.2.1.tar.gz && \ + tar -zxf /tmp/mpich.tar.gz -C /tmp && \ + cd /tmp/mpich-3.2.1 && \ + ./configure --disable-fortran --prefix=${HOME}/mpich --with-device=ch3:nemesis && \ + make && make install && \ + rm -r /tmp/mpich-3.2.1 && \ + rm 
/tmp/mpich.tar.gz + +# 设置环境变量 +RUN \ + # NPU 驱动环境变量 + echo 'export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:${LD_LIBRARY_PATH}' >> ${HOME}/.bashrc && \ + echo 'export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/common:${LD_LIBRARY_PATH}' >> ${HOME}/.bashrc && \ + # CANN Toolkit 环境变量 + echo 'source ${HOME}/Ascend/ascend-toolkit/set_env.sh' >> ${HOME}/.bashrc && \ + # MPICH 环境变量 + echo 'export PATH=${HOME}/mpich/bin:${PATH}' >> ${HOME}/.bashrc && \ + echo 'export LD_LIBRARY_PATH=${HOME}/mpich/lib:${LD_LIBRARY_PATH}' >> ${HOME}/.bashrc + +USER root + +# SSH 配置 +RUN echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \ + echo "PermitRootLogin no" >> /etc/ssh/sshd_config && \ + echo "PermitUserEnvironment yes" >> /etc/ssh/sshd_config && \ + echo "ClientAliveInterval 60" >> /etc/ssh/sshd_config && \ + echo "ClientAliveCountMax 3" >> /etc/ssh/sshd_config && \ + echo "AllowUsers hccluser" >> /etc/ssh/sshd_config + +# SSH 启动脚本 +RUN echo '#!/bin/bash' > /start.sh && \ + echo 'if [ -n "${USER_PASSWD}" ]; then' >> /start.sh && \ + echo ' echo "hccluser:${USER_PASSWD}" | chpasswd' >> /start.sh && \ + echo 'fi' >> /start.sh && \ + echo 'mkdir -p /var/run/sshd' >> /start.sh && \ + echo 'ssh-keygen -A' >> /start.sh && \ + echo '/usr/sbin/sshd -D -e' >> /start.sh && \ + chmod +x /start.sh + +EXPOSE 22 + +CMD [ "/start.sh" ] diff --git a/README.md b/README.md index 6f5894402c8cb296cf9c92b00523b27f93454b52..04bb336e9e2ec9dd42354e2eb6df11f21ddbbc98 100644 --- a/README.md +++ b/README.md @@ -232,7 +232,7 @@ HCCL软件包安装完成后,开发者可通过HCCL Test工具进行集合通 ## 相关文档 -HCCL提供了使用指南、环境变量参考、基于本源码仓进行定制的开发指南、算法分析工具使用指导等,详细可参见[HCCL资料书架总览](https://gitee.com/ascend/cann-hccl/wikis/HCCL%E8%B5%84%E6%96%99%E4%B9%A6%E6%9E%B6%E6%80%BB%E8%A7%88)。 +HCCL提供了用户指南、环境变量参考、基于源码仓进行算法与算子定制的开发指南等,详细可参见[HCCL资料书架总览](https://gitee.com/ascend/cann-hccl/wikis/HCCL%E8%B5%84%E6%96%99%E4%B9%A6%E6%9E%B6%E6%80%BB%E8%A7%88)。 ## 贡献指南 diff --git a/contest.md b/contest.md new file mode 100644 index 
0000000000000000000000000000000000000000..3135574fcf199f3e028f91fe4d4a5dd6c48d1b06 --- /dev/null +++ b/contest.md @@ -0,0 +1,364 @@ +# HCCL 通信库创新大赛操作指导 + +## 0. 赛前须知 + +### 0.0 决赛题目 + +![image](img/final_round_question.jpg) + +- 由于0-1卡间已断链,则无需返回此条链路间的建链请求 + +### 0.1 技能要求 + +1. 熟悉 C++14 编程语言 +2. 了解 GDB、LLDB 等调试工具 +3. 了解 VSCode、CLion 等 IDE 开发工具 +4. 了解 AllReduce 等集合通信原语 + +### 0.2 资料 + +HCCL 资料: + +- [昇腾社区官网][1] +- [HCCL主页——昇腾社区][2] +- [HCCL概述——昇腾社区][3] +- [集合通信原语——昇腾社区][4] +- [HCCL代码仓][5] +- [HCCL Wiki][6] + +定制算法开发指南: + +1. [HCCL源码定制开发指南][7] +2. [AllGather 定制算法实现][8] +3. [HCCL 通信库创新大赛参赛 FAQ](./faq.md) + +[1]: https://www.hiascend.com +[2]: https://www.hiascend.com/hccl +[3]: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha003/hccl/hcclug/hcclug_000001.html +[4]: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha003/hccl/hcclug/hcclug_000004.html +[5]: https://gitee.com/ascend/cann-hccl +[6]: https://gitee.com/ascend/cann-hccl/wikis +[7]: https://gitee.com/ascend/cann-hccl/blob/master/docs/hccl_customized_dev/README.md +[8]: https://gitee.com/ascend/cann-hccl/pulls/64 + +### 0.3 评分标准 + +组委会将从功能、性能、代码风格 3 个维度对参赛代码进行综合评测,评测公式: + +- 15 分功能分:15 个算法分析器用例,每个 1 分,通过得 1 分,不通过得 0 分 + + > 5 种数据量:1k/1m/64m/1g/4g,3 种数据格式:int8/fp16/fp32 + +- 75 分性能分:3 个 HCCLTest 用例,每个 25 分,不通过得 0 分,通过则按照性能计分,性能最佳得满分,按照排名依次递减 + + > 3 种数据量:1k/1m/1g,1 种数据格式:fp32 + > + > 性能标准:基于 HCCLTest 工具测试的带宽使用量(字段:`alg_bandwidth(GB/s)`)作为评判标准,数值越高越好 + +- 10 分主观分:代码风格 + +> 【注意】验证方法详见 [算法分析器验证](#71-算法分析器验证)、[HCCLTest工具验证](#72-hccltest-工具验证) + +## 1. 登录环境 + +选手开发环境信息将通过邮件的方式发送至队长邮箱,队伍成员可通过 SSH 进入选手开发环境: + +```bash +ssh root@ip -p port +``` + +## 2. 
环境目录 + +选手开发环境是运行在物理机上的 Docker 容器,目录结构如下: + +``` +|-- /dev +| |-- davinci0 # NPU1 +| |-- davinci1 # NPU2 +| |-- davinci2 # NPU3 +| `-- davinci3 # NPU4 +| |-- davinci4 # NPU5 +| |-- davinci5 # NPU6 +| |-- davinci6 # NPU7 +| `-- davinci7 # NPU8 +|-- /usr/local/Ascend +| `-- driver # NPU 驱动安装目录 +|-- /home/hccluser/Ascend +| |-- ascend-toolkit # CANN Toolkit 安装目录 +| `-- ascend_cann_install.info # CANN 安装信息 +`-- /home/hccluser + |-- cann-hccl # HCCL 代码仓(选手需自行下载) + |-- mpich # MPICH 安装目录 + `-- nlohmann_json # nlohmann json inclue 目录 +``` + +## 3. 软件版本 + +> 【注意】 +> +> 1. 选手开发环境中已安装下列软件依赖 +> 2. 最终评测环境的软件版本与选手开发环境一致 + +- gcc 11.4.0 +- g++ 11.4.0 +- make 4.3 +- cmake 3.22.1 +- mpich 3.2.1 +- CANN Toolkit 8.2.RC1.alpha003 +- CANN Community SDK 8.2.RC1.alpha003 + +## 4. 代码开发 + +### 4.1 下载代码 + +> 【注意】选手只需下载 [ascend/cann-hccl](https://gitee.com/ascend/cann-hccl.git) 代码仓即可,编译运行所需全部依赖已提前安装 + +```bash +cd /home/hccluser + +git clone https://gitee.com/ascend/cann-hccl.git -b r1.5.2 +``` + +### 4.2 IDE 远程开发 + +推荐选手基于 VSCode、CLion 等 IDE,通过 SSH 连接开发环境进行远程开发,参考文档: + +- [VSCode 使用 SSH 远程开发](https://code.visualstudio.com/docs/remote/ssh) +- [CLion 使用 SSH 远程开发](https://www.jetbrains.com/help/clion/remote-development.html) + +### 4.3 定制算法开发 + +在 HCCL 软件架构中,`Operator` 负责算法选择,`Exeutor` 负责算法编排。为简化流程,选手只需实现以下内容: + +1. [custom_all_reduce_operator.cc](src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc) 中编写算法选择逻辑 +2. [coll_custom_small_all_reduce_mesh_executor.cc](src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.cc) 中编写小数据量(1K)场景的 AllReduce 算法 +3. [coll_custom_medium_all_reduce_mesh_executor.cc](src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.cc) 中编写中等数据量(1M)场景的 AllReduce 算法 +4. 
[coll_custom_huge_all_reduce_mesh_executor.cc](src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.cc) 中编写大数据量(1G)场景的 AllReduce 算法 + +> 【注意】上述代码文件中,选手需要实现的内容已在代码注释中标明 + +## 5. 编译代码 + +编译所需的依赖项均已安装,在 HCCL 代码仓执行编译即可: + +```bash +cd /home/hccluser/cann-hccl + +bash build.sh --nlohmann_path /home/hccluser/nlohmann_json/include +``` + +## 6. 安装编译结果 + +编译生成的 HCCL 软件包在 `/home/hccluser/cann-hccl/output` 目录下: + +```bash +cd /home/hccluser/cann-hccl/output + +./CANN-hccl_alg-8.2.t12.0.b077-linux.aarch64.run +``` + +安装完成后,用户编译生成的 HCCL 软件包会替换已安装 CANN 开发套件包中的 HCCL 相关软件 + +## 7. 测试代码 + +> 【注意】选手可使用评测脚本进行验证: + +```bash +cd /home/hccluser/cann-hccl + +# 查看使用方法(脚本作用:解析测试工具输出的字符串) +python3 eval.py --help +# 执行算法分析器用例 +python3 eval.py --llt +# 执行 HCCLTest 工具用例(3 种数据量的用例各执行 10 次,每次执行间隔 5s) +python3 eval.py --hccltest -n 10 -i 5 +``` + +### 7.1 算法分析器验证 + +> 【注意】算法分析器能够在无昇腾 NPU 场景下离线测试算法逻辑,包括:死锁检测、资源校验、内存冲突校验等 + +编译并执行算法分析器用例: + +```bash +cd /home/hccluser/cann-hccl + +# 编译测试用例,并自动执行 +bash build.sh --nlohmann_path /home/hccluser/nlohmann_json/include --test --open_hccl_test + +# 手动执行测试用例 +export BUILD_TEST_DIR="/home/hccluser/cann-hccl/build/test/" +export LD_LIBRARY_PATH="${BUILD_TEST_DIR}:${LD_LIBRARY_PATH}" +./build/test/open_hccl_test +``` + +### 7.2 HCCLTest 工具验证 + +> 【注意】性能测试场景可使用 HCCL Test 工具进行验证,该工具基于真实 NPU 设备进行功能和性能测试 + +基于 HCCL Test 工具在 NPU 设备上执行验证: + +```bash +cd /home/hccluser/Ascend/ascend-toolkit/latest/tools/hccl_test + +# 编译 HCCL 性能测试工具 +make MPI_HOME=/home/hccluser/mpich ASCEND_DIR=/home/hccluser/Ascend/ascend-toolkit/latest + +# 执行 HCCL Test +# 1K +mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 1k -e 1k -d fp32 -o sum -p 4 -w 100 -n 500 +# 1M +mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 1m -e 1m -d fp32 -o sum -p 4 -w 100 -n 500 +# 1G +mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 1g -e 1g -d fp32 -o sum -p 4 -w 100 -n 500 +``` + +各参数解释如下,详细说明可参考:[昇腾文档中心-HCCL 
性能测试工具使用指南][9] + +[9]: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha003/devaids/hccltool/HCCLpertest_16_0001.html + +```bash +mpirun -np 8 \ # MPI 进程数量 + taskset -c 0,2,4,6,8,10,12,14\ # 将 MPI 进程绑定到 0,2,4,6,8,10,12,14 CPU 核(设置 CPU 亲和性,避免操作系统调度干扰,降低波动) + ./bin/all_reduce_test \ # 可执行文件路径 + -b 1k \ # 测试数据大小的最小值,单位:Byte + -e 1k \ # 测试数据大小的最大值,单位:Byte + -d fp32 \ # 测试数据的数据类型 + -o sum \ # Reduce 操作类型 + -p 8 \ # NPU 数量 + -w 100 \ # 预热迭代次数,不计入性能统计 + -n 500 # 迭代次数 +``` + +> 【注意】赛事工作组评测选手代码时会执行 10 次上述命令,取带宽的均值作为性能得分 + +### 7.3 使用 Profiling 工具分析程序性能 + +> 【注意】开启 profiling 后性能会有所下降 + +1. 生成 profiling 数据 + +```bash +# 开启 Profiling 开关 +export HCCL_TEST_PROFILING=1 +export HCCL_TEST_PROFILING_PATH=/home/hccluser/prof + +# 执行 HCCLTest 用例 +# 会在 /home/hccluser/prof 目录下生成 4 个文件夹,对应每张 NPU 卡 +cd /home/hccluser/Ascend/ascend-toolkit/latest/tools/hccl_test +mpirun -np 8 taskset -c 0,2,4,6,8,10,12,14 ./bin/all_reduce_test -b 1k -e 1k -d fp32 -o sum -p 8 -w 100 -n 500 + +# 导出 Profiling 结果 +cd /home/hccluser/prof +msprof --export=on --output=./ + +# 把每张 NPU 的 Profiling 结果复制到 timeline 目录,包含 8 个 json 文件 +mkdir -p timeline +cp -i PROF*/mindstudio_profiler_output/msprof*.json timeline/ +``` + +2. 复制 profiling 结果到本地 + +在选手本地 PC 终端中使用 `scp` 命令将 profiling 结果复制到本地桌面: + +```bash +scp -P PORT hccluser@IP:/home/hccluser/prof/timeline/*.json ~/Desktop +``` + +3. 使用 Chrome 浏览器打开 profiling 结果 + +浏览器打开:`chrome://tracing`,将 json 文件拖拽到浏览器中,即可打开 + +使用方法:通过键盘上的快捷键(w:放大,s:缩小,a:左移,d:右移)进行查看 + +## 8. 提交代码 + +执行下列脚本,将选手代码拷贝到 `/result` 目录下 + +```bash +cd /home/hccluser/cann-hccl + +bash submit.sh +``` + +该脚本将选手编写的定制算法文件拷贝至 `/result` 目录下,用于后续评测: + +1. `custom_all_reduce_operator.h` +2. `custom_all_reduce_operator.cc` +3. `coll_custom_small_all_reduce_mesh_executor.h` +4. `coll_custom_small_all_reduce_mesh_executor.cc` +5. `coll_custom_medium_all_reduce_mesh_executor.h` +6. `coll_custom_medium_all_reduce_mesh_executor.cc` +7. `coll_custom_huge_all_reduce_mesh_executor.h` +8. 
`coll_custom_huge_all_reduce_mesh_executor.cc` + +## 9. 结果公布 + +赛程结束后统一公布成绩 + +> 【注意】选手开发环境与最终评测环境完全一致 + +## 10. 调试代码 + +### 10.1 日志 + +#### 10.1.1 日志打印 + +选手可通过调用日志宏保存日志到文件中,便于调试: + +```c++ +HCCL_DEBUG("[HCCL_CONTEST] Orchestrate start"); +HCCL_INFO("[HCCL_CONTEST] Total count: %u", totalCount); +HCCL_WARNING("[HCCL_CONTEST] Cost: %u ms", cost); +``` + +#### 10.1.2 日志设置 + +1. 日志级别 + +HCCL 日志级别默认为 Error,下面通过环境变量设置为 Info 级别: + +```bash +export ASCEND_GLOBAL_LOG_LEVEL=1 # 0: debug, 1: info, 2: warn, 3: error +``` + +2. 日志目录 + +设置日志存储目录: + +```bash +export ASCEND_PROCESS_LOG_PATH=/home/hccluser/log # 默认为:$HOME/ascend/log +``` + +设置日志输出到控制台: + +```bash +export ASCEND_SLOG_PRINT_TO_STDOUT=1 +``` + +3. 日志数量 + +设置每个进程最多保留的日志数量为较大数字,以防丢失: + +```bash +export ASCEND_HOST_LOG_FILE_NUM=1000 +``` + +### 10.2 Core dump 问题 + +使用 gdb 调试: + +> 【注意】编译算法分析器依赖的 HCCL 代码时默认已开启 `-O0 -g` 编译选项 + +```bash +cd /home/hccluser/cann-hccl + +# 基于算法分析器调试 HCCL 定制算法 +export BUILD_TEST_DIR="/home/hccluser/cann-hccl/build/test/" +export LD_LIBRARY_PATH="${BUILD_TEST_DIR}:${LD_LIBRARY_PATH}" +gdb --args ./build/test/open_hccl_test +``` + +### 10.3 Wrong answer 问题 + +请选手仔细排查定制算法是否符合 AllReduce 算法逻辑 diff --git a/eval.py b/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..5b85020b986e6e1624a5916caadf597c14061d3a --- /dev/null +++ b/eval.py @@ -0,0 +1,264 @@ +import argparse +import subprocess +import csv +import time +import math +import os +import logging +import re + +from typing import List, Optional, Union, Dict, Tuple + + +# 日志 +logger = logging.getLogger("hccl_eval_logger") +logger.setLevel(logging.DEBUG) +# 日志文件打印 +file_fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") +file_handler = logging.FileHandler("hccl_contest_eval.log") +file_handler.setLevel(logging.DEBUG) +file_handler.setFormatter(file_fmt) +# 控制台打印 +console_fmt = logging.Formatter("%(message)s") +console_handler = logging.StreamHandler() +console_handler.setLevel(logging.INFO) 
+console_handler.setFormatter(console_fmt) +logger.addHandler(file_handler) +logger.addHandler(console_handler) + +cmd_t = Union[List[str], str] + +ascend_home_path: str = os.getenv("ASCEND_HOME_PATH", default="") + + +def exec( + cmd: cmd_t, + /, + pwd: Optional[str] = None, + env: Optional[Dict[str, str]] = None, +) -> Tuple[int, str, str]: + """执行命令并获取输出""" + result = subprocess.run( + cmd, + cwd=pwd, + env=env, + shell=True, + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + return result.returncode, result.stdout, result.stderr + + +class HcclTestResult: + data_size: int = 0 # 数据大小(Bytes) + aveg_time: float = 0.0 # 平均时间(us) + alg_bandwidth: float = 0.0 # 算法带宽(GB/s) + check_result: str = "failed" # 检查结果 + + @property + def headers(self) -> List[str]: + return [ + "data_size(Bytes)", + "aveg_time(us)", + "alg_bandwidth(GB/s)", + "check_result", + ] + + def __str__(self): + return f"alg_bandwidth: {self.alg_bandwidth}, check_result: {self.check_result}" + + @classmethod + def parse(cls, output: str): + """ + 解析 HCCLTest 输出结果 + + 结果正确输出样例: + + $ mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 4 -w 100 -n 500 + the minbytes is 524288, maxbytes is 524288, iters is 500, warmup_iters is 100 + data_size(Bytes): | aveg_time(us): | alg_bandwidth(GB/s): | check_result: + 524288 | 102.29 | 5.12530 | success + + 结果错误输出样例: + + $ mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 4 -w 100 -n 500 + the minbytes is 67108864, maxbytes is 67108864, iters is 500, warmup_iters is 100 + check buf[14783552] error, exp:8.000000, act:6.000000 + total err is 192 + rank id 0, check result failed, 67108864 | 3665.90 | 18.30623 | failed + data_size(Bytes): | aveg_time(us): | alg_bandwidth(GB/s): | check_result: + 67108864 | 3665.90 | 18.30623 | failed + """ + + headers = [ + "data_size(Bytes)", + "aveg_time(us)", + "alg_bandwidth(GB/s)", + "check_result", + ] + + lines = 
output.splitlines() + test_rst = HcclTestResult() + + def parse_line(line: str) -> HcclTestResult: + parts = [p.strip() for p in line.split("|")] + try: + rst = HcclTestResult() + rst.data_size = int(parts[0]) + rst.aveg_time = float(parts[1]) + rst.alg_bandwidth = float(parts[2]) + rst.check_result = parts[3] + except (ValueError, IndexError) as e: + logger.error("Failed to parse: %s", line) + logger.exception("Error: %s", e) + raise e + return rst + + for idx, line in enumerate(lines): + # 标题行 + if all(header in line for header in headers): + # 解析标题行的下一行 + assert idx + 1 < len(lines) + test_rst = parse_line(lines[idx + 1]) + + # 结果错误,带宽设为 0,不得分 + failed_pos = line.find("check result failed") + if failed_pos >= 0: + logger.debug("Check result failed") + # 解析错误行结果 + last_comma_pos = line.find(",", failed_pos) + test_rst = parse_line(line[last_comma_pos + 1 :]) + test_rst.alg_bandwidth = 0.0 + return test_rst + + return test_rst + + +def eval_hccl_test( + *, + npus: int = 4, + iters: int = 10, + interval: int = 5, +): + """ + 评测 HCCLTest + + 分别执行 3 种数据量 10 次,取带宽均值: + mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 4 -w 100 -n 500 + mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 2m -e 2m -d fp32 -o sum -p 4 -w 100 -n 500 + mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 4 -w 100 -n 500 + """ + + data_sizes = ["512k", "2m", "64m"] + pwd = os.path.join(ascend_home_path, "tools", "hccl_test") + + # 3 种数据量 + for size in data_sizes: + cores = ",".join(str(2 * i) for i in range(npus)) + cmd = f"mpirun -np {npus} taskset -c {cores} ./bin/all_reduce_test -b {size} -e {size} -d fp32 -o sum -p {npus} -w 100 -n 500" + + # 跑 10 次测试 + results: List[HcclTestResult] = [] + for i in range(iters): + logger.debug("[%s][%d/%d] Evaluating with cmd: %s", size, i + 1, iters, cmd) + # 执行命令 + _, output, _ = exec(cmd, pwd=pwd) + logger.debug("[%s][%d/%d] Output:\n%s", size, i + 1, iters, output) + # 解析输出 + rst 
= HcclTestResult.parse(output) + results.append(rst) + logger.info("[%s][%d/%d] %s", size, i + 1, iters, rst) + + if i < iters - 1 and interval > 0: + time.sleep(interval) + + total_bw = math.fsum(rst.alg_bandwidth for rst in results) + aveg_bw = total_bw / iters + logger.warning("Data size: %s, average bandwidth: %f(GB/s)", size, aveg_bw) + + +def eval_gtest(): + """ + 评测算法分析器用例,执行 5 种数据量、3 种数据类型共 15 个用例 + + 正确结果样例: + + [----------] 15 tests from AllReduceTest (503 ms total) + + [----------] Global test environment tear-down + [==========] 15 tests from 1 test suite ran. (503 ms total) + [ PASSED ] 15 tests. + + 错误结果样例: + + [----------] 15 tests from AllReduceTest (233 ms total) + + [----------] Global test environment tear-down + [==========] 15 tests from 1 test suite ran. (234 ms total) + [ PASSED ] 14 tests. + [ FAILED ] 1 tests, listed below: + [ FAILED ] AllReduceTest.allreduce_contest_test_910b_512k_int8 + + 1 FAILED TESTS + """ + ld_library_path = os.getenv("LD_LIBRARY_PATH", "") + build_test_path = f"/home/hccluser/cann-hccl/build/test" + env = None + if build_test_path not in ld_library_path: + env = {"LD_LIBRARY_PATH": f"{build_test_path}:{ld_library_path}"} + + cmd = "./open_hccl_test" + logger.debug("Evaluating with cmd: %s", cmd) + _, output, _ = exec(cmd, env=env, pwd=build_test_path) + logger.debug("Output:\n%s", output) + + # 通过数量 + passed_match = re.search(r"\[ PASSED \] (\d+) tests?\.", output) + passed_count = int(passed_match.group(1)) if passed_match else 0 + + # 失败数量 + failed_match = re.search(r"\[ FAILED \] (\d+) tests?", output) + failed_count = int(failed_match.group(1)) if failed_match else 0 + + # 失败用例列表 + failed_tests = [] + if failed_count > 0: + failed_tests = set(re.findall(r"\[ FAILED \] (\w+\.\w+)", output)) + + logger.info("[ PASSED ] %d tests.", passed_count) + if failed_count > 0: + logger.info("[ FAILED ] %d tests, listed below:", failed_count) + for failed_test in failed_tests: + logger.info("[ FAILED ] %s", failed_test) + + 
+def parse_args(): + parser = argparse.ArgumentParser(description="Evaluation tool") + parser.add_argument("--llt", action="store_true", help="LLT tests") + parser.add_argument("--hccltest", action="store_true", help="HCCLTest tests") + parser.add_argument("-p", "--npus", type=int, default=4, help="HCCLTest tests - NPU count") + parser.add_argument("-n", "--iters", type=int, default=10, help="HCCLTest tests - iterations") + parser.add_argument("-i", "--interval", type=int, default=5, help="HCCLTest tests - interval") + return parser.parse_args() + + +def main(): + args = parse_args() + + if args.hccltest: + logger.info("Evaluating by HcclTest") + eval_hccl_test( + npus=args.npus, + iters=args.iters, + interval=args.interval, + ) + + if args.llt: + logger.info("Evaluating LLT tests") + eval_gtest() + + +if __name__ == "__main__": + main() diff --git a/faq.md b/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..f80fdef4e87ef4b0c2b0b35d1e0ba803921da47b --- /dev/null +++ b/faq.md @@ -0,0 +1,118 @@ +# 一、开发环境 + +#### 1. 在我的开发环境中还需要自行安装工具包吗? + +选手环境已提前安装所有软件依赖,选手只需下载 cann-hccl 源码即可,下载方法详见 [参赛指导](./contest.md)。 + +#### 2. 我的开发环境是否有可能挂掉?代码会丢失吗? + +有可能,但概率非常小。开发环境遇到任何问题请及时寻求赛事工作组人员帮助。 + +#### 3. 开发环境中的 NPU 设备会存在多支队伍共用导致资源竞争吗? + +不会,每个队伍的开发环境中的 NPU 设备都是独占的,不会存在冲突。 + +# 二、算法开发 + +#### 1. execMem结构体和param结构体中都有count、inputPtr、ouputPtr变量,有什么区别? + +* param中保存的是本次调用算子的数据,count是本次调用算子在一个rank上总共要存放的数据数量,inputPtr和outputPtr是起始输入输出内存块的指针。 +* 由于CCL_Output buffer的大小有限,需要循环多次中转UserInput数据。每次循环的起始位置都做了一个CCL_Output大小的偏移量。所以execMem维护的是当前循环已经经过偏移的指针位置,count也是本次循环要搬运的数据数量。 + +#### 2. 为什么execMem.outputPtr是已经偏移后的内存指针,在跨卡搬运远端CCL_Output至本地Output时计算目的内存地址还要再加一个偏移值? 
+ +跨卡搬运远端CCL_Output至本地Output过程中计算目的内存地址公式: + +```c++ +dst = DeviceMem::create(execMem.outputPtr + dstRank * param.DataDes.count * unitSize, curSize); +``` + +* 因为execMem.outputPtr中已经加上的偏移是每个rank在output区域相对于自己上一次循环使用的地址的offset。(rank内偏移) +* 而dstRank * param.DataDes.count * unitSize是rank之间在OutputPtr区域上的相对偏移。(rank间偏移) + +![image](img/offset_calc.jpg) + +【例】在算到rank 1在rank 0上的第二个Output指针时要先偏移Rank 0的Output 1 + Output 2 + Output 3,再偏移Rank 1的Output 1。 + +#### 3. 为什么allgather mesh算法实现中不需要ccl_input buffer但在rank本地搬运建链时src内存类型却是ccl input? + +```c++ +CHK_RET(CalcCommPlaneInfo(tag_, commParaLevel0, opTransport[COMM_LEVEL0], + TransportMemType::CCL_INPUT, + TransportMemType::CCL_OUTPUT)); +``` + +因为单算子模式时userinput mem就是CCL_Input buffer,图模式时这两个变量的值才有区别。 + +#### 4. 在跨rank搬运数据,循环遍历除本端rank外所有远端rank时如何得到每个远端rank编号? + +`u32 dstRank = (level0CommInfo.localRank + round + 1) % level0CommInfo.localRankSize;` + +以单机8卡,localRank是rank0为例演示计算过程,如下表所示。round从0\~6(共7次循环),恰好覆盖除自身(0)以外的所有节点(1\~7)。 + +| round | 计算过程 | dstRank | 含义 | +| :---: | :-----------------: | :-----: | :-----------: | +| 0 | (0 + 0 + 1) % 8 = 1 | 1 | 与节点 1 通信 | +| 1 | (0 + 1 + 1) % 8 = 2 | 2 | 与节点 2 通信 | +| 2 | (0 + 2 + 1) % 8 = 3 | 3 | 与节点 3 通信 | +| 3 | (0 + 3 + 1) % 8 = 4 | 4 | 与节点 4 通信 | +| 4 | (0 + 4 + 1) % 8 = 5 | 5 | 与节点 5 通信 | +| 5 | (0 + 5 + 1) % 8 = 6 | 6 | 与节点 6 通信 | +| 6 | (0 + 6 + 1) % 8 = 7 | 7 | 与节点 7 通信 | + +#### 5. 在跨卡传输数据时,从流上在传输前后要进行前同步和后同步,目的是什么? + +* 前同步:确保双方进入 “传输准备” 状态(避免一方已发送,另一方未就绪)。 +* 后同步:确保数据拷贝完成后,再执行后续操作(避免竞态条件)。 + +#### 6. 为什么跨卡传输数据前后需要本卡的主从流都同步一次? + +* 传输前:主流要通知每个从流准备开始工作。每条从流要回复主流准备好了。 +* 传输后:每条从流要通知主流数据搬运结束。主流要恢复从流收到。 + +#### 7. 实现allreduce executor时可以继承非reduce相关类吗? + +可以,按照自己的实现思路按需继承即可。 + +#### 8. 
由于在实现算法编排功能时会用到暂未开源的HCCL平台层接口,以下是比赛可能会用到的编排接口范围 + +| 接口名称 | +| ------------------ | +| HcclD2DMemcpyAsync | +| HcclReduceAsync | +| HcclReduceScatter | +| HcclAllGather | +| HcclReduce | +| HcclBroadcast | + +接口详细信息请参考: +[HCCL接口列表1](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/API/hcclapiref/hcclcpp_07_0001.html) +[HCCL接口列表2](https://gitee.com/ascend/cann-hccl/blob/master/docs/hccl_customized_dev/HcclD2DMemcpyAsync.md) + +#### 9. 在实现allreduce mesh算法时若用到HcclReduceAsync方法需注意跨rank搬运时要使用sdma协议,rdma协议暂不支持。 + +# 三、算法调试 + +#### 1. 算法分析器在检查mesh结构下带reduce的算子是否有内存冲突时可能误报。 + +解决方法:确认无内存冲突后手动关闭内存冲突校验功能:`checker.CloseRankMemCheck();` + +详情见:[集合通信源码定制开发指南](https://gitee.com/ascend/cann-hccl/wikis/HCCL%E8%B5%84%E6%96%99%E4%B9%A6%E6%9E%B6%E6%80%BB%E8%A7%88) + +#### 2. 使用算法分析器调试如何获得更多算法执行信息?怎么看? + +可以在校验时把算法执行时的Task序列打印功能打开:`checker.EnableTaskPrint();` + +检查以下字段是否符合预期: + +* srcSlice、dstSlice:src是要被搬运的数据在哪,dst是要把数据搬到哪。上图蓝色划线表示rank 0在把本卡UserInput buffer中的数据往本卡CCL_Output buffer搬运的两次循环,绿色划线表示rank 0在把本卡CCL_Output buffer往本卡Output buffer搬运的两次循环。循环两次是由于 UserInput buffer中的数据量大于CCL_Output buffer的大小。 +* BufferType:内存类型,比如UserInput/CCL_Output/Output buffer。 +* offset: + * 偏移,用于表示内存指针指向的变化。实际调试过程中出现内存越界、内存冲突原因是偏移计算错误的概率很大。 + * 可以看出绿色划线的CCL_Output buffer的offset一直是0,这是由于用于中转的CCL_Output内存地址一直是固定的。而UserInput和Output buffer的offset一直在随循环递增,每次增加的大小就是CCL_Output的大小。 +* size:内存块大小,最后一次循环的size是尾块数据。大小取决于UserInput是否能被CCL_Output整除。 + +#### 3. 
AllReduce的算法实现注意输入输出的tensor shape要一致。 + +假设rank 0数据为:[1, 2, 3, 4],rank 1数据为:[5, 6, 7, 8], +则经过allreduce后两张卡上的数据都应该是:[6, 8, 10, 12]而非[36]。(注意看初赛题的说明图) diff --git a/img/final_round_question.jpg b/img/final_round_question.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a6bc6582edd81b2a8cc20ac9b2f97aa1c0e81e0d Binary files /dev/null and b/img/final_round_question.jpg differ diff --git a/img/offset_calc.jpg b/img/offset_calc.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5d94577d844b358aeb4b5a5e606a1da5410c2180 Binary files /dev/null and b/img/offset_calc.jpg differ diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/CMakeLists.txt b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/CMakeLists.txt index 0c48f81fd9c5f0f4a78af9386fdaea215614eb97..e8a1d66727c8817b18ba1e6d72b4334b89835a18 100644 --- a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/CMakeLists.txt +++ b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/CMakeLists.txt @@ -24,7 +24,9 @@ set(src_list ${CMAKE_CURRENT_SOURCE_DIR}/coll_all_reduce_order_preserved_executor.cc ${CMAKE_CURRENT_SOURCE_DIR}/coll_all_reduce_aiv_deter_executor.cc ${CMAKE_CURRENT_SOURCE_DIR}/coll_all_reduce_aiv_deter_small_executor.cc - + ${CMAKE_CURRENT_SOURCE_DIR}/coll_custom_small_all_reduce_mesh_executor.cc + ${CMAKE_CURRENT_SOURCE_DIR}/coll_custom_medium_all_reduce_mesh_executor.cc + ${CMAKE_CURRENT_SOURCE_DIR}/coll_custom_huge_all_reduce_mesh_executor.cc ) target_sources(hccl_alg PRIVATE diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.cc b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.cc new file mode 100644 index 
0000000000000000000000000000000000000000..a7f0de8eb828db7b82854ef4912dec9e4831e7c3 --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.cc @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "coll_custom_huge_all_reduce_mesh_executor.h" + +namespace hccl { +CollCustomHugeAllReduceMeshExecutor::CollCustomHugeAllReduceMeshExecutor(const HcclDispatcher dispatcher, + std::unique_ptr &topoMatcher) + : CollCommExecutor(dispatcher, topoMatcher) +{ +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::CalcScratchMemSize(u64 &scratchMemSize) +{ + // 计算所需要申请的 Scratch 内存大小 + // TODO: 选手可根据算法需要自行修改 + scratchMemSize = 0U; + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcScratchMemSize] scratchMemSize: %u", + scratchMemSize); + return HCCL_SUCCESS; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::CalcStreamNum(u32 &streamNum) +{ + // 计算所需要申请的 Stream 数量 + // TODO: 选手可根据算法需要自行修改 + u32 totalStreamNum = topoAttr_.deviceNumPerAggregation; + streamNum = totalStreamNum - 1U; + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcStreamNum] streamNum: %u", streamNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) +{ + // 计算所需要申请的 Notify 数量 + // TODO: 选手可根据算法需要自行修改 + notifyNum = 2U * streamNum; + 
HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcNotifyNum] notifyNum: %u", notifyNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::CalcCommInfo(std::vector &opTransport) +{ + // 计算通信域信息 + // TODO: 选手可根据算法需要自行修改 + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcNotifyNum]"); + + // CCL_Input -> CCL_Output + TransportMemType inputType = TransportMemType::CCL_INPUT; + TransportMemType outputType = TransportMemType::CCL_OUTPUT; + // 建立 Mesh 链路 + CommParaInfo commParaLevel0(COMM_LEVEL0, CommType::COMM_TAG_MESH); + // 构造一级通信域资源请求 + // 最终将调用:CalcMeshTransportReq::CalcTransportRequest() + CHK_RET(CalcCommPlaneInfo(tag_, commParaLevel0, opTransport[COMM_LEVEL0], inputType, outputType)); + return HCCL_SUCCESS; +} + +u64 CollCustomHugeAllReduceMeshExecutor::CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize) +{ + // 计算循环处理的迭代次数 + // TODO: 选手可根据算法需要自行修改 + + u64 maxCountPerLoop = cclBuffSize / unitSize; + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcLoopMaxCount] maxCountPerLoop: %u", + maxCountPerLoop); + return maxCountPerLoop; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::Orchestrate(OpParam ¶m, AlgResourceResponse &algRes) +{ + // 算法编排总入口 + // TODO: 选手可根据算法需要自行修改 + + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][Orchestrate] count: %u", param.DataDes.count); + tag_ = param.tag; + algResResp_ = &algRes; + + // User_Input 和 User_Output 指针 + u8 *userInputPtr = static_cast(param.inputPtr); + u8 *userOutputPtr = static_cast(param.outputPtr); + CHK_PTR_NULL(userInputPtr); + CHK_PTR_NULL(userOutputPtr); + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; + u64 maxCountPerLoop = CalcLoopMaxCount(algRes.cclInputMem.size(), unitSize); + + // 循环处理数据 + for (u64 countLeft = param.DataDes.count, curCount = 0, inputOffset = 0, outputOffset = 0; countLeft > 0;) { + curCount = (countLeft > maxCountPerLoop) ? 
maxCountPerLoop : countLeft; + u64 curSize = curCount * unitSize; // curSize 为三种数据量:512K/2M/64M + + // 构造本次循环所使用的内存信息 + ExecMem execMem; + execMem.count = curCount; // 本次循环处理的数据量 + execMem.inputPtr = userInputPtr + inputOffset; // 本次循环使用的 User_Input 内存指针 + execMem.outputPtr = userOutputPtr + outputOffset; // 本次循环使用的 User_Output 内存指针 + execMem.inputMem = algRes.cclInputMem; // 本端的 CCL_Input 内存 + execMem.outputMem = algRes.cclOutputMem; // 本端的 CCL_Output 内存 + execMem.scratchMem = algRes.scratchMem; // 本端的 Scratch 内存 + + // 处理本次循环 + CHK_RET(KernelRun(param, execMem)); + + // 更新偏移量 + countLeft -= curCount; + inputOffset = curSize; + outputOffset = curSize; + } + return HCCL_SUCCESS; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::KernelRun(const OpParam ¶m, ExecMem &execMem) +{ + // 处理单次循环的数据 + // TODO: 选手可根据算法需要自行修改 + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; // 数据类型的字节数 + u64 curSize = execMem.count * unitSize; // 本次循环需要处理的数据大小,三种数据量:512K/2m/64m,单位:字节 + hccl::Stream &masterStream = const_cast(param.stream); // 主流 + + // TODO: 流同步 + + CHK_RET(CheckCommSize(COMM_LEVEL0, COMM_INDEX_0 + 1)); + SubCommInfo level0CommInfo = GetSubCommInfo(COMM_LEVEL0, COMM_INDEX_0); + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][KernelRun] localRank: %u, localRankSize: %u", + level0CommInfo.localRank, level0CommInfo.localRankSize); + + // TODO: 搬运数据 + + return HCCL_SUCCESS; +} + +REGISTER_EXEC("CustomHugeAllReduceMeshExecutor", CustomHugeAllReduceMesh, CollCustomHugeAllReduceMeshExecutor); +} // namespace hccl diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.h b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..707a5d66739c699d543659bd15487553596a7da7 --- /dev/null +++ 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef COLL_CUSTOM_HUGE_ALLREDUCE_MESH_EXECUTOR_H +#define COLL_CUSTOM_HUGE_ALLREDUCE_MESH_EXECUTOR_H + +#include "coll_comm_executor.h" + +namespace hccl { +class CollCustomHugeAllReduceMeshExecutor : public CollCommExecutor { +public: + CollCustomHugeAllReduceMeshExecutor(const HcclDispatcher dispatcher, std::unique_ptr &topoMatcher); + ~CollCustomHugeAllReduceMeshExecutor() = default; + +private: + /* *************** 资源计算 *************** */ + HcclResult CalcScratchMemSize(u64 &scratchMemSize) override; + HcclResult CalcStreamNum(u32 &streamNum) override; + HcclResult CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) override; + HcclResult CalcCommInfo(std::vector &opTransport) override; + u64 CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize); + + /* *************** 算法编排 *************** */ + HcclResult Orchestrate(OpParam ¶m, AlgResourceResponse &algRes); + HcclResult KernelRun(const OpParam ¶m, ExecMem &execMem) override; +}; +} // namespace hccl + +#endif diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.cc 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..6647cfc91f95b43790688404c770a135be0ad646 --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.cc @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "coll_custom_medium_all_reduce_mesh_executor.h" + +namespace hccl { +CollCustomMediumAllReduceMeshExecutor::CollCustomMediumAllReduceMeshExecutor(const HcclDispatcher dispatcher, + std::unique_ptr &topoMatcher) + : CollCommExecutor(dispatcher, topoMatcher) +{ +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::CalcScratchMemSize(u64 &scratchMemSize) +{ + // 计算所需要申请的 Scratch 内存大小 + // TODO: 选手可根据算法需要自行修改 + scratchMemSize = 0U; + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcScratchMemSize] scratchMemSize: %u", + scratchMemSize); + return HCCL_SUCCESS; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::CalcStreamNum(u32 &streamNum) +{ + // 计算所需要申请的 Stream 数量 + // TODO: 选手可根据算法需要自行修改 + u32 totalStreamNum = topoAttr_.deviceNumPerAggregation; + streamNum = totalStreamNum - 1U; + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcStreamNum] streamNum: %u", streamNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) +{ + // 计算所需要申请的 Notify 数量 + // TODO: 选手可根据算法需要自行修改 + notifyNum = 2U * streamNum; + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcNotifyNum] notifyNum: %u", notifyNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::CalcCommInfo(std::vector &opTransport) +{ + // 计算通信域信息 + // TODO: 选手可根据算法需要自行修改 + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcNotifyNum]"); + + // CCL_Input -> CCL_Output + TransportMemType inputType = TransportMemType::CCL_INPUT; + TransportMemType outputType = TransportMemType::CCL_OUTPUT; + // 建立 Mesh 链路 + CommParaInfo commParaLevel0(COMM_LEVEL0, CommType::COMM_TAG_MESH); + // 构造一级通信域资源请求 + // 最终将调用:CalcMeshTransportReq::CalcTransportRequest() + CHK_RET(CalcCommPlaneInfo(tag_, commParaLevel0, opTransport[COMM_LEVEL0], inputType, outputType)); + return HCCL_SUCCESS; +} + +u64 
CollCustomMediumAllReduceMeshExecutor::CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize) +{ + // 计算循环处理的迭代次数 + // TODO: 选手可根据算法需要自行修改 + + u64 maxCountPerLoop = cclBuffSize / unitSize; + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcLoopMaxCount] maxCountPerLoop: %u", + maxCountPerLoop); + return maxCountPerLoop; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::Orchestrate(OpParam ¶m, AlgResourceResponse &algRes) +{ + // 算法编排总入口 + // TODO: 选手可根据算法需要自行修改 + + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][Orchestrate] count: %u", param.DataDes.count); + tag_ = param.tag; + algResResp_ = &algRes; + + // User_Input 和 User_Output 指针 + u8 *userInputPtr = static_cast(param.inputPtr); + u8 *userOutputPtr = static_cast(param.outputPtr); + CHK_PTR_NULL(userInputPtr); + CHK_PTR_NULL(userOutputPtr); + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; + u64 maxCountPerLoop = CalcLoopMaxCount(algRes.cclInputMem.size(), unitSize); + + // 循环处理数据 + for (u64 countLeft = param.DataDes.count, curCount = 0, inputOffset = 0, outputOffset = 0; countLeft > 0;) { + curCount = (countLeft > maxCountPerLoop) ? 
maxCountPerLoop : countLeft; + u64 curSize = curCount * unitSize; // curSize 为三种数据量:512K/2M/64M + + // 构造本次循环所使用的内存信息 + ExecMem execMem; + execMem.count = curCount; // 本次循环处理的数据量 + execMem.inputPtr = userInputPtr + inputOffset; // 本次循环使用的 User_Input 内存指针 + execMem.outputPtr = userOutputPtr + outputOffset; // 本次循环使用的 User_Output 内存指针 + execMem.inputMem = algRes.cclInputMem; // 本端的 CCL_Input 内存 + execMem.outputMem = algRes.cclOutputMem; // 本端的 CCL_Output 内存 + execMem.scratchMem = algRes.scratchMem; // 本端的 Scratch 内存 + + // 处理本次循环 + CHK_RET(KernelRun(param, execMem)); + + // 更新偏移量 + countLeft -= curCount; + inputOffset = curSize; + outputOffset = curSize; + } + return HCCL_SUCCESS; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::KernelRun(const OpParam ¶m, ExecMem &execMem) +{ + // 处理单次循环的数据 + // TODO: 选手可根据算法需要自行修改 + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; // 数据类型的字节数 + u64 curSize = execMem.count * unitSize; // 本次循环需要处理的数据大小,三种数据量:512K/2m/64m,单位:字节 + hccl::Stream &masterStream = const_cast(param.stream); // 主流 + + // TODO: 流同步 + + CHK_RET(CheckCommSize(COMM_LEVEL0, COMM_INDEX_0 + 1)); + SubCommInfo level0CommInfo = GetSubCommInfo(COMM_LEVEL0, COMM_INDEX_0); + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][KernelRun] localRank: %u, localRankSize: %u", + level0CommInfo.localRank, level0CommInfo.localRankSize); + + // TODO: 搬运数据 + + return HCCL_SUCCESS; +} + +REGISTER_EXEC("CustomMediumAllReduceMeshExecutor", CustomMediumAllReduceMesh, CollCustomMediumAllReduceMeshExecutor); +} // namespace hccl diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.h b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..cfa4d4b328a589683d1547a02970f21a8e88af20 --- /dev/null +++ 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef COLL_CUSTOM_MEDIUM_ALLREDUCE_MESH_EXECUTOR_H +#define COLL_CUSTOM_MEDIUM_ALLREDUCE_MESH_EXECUTOR_H + +#include "coll_comm_executor.h" + +namespace hccl { +class CollCustomMediumAllReduceMeshExecutor : public CollCommExecutor { +public: + CollCustomMediumAllReduceMeshExecutor(const HcclDispatcher dispatcher, std::unique_ptr &topoMatcher); + ~CollCustomMediumAllReduceMeshExecutor() = default; + +private: + /* *************** 资源计算 *************** */ + HcclResult CalcScratchMemSize(u64 &scratchMemSize) override; + HcclResult CalcStreamNum(u32 &streamNum) override; + HcclResult CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) override; + HcclResult CalcCommInfo(std::vector &opTransport) override; + u64 CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize); + + /* *************** 算法编排 *************** */ + HcclResult Orchestrate(OpParam ¶m, AlgResourceResponse &algRes); + HcclResult KernelRun(const OpParam ¶m, ExecMem &execMem) override; +}; +} // namespace hccl + +#endif diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.cc 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..7aebc74774810dea0138225f493241a014b1d612 --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.cc @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "coll_custom_small_all_reduce_mesh_executor.h" + +namespace hccl { +CollCustomSmallAllReduceMeshExecutor::CollCustomSmallAllReduceMeshExecutor(const HcclDispatcher dispatcher, + std::unique_ptr &topoMatcher) + : CollCommExecutor(dispatcher, topoMatcher) +{ +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::CalcScratchMemSize(u64 &scratchMemSize) +{ + // 计算所需要申请的 Scratch 内存大小 + // TODO: 选手可根据算法需要自行修改 + scratchMemSize = 0U; + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcScratchMemSize] scratchMemSize: %u", + scratchMemSize); + return HCCL_SUCCESS; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::CalcStreamNum(u32 &streamNum) +{ + // 计算所需要申请的 Stream 数量 + // TODO: 选手可根据算法需要自行修改 + u32 totalStreamNum = topoAttr_.deviceNumPerAggregation; + streamNum = totalStreamNum - 1U; + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcStreamNum] streamNum: %u", streamNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) +{ + // 计算所需要申请的 Notify 数量 + // TODO: 选手可根据算法需要自行修改 + notifyNum = 2U * streamNum; + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcNotifyNum] notifyNum: %u", notifyNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::CalcCommInfo(std::vector &opTransport) +{ + // 计算通信域信息 + // TODO: 选手可根据算法需要自行修改 + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcNotifyNum]"); + + // CCL_Input -> CCL_Output + TransportMemType inputType = TransportMemType::CCL_INPUT; + TransportMemType outputType = TransportMemType::CCL_OUTPUT; + // 建立 Mesh 链路 + CommParaInfo commParaLevel0(COMM_LEVEL0, CommType::COMM_TAG_MESH); + // 构造一级通信域资源请求 + // 最终将调用:CalcMeshTransportReq::CalcTransportRequest() + CHK_RET(CalcCommPlaneInfo(tag_, commParaLevel0, opTransport[COMM_LEVEL0], inputType, outputType)); + return HCCL_SUCCESS; +} + +u64 
CollCustomSmallAllReduceMeshExecutor::CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize) +{ + // 计算循环处理的迭代次数 + // TODO: 选手可根据算法需要自行修改 + + u64 maxCountPerLoop = cclBuffSize / unitSize; + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcLoopMaxCount] maxCountPerLoop: %u", + maxCountPerLoop); + return maxCountPerLoop; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::Orchestrate(OpParam ¶m, AlgResourceResponse &algRes) +{ + // 算法编排总入口 + // TODO: 选手可根据算法需要自行修改 + + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][Orchestrate] count: %u", param.DataDes.count); + tag_ = param.tag; + algResResp_ = &algRes; + + // User_Input 和 User_Output 指针 + u8 *userInputPtr = static_cast(param.inputPtr); + u8 *userOutputPtr = static_cast(param.outputPtr); + CHK_PTR_NULL(userInputPtr); + CHK_PTR_NULL(userOutputPtr); + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; + u64 maxCountPerLoop = CalcLoopMaxCount(algRes.cclInputMem.size(), unitSize); + + // 循环处理数据 + for (u64 countLeft = param.DataDes.count, curCount = 0, inputOffset = 0, outputOffset = 0; countLeft > 0;) { + curCount = (countLeft > maxCountPerLoop) ? 
maxCountPerLoop : countLeft; + u64 curSize = curCount * unitSize; // curSize 为三种数据量:512K/2M/64M + + // 构造本次循环所使用的内存信息 + ExecMem execMem; + execMem.count = curCount; // 本次循环处理的数据量 + execMem.inputPtr = userInputPtr + inputOffset; // 本次循环使用的 User_Input 内存指针 + execMem.outputPtr = userOutputPtr + outputOffset; // 本次循环使用的 User_Output 内存指针 + execMem.inputMem = algRes.cclInputMem; // 本端的 CCL_Input 内存 + execMem.outputMem = algRes.cclOutputMem; // 本端的 CCL_Output 内存 + execMem.scratchMem = algRes.scratchMem; // 本端的 Scratch 内存 + + // 处理本次循环 + CHK_RET(KernelRun(param, execMem)); + + // 更新偏移量 + countLeft -= curCount; + inputOffset = curSize; + outputOffset = curSize; + } + return HCCL_SUCCESS; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::KernelRun(const OpParam ¶m, ExecMem &execMem) +{ + // 处理单次循环的数据 + // TODO: 选手可根据算法需要自行修改 + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; // 数据类型的字节数 + u64 curSize = execMem.count * unitSize; // 本次循环需要处理的数据大小,三种数据量:512K/2m/64m,单位:字节 + hccl::Stream &masterStream = const_cast(param.stream); // 主流 + + // TODO: 流同步 + + CHK_RET(CheckCommSize(COMM_LEVEL0, COMM_INDEX_0 + 1)); + SubCommInfo level0CommInfo = GetSubCommInfo(COMM_LEVEL0, COMM_INDEX_0); + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][KernelRun] localRank: %u, localRankSize: %u", + level0CommInfo.localRank, level0CommInfo.localRankSize); + + // TODO: 搬运数据 + + return HCCL_SUCCESS; +} + +REGISTER_EXEC("CustomSmallAllReduceMeshExecutor", CustomSmallAllReduceMesh, CollCustomSmallAllReduceMeshExecutor); +} // namespace hccl diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.h b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..f373a0e2552bfa697f6a34d91d8d4933d02454b1 --- /dev/null +++ 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef COLL_CUSTOM_SMALL_ALLREDUCE_MESH_EXECUTOR_H +#define COLL_CUSTOM_SMALL_ALLREDUCE_MESH_EXECUTOR_H + +#include "coll_comm_executor.h" + +namespace hccl { +class CollCustomSmallAllReduceMeshExecutor : public CollCommExecutor { +public: + CollCustomSmallAllReduceMeshExecutor(const HcclDispatcher dispatcher, std::unique_ptr &topoMatcher); + ~CollCustomSmallAllReduceMeshExecutor() = default; + +private: + /* *************** 资源计算 *************** */ + HcclResult CalcScratchMemSize(u64 &scratchMemSize) override; + HcclResult CalcStreamNum(u32 &streamNum) override; + HcclResult CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) override; + HcclResult CalcCommInfo(std::vector &opTransport) override; + u64 CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize); + + /* *************** 算法编排 *************** */ + HcclResult Orchestrate(OpParam ¶m, AlgResourceResponse &algRes); + HcclResult KernelRun(const OpParam ¶m, ExecMem &execMem) override; +}; +} // namespace hccl + +#endif diff --git a/src/domain/collective_communication/algorithm/impl/operator/CMakeLists.txt b/src/domain/collective_communication/algorithm/impl/operator/CMakeLists.txt index ccf812fc1fcb2faf7b662bdca2e88af2d7a22fc6..b1726cdc33ad5a3b00f7599a09ace3dc80e20c7a 
100644 --- a/src/domain/collective_communication/algorithm/impl/operator/CMakeLists.txt +++ b/src/domain/collective_communication/algorithm/impl/operator/CMakeLists.txt @@ -13,6 +13,7 @@ set(src_list ${CMAKE_CURRENT_SOURCE_DIR}/send_operator.cc ${CMAKE_CURRENT_SOURCE_DIR}/receive_operator.cc ${CMAKE_CURRENT_SOURCE_DIR}/batch_write_operator.cc + ${CMAKE_CURRENT_SOURCE_DIR}/custom_all_reduce_operator.cc ) target_sources(hccl_alg PRIVATE diff --git a/src/domain/collective_communication/algorithm/impl/operator/all_reduce_operator.cc b/src/domain/collective_communication/algorithm/impl/operator/all_reduce_operator.cc index 626018dcfae059a434d159012a11adfc29a8901d..76bfd01ca32e1478b6ac2288d5e432fb543cb410 100644 --- a/src/domain/collective_communication/algorithm/impl/operator/all_reduce_operator.cc +++ b/src/domain/collective_communication/algorithm/impl/operator/all_reduce_operator.cc @@ -599,6 +599,6 @@ HcclResult AllReduceOperator::SelectAlgfor91093(const OpParam& param, std::strin return HCCL_SUCCESS; } -REGISTER_OP(HcclCMDType::HCCL_CMD_ALLREDUCE, AllReduce, AllReduceOperator); +// REGISTER_OP(HcclCMDType::HCCL_CMD_ALLREDUCE, AllReduce, AllReduceOperator); } diff --git a/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc new file mode 100644 index 0000000000000000000000000000000000000000..66dface0e95568ae535d3fe3202695de8ae29b7e --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "custom_all_reduce_operator.h" +#include "coll_alg_op_registry.h" + +namespace hccl { + +CustomAllReduceOperator::CustomAllReduceOperator(AlgConfigurator *algConfigurator, CCLBufferManager &cclBufferManager, + HcclDispatcher dispatcher, std::unique_ptr &topoMatcher) + : CollAlgOperator(algConfigurator, cclBufferManager, dispatcher, topoMatcher, HcclCMDType::HCCL_CMD_ALLREDUCE) +{ +} + +CustomAllReduceOperator::~CustomAllReduceOperator() {} + +HcclResult CustomAllReduceOperator::SelectAlg(const std::string &tag, const OpParam ¶m, std::string &algName, + std::string &newTag) +{ + constexpr u64 HCCL_CONTEST_SMALL_COUNT = 512 * 1024; // 512KB + constexpr u64 HCCL_CONTEST_MEDIUM_COUNT = 2 * 1024 * 1024; // 2MB + constexpr u64 HCCL_CONTEST_HUGE_COUNT = 64 * 1024 * 1024; // 64MB + + // 算法选择逻辑 + // TODO: 选手可根据数据量大小选择合适的 Executor + // 注意: + // 1. 相同算法在不同数据量下的性能不同 + // 2. 
选手可以先只实现一个 Executor,算法选择时直接设置 algName 为该 Executor 的名字 + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; + u64 dataSize = param.DataDes.count * unitSize; // 单位:字节,三种数据量:512K/2M/64M + if (dataSize <= HCCL_CONTEST_SMALL_COUNT) { + algName = "CustomSmallAllReduceMeshExecutor"; + } else if (dataSize <= HCCL_CONTEST_MEDIUM_COUNT) { + algName = "CustomMediumAllReduceMeshExecutor"; + } else { + algName = "CustomHugeAllReduceMeshExecutor"; + } + return HCCL_SUCCESS; +} + +// 注册算子 +REGISTER_OP(HcclCMDType::HCCL_CMD_ALLREDUCE, AllReduce, CustomAllReduceOperator); +} // namespace hccl diff --git a/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.h b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.h new file mode 100644 index 0000000000000000000000000000000000000000..41ae73d0b325fbfcc8d0fb6e41073f0fb2e9e9da --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef CUSTOM_ALL_REDUCE_OPERATOR_H +#define CUSTOM_ALL_REDUCE_OPERATOR_H + +#include "coll_alg_operator.h" + +namespace hccl { +// 数据规模分类 +enum class HcclDataCountType { HCCL_COUNT_SMALL = 0, HCCL_COUNT_MEDIUM, HCCL_COUNT_HUGE, HCCL_COUNT_RESERVED }; + +class CustomAllReduceOperator : public CollAlgOperator { +public: + CustomAllReduceOperator(AlgConfigurator *algConfigurator, CCLBufferManager &cclBufferManager, + HcclDispatcher dispatcher, std::unique_ptr &topoMatcher); + + ~CustomAllReduceOperator(); + + HcclResult SelectAlg(const std::string &tag, const OpParam ¶m, std::string &algName, + std::string &newTag) override; +}; +} // namespace hccl +#endif diff --git a/submit.sh b/submit.sh new file mode 100755 index 0000000000000000000000000000000000000000..b7b0d0ab5dcdf15b34c94e8e4b9997d974b64511 --- /dev/null +++ b/submit.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e + +# bash fonts colors +red='\e[31m' +yellow='\e[33m' +green='\e[92m' +none='\e[0m' + +error() { echo -e "${red}$*${none}" && exit 1; } +warning() { echo -e "${yellow}$*${none}"; } +info() { echo -e "${green}$*${none}"; } + +src_dir="/home/hccluser/cann-hccl" +dst_dir="/result" + +operator_dir="src/domain/collective_communication/algorithm/impl/operator" +executor_dir="src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce" + +files=( + "${operator_dir}/custom_all_reduce_operator.h" + "${operator_dir}/custom_all_reduce_operator.cc" + "${executor_dir}/coll_custom_small_all_reduce_mesh_executor.h" + "${executor_dir}/coll_custom_small_all_reduce_mesh_executor.cc" + "${executor_dir}/coll_custom_medium_all_reduce_mesh_executor.h" + "${executor_dir}/coll_custom_medium_all_reduce_mesh_executor.cc" + "${executor_dir}/coll_custom_huge_all_reduce_mesh_executor.h" + "${executor_dir}/coll_custom_huge_all_reduce_mesh_executor.cc" +) + +for file in "${files[@]}"; do + file_path="${src_dir}/${file}" + if [ -f "${file_path}" ]; then + cp -i "${file_path}" "${dst_dir}" + info "Copied: 
${file_path} to ${dst_dir}" + else + error "No such file: ${file_path}" + fi +done + +info "All files copied successfully to ${dst_dir}" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bbd859a0a1b8c3c909b22ca164e61fe8bac762de..8475b050c67bc7d31151d64bc486044d4e8e236b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -418,6 +418,7 @@ set(src_list_alg ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/registry/coll_alg_op_registry.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/coll_alg_operator.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/all_reduce_operator.cc + ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/custom_all_reduce_operator.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/all_gather_operator.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/all_gather_v_operator.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/reduce_scatter_operator.cc @@ -477,8 +478,7 @@ target_compile_options(hccl_alg_test PRIVATE -fno-strict-aliasing -pipe -std=c++14 - -Os - -O2 + -O0 -g -fstack-protector-all $<$:-fsanitize=address -fsanitize-recover=address,all -fno-omit-frame-pointer -g> ) @@ -521,9 +521,10 @@ add_custom_target(hccl_alg_test_lib COMMAND cd ${CMAKE_INSTALL_PREFIX}/hccl_lib ) -add_custom_command(TARGET hccl_alg_test POST_BUILD - COMMAND ${CMAKE_STRIP} $ -) +# 禁用 strip +# add_custom_command(TARGET hccl_alg_test POST_BUILD +# COMMAND ${CMAKE_STRIP} $ +# ) install(TARGETS hccl_alg_test LIBRARY DESTINATION lib OPTIONAL diff --git a/test/algorithm/testcase/main.cc b/test/algorithm/testcase/main.cc index db0348934bb91d55ae548c96ff7623af97950ea4..74d7e921f7c4e28ecb4233882d2772a884b8dabd 100644 --- a/test/algorithm/testcase/main.cc +++ b/test/algorithm/testcase/main.cc @@ -2,7 +2,7 @@ GTEST_API_ int main(int argc, char **argv) { // testcase调试代码,只跑特定的用例 - //testing::GTEST_FLAG(filter) = "AllReduceTest.allreduce_cyw_test"; + testing::GTEST_FLAG(filter) = "AllReduceTest.allreduce_contest_test*"; testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git 
a/test/algorithm/testcase/testcase_all_reduce.cc b/test/algorithm/testcase/testcase_all_reduce.cc index 7dc31ad1ebe5cc70593e73665371ef611ef8a618..e3f10e34a4135e08d828173674cf7257e5f82dc0 100644 --- a/test/algorithm/testcase/testcase_all_reduce.cc +++ b/test/algorithm/testcase/testcase_all_reduce.cc @@ -1751,4 +1751,394 @@ TEST_F(AllReduceTest, allreduce_aiv_determinstic_test) ret = checker.Check(checkerOpParam, topoMeta); // EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); -} \ No newline at end of file +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_512k_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 512k + int8 + u64 size = 512 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_2m_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 2m + int8 + u64 size = 2 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret 
= checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_64m_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 64m + int8 + u64 size = 64 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_1g_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 1g + int8 + u64 size = 1 * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_4g_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 4g + int8 + u64 size = 4LLU * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + 
checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_512k_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 512k + fp16 + u64 size = 512 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_2m_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 2m + fp16 + u64 size = 2 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = 
checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_64m_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 64m + fp16 + u64 size = 64 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_1g_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 1g + fp16 + u64 size = 1 * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_4g_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 4g + fp16 + u64 size = 4LLU * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + 
checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_512k_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 512k + fp32 + u64 size = 512 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_2m_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 2m + fp32 + u64 size = 2 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = 
checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_64m_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 64m + fp32 + u64 size = 64 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_1g_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 1g + fp32 + u64 size = 1 * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_4g_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 4g + fp32 + u64 size = 4LLU * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + 
checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +}