From 45a690248b7c0d972511dc768937992e0d740ea7 Mon Sep 17 00:00:00 2001 From: zangyan Date: Mon, 14 Jul 2025 09:11:19 +0000 Subject: [PATCH 1/7] !65 update README.md. Merge pull request !65 from zangyan/r1.5.1 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6f58944..04bb336 100644 --- a/README.md +++ b/README.md @@ -232,7 +232,7 @@ HCCL软件包安装完成后,开发者可通过HCCL Test工具进行集合通 ## 相关文档 -HCCL提供了使用指南、环境变量参考、基于本源码仓进行定制的开发指南、算法分析工具使用指导等,详细可参见[HCCL资料书架总览](https://gitee.com/ascend/cann-hccl/wikis/HCCL%E8%B5%84%E6%96%99%E4%B9%A6%E6%9E%B6%E6%80%BB%E8%A7%88)。 +HCCL提供了用户指南、环境变量参考、基于源码仓进行算法与算子定制的开发指南等,详细可参见[HCCL资料书架总览](https://gitee.com/ascend/cann-hccl/wikis/HCCL%E8%B5%84%E6%96%99%E4%B9%A6%E6%9E%B6%E6%80%BB%E8%A7%88)。 ## 贡献指南 -- Gitee From 1a02ac41525904486020d8f9b535817065a9ea98 Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Fri, 18 Jul 2025 13:52:56 +0000 Subject: [PATCH 2/7] =?UTF-8?q?!70=20[HCCL=E7=AB=9E=E8=B5=9B]=20=E8=B5=9B?= =?UTF-8?q?=E9=A2=98=E6=A1=86=E6=9E=B6+=E8=B5=9B=E9=A2=98=E6=8C=87?= =?UTF-8?q?=E5=AF=BC+=E8=B5=9B=E9=A2=98FAQ=20*=20contest?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 6 + CMakeLists.txt | 2 + Dockerfile | 193 +++++++++ contest.md | 280 +++++++++++++ faq.md | 118 ++++++ img/offset_calc.jpg | Bin 0 -> 29402 bytes .../coll_all_reduce/CMakeLists.txt | 4 +- ...ll_custom_huge_all_reduce_mesh_executor.cc | 142 +++++++ ...oll_custom_huge_all_reduce_mesh_executor.h | 36 ++ ..._custom_medium_all_reduce_mesh_executor.cc | 142 +++++++ ...l_custom_medium_all_reduce_mesh_executor.h | 36 ++ ...l_custom_small_all_reduce_mesh_executor.cc | 142 +++++++ ...ll_custom_small_all_reduce_mesh_executor.h | 36 ++ .../algorithm/impl/operator/CMakeLists.txt | 1 + .../impl/operator/all_reduce_operator.cc | 2 +- .../operator/custom_all_reduce_operator.cc | 51 +++ .../operator/custom_all_reduce_operator.h | 31 ++ submit.sh | 42 ++ test/CMakeLists.txt | 11 +- 
test/algorithm/testcase/main.cc | 2 +- .../algorithm/testcase/testcase_all_reduce.cc | 392 +++++++++++++++++- 21 files changed, 1660 insertions(+), 9 deletions(-) create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 contest.md create mode 100644 faq.md create mode 100644 img/offset_calc.jpg create mode 100644 src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.cc create mode 100644 src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.h create mode 100644 src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.cc create mode 100644 src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.h create mode 100644 src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.cc create mode 100644 src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.h create mode 100644 src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc create mode 100644 src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.h create mode 100755 submit.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4d70f07 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.vscode +.idea +build +output +__pycache__ +*.log diff --git a/CMakeLists.txt b/CMakeLists.txt index c45e7ae..e1fe63f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,8 @@ project(hccl) option(BUILD_OPEN_PROJECT "Build open hccl project." 
ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + if(BUILD_OPEN_PROJECT) include(cmake/config.cmake) add_subdirectory(src/domain/collective_communication) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..ae431ab --- /dev/null +++ b/Dockerfile @@ -0,0 +1,193 @@ +# NOTE: Building this image requires docker version >= 18.0 + +ARG TARGETPLATFORM=linux/arm64 +ARG BASE_IMAGE=ubuntu:22.04 +ARG PYTHON_VERSION=3.10 + +# 阶段 1:安装依赖 +FROM ${BASE_IMAGE} AS base + +RUN apt-get update \ + && apt-get install --no-install-recommends -y \ + apt-transport-https \ + ca-certificates \ + build-essential \ + bash \ + curl \ + git \ + wget \ + gcc \ + g++ \ + make \ + cmake \ + zlib1g \ + openssl \ + unzip \ + pciutils \ + net-tools \ + gfortran \ + patchelf \ + libblas3 \ + libblas-dev \ + libssl-dev \ + zlib1g-dev \ + libncurses5-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + libffi-dev \ + libnss3-dev \ + libgdbm-dev \ + liblzma-dev \ + libev-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /var/tmp/* \ + && rm -rf /tmp/* + +# 阶段 2:安装 Conda +FROM base AS conda-installer + +ARG TARGETPLATFORM +ARG PYTHON_VERSION + +RUN case ${TARGETPLATFORM} in \ + "linux/arm64") ARCH=aarch64 ;; \ + *) ARCH=x86_64 ;; \ + esac && \ + curl -fsSL -o /tmp/miniconda.sh -O "https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-latest-Linux-${ARCH}.sh" + +RUN chmod +x /tmp/miniconda.sh && \ + bash /tmp/miniconda.sh -b -p /opt/conda && \ + rm /tmp/miniconda.sh && \ + /opt/conda/bin/conda install -y python=${PYTHON_VERSION} && \ + /opt/conda/bin/conda clean -ya + +# 阶段 3:安装 CANN 8.2.RC1.alpha003 +FROM conda-installer AS cann-installer + +ARG TARGETPLATFORM +ENV PATH=/opt/conda/bin:${PATH} + +RUN pip install --no-cache-dir -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple \ + attrs cython numpy==1.24.0 decorator sympy cffi pyyaml pathlib2 \ + psutil protobuf==3.20 scipy requests absl-py + +RUN case ${TARGETPLATFORM} in \ + 
"linux/arm64") ARCH=aarch64 ;; \ + *) ARCH=x86_64 ;; \ + esac && \ + CANN_TOOLKIT_URL="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-toolkit_8.2.RC1.alpha003_linux-${ARCH}.run" && \ + curl -fsSL -o /tmp/Ascend-cann-toolkit.run -O "${CANN_TOOLKIT_URL}" && \ + CANN_COMMUNITY_SDK_URL="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-communitysdk_8.2.RC1.alpha003_linux-${ARCH}.run" && \ + curl -fsSL -o /tmp/Ascend-cann-communitysdk.run -O "${CANN_COMMUNITY_SDK_URL}" + +# 安装 CANN Toolkit +RUN chmod +x /tmp/Ascend-cann-toolkit.run && \ + /tmp/Ascend-cann-toolkit.run --quiet --install && \ + rm /tmp/Ascend-cann-toolkit.run + +# 安装 Community SDK +RUN chmod +x /tmp/Ascend-cann-communitysdk.run && \ + /tmp/Ascend-cann-communitysdk.run --quiet --full && \ + rm /tmp/Ascend-cann-communitysdk.run + +# 阶段 4:下载 HCCL 仓库及其依赖 +FROM cann-installer AS hccl-installer + +WORKDIR /workspace + +RUN curl -fsSL -o /tmp/include.zip -O https://github.com/nlohmann/json/releases/download/v3.11.2/include.zip && \ + unzip -d /workspace/nlohmann_json /tmp/include.zip && \ + rm /tmp/include.zip + +# 安装 MPI +RUN curl -fsSL -o /tmp/mpich.tar.gz -O https://www.mpich.org/static/downloads/3.2.1/mpich-3.2.1.tar.gz && \ + tar -zxf /tmp/mpich.tar.gz -C /workspace && \ + cd /workspace/mpich-3.2.1 && \ + ./configure --disable-fortran --prefix=/workspace/mpich --with-device=ch3:nemesis && \ + make && make install && \ + rm -r /workspace/mpich-3.2.1 && \ + rm /tmp/mpich.tar.gz + +# 设置环境变量 +RUN \ + # Conda 环境变量 + echo 'export PATH=/opt/conda/bin:${PATH}' >> /root/.bashrc && \ + # NPU 驱动环境变量 + echo 'export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:${LD_LIBRARY_PATH}' >> /root/.bashrc && \ + echo 'export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/common:${LD_LIBRARY_PATH}' >> /root/.bashrc && \ + # CANN Toolkit 环境变量 + echo 'source /usr/local/Ascend/ascend-toolkit/set_env.sh' >> 
/root/.bashrc && \ + # MPICH 环境变量 + echo 'export PATH=/workspace/mpich/bin:${PATH}' >> /root/.bashrc && \ + echo 'export LD_LIBRARY_PATH=/workspace/mpich/lib:${LD_LIBRARY_PATH}' >> /root/.bashrc + +# 阶段 5:安装 SSH +FROM base AS ssh-installer + +RUN apt-get update \ + && apt-get install --no-install-recommends -y \ + openssh-server + +# SSH 配置 +RUN echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \ + echo "PermitRootLogin yes" >> /etc/ssh/sshd_config && \ + echo "PermitUserEnvironment yes" >> /etc/ssh/sshd_config && \ + echo "ClientAliveInterval 60" >> /etc/ssh/sshd_config && \ + echo "ClientAliveCountMax 3" >> /etc/ssh/sshd_config + +# SSH 启动脚本 +RUN echo '#!/bin/bash' > /start.sh && \ + echo 'if [ -n "${ROOT_PASSWD}" ]; then' >> /start.sh && \ + echo ' echo "root:${ROOT_PASSWD}" | chpasswd' >> /start.sh && \ + echo 'fi' >> /start.sh && \ + echo 'mkdir -p /var/run/sshd' >> /start.sh && \ + echo 'ssh-keygen -A' >> /start.sh && \ + echo '/usr/sbin/sshd -D -e' >> /start.sh && \ + chmod +x /start.sh + +# 最终阶段:安装运行所需依赖,复制前面阶段结果 +FROM ${BASE_IMAGE} AS official + +ENV ROOT_PASSWD=change_me + +SHELL [ "/bin/bash", "-c" ] + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + apt-transport-https \ + ca-certificates \ + bash \ + libc6 \ + libsqlite3-dev \ + git \ + gcc \ + g++ \ + gdb \ + make \ + cmake \ + file \ + vim \ + netcat \ + curl \ + wget \ + openssh-server \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /var/tmp/* \ + && rm -rf /tmp/* + +COPY --from=cann-installer /opt/conda /opt/conda +COPY --from=cann-installer /etc/Ascend /etc/Ascend +COPY --from=cann-installer /usr/local/Ascend /usr/local/Ascend +COPY --from=hccl-installer /root/.bashrc /root/.bashrc +COPY --from=hccl-installer /workspace /workspace +COPY --from=ssh-installer /etc/ssh/sshd_config /etc/ssh/sshd_config +COPY --from=ssh-installer /start.sh /start.sh + +EXPOSE 22 + +WORKDIR /workspace + +CMD [ "/start.sh" ] diff 
--git a/contest.md b/contest.md new file mode 100644 index 0000000..26b6e9f --- /dev/null +++ b/contest.md @@ -0,0 +1,280 @@ +# HCCL 通信库创新大赛操作指导 + +## 0. 赛前须知 + +### 0.1 技能要求 + +1. 熟悉 C++14 编程语言 +2. 了解 GDB、LLDB 等调试工具 +3. 了解 VSCode、CLion 等 IDE 开发工具 +4. 了解 AllReduce 等集合通信原语 + +### 0.2 资料 + +HCCL 资料: + +- [昇腾社区官网][1]、[HCCL——昇腾社区][2] +- [HCCL概述——昇腾社区][3] +- [集合通信原语——昇腾社区][4] +- [HCCL代码仓][5] +- [HCCL Wiki][6] + +定制算法开发指南: + +1. [HCCL源码定制开发指南][7] +2. [AllGather 定制算法实现][8] +3. [HCCL 通信库创新大赛参赛 FAQ](./faq.md) + +[1]: https://www.hiascend.com +[2]: https://www.hiascend.com/hccl +[3]: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha003/hccl/hcclug/hcclug_000001.html +[4]: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha003/hccl/hcclug/hcclug_000004.html +[5]: https://gitee.com/ascend/cann-hccl +[6]: https://gitee.com/ascend/cann-hccl/wikis +[7]: https://gitee.com/ascend/cann-hccl/blob/master/docs/hccl_customized_dev/README.md +[8]: https://gitee.com/ascend/cann-hccl/pulls/64 + +### 0.3 评分标准 + +组委会将从功能、性能、代码风格 3 个维度对参赛代码进行综合评测,评测公式: + +- 15 分功能分:15 个算法分析器用例,每个 1 分,通过得 1 分,不通过得 0 分 + + > 5 种数据量:512k/2m/64m/1g/4g,3 种数据格式:int8/fp16/fp32 + +- 75 分性能分:3 个 HCCLTest 用例,每个 25 分,不通过得 0 分,通过则按照性能计分,性能最佳得满分,按照排名依次递减 + + > 3 种数据量:512k/2m/64m,1 种数据格式:fp32 + > + > 性能标准:基于 HCCLTest 工具测试的带宽使用量(字段:`alg_bandwidth(GB/s)`)作为评判标准,数值越高越好 + +- 10 分主观分:代码风格 + +> 【注意】验证方法详见 [算法分析器验证](#71-算法分析器验证)、[HCCLTest工具验证](#72-hccltest-工具验证) + +## 1. 登录环境 + +选手开发环境信息将通过邮件的方式发送至队长邮箱,队伍成员可通过 SSH 进入选手开发环境: + +```bash +ssh root@ip -p port +``` + +## 2. 环境目录 + +选手开发环境是运行在物理机上的 Docker 容器,目录结构如下: + +``` +|-- /dev +| |-- davinci1 # NPU1 +| `-- davinci2 # NPU2 +|-- /etc/Ascend +| `-- ascend_cann_install.info # CANN 安装信息 +|-- /usr/local/Ascend +| |-- ascend-toolkit # CANN Toolkit 安装目录 +| `-- driver # NPU 驱动安装目录 +`-- /workspace + |-- cann-hccl # HCCL 代码仓,选手需自行下载 + |-- mpich # MPICH 安装目录 + `-- nlohmann_json # nlohmann json include 目录 +``` + +## 3. 软件版本 + +> 【注意】 +> +> 1. 
选手开发环境中已安装下列软件依赖 +> 2. 最终评测环境的软件版本与选手开发环境一致 + +- gcc 11.4.0 +- g++ 11.4.0 +- make 4.3 +- cmake 3.22.1 +- mpich 3.2.1 +- CANN Toolkit 8.2.RC1.alpha003 +- CANN Community SDK 8.2.RC1.alpha003 + +## 4. 代码开发 + +### 4.1 下载代码 + +> 【注意】选手只需下载 [ascend/cann-hccl](https://gitee.com/ascend/cann-hccl.git) 代码仓即可,编译运行所需全部依赖已提前安装 + +```bash +cd /workspace + +git clone https://gitee.com/ascend/cann-hccl.git -b r1.5.1 +``` + +### 4.2 IDE 远程开发 + +推荐选手基于 VSCode、CLion 等 IDE,通过 SSH 连接开发环境进行远程开发,参考文档: + +- [VSCode 使用 SSH 远程开发](https://code.visualstudio.com/docs/remote/ssh) +- [CLion 使用 SSH 远程开发](https://www.jetbrains.com/help/clion/remote-development.html) + +### 4.3 定制算法开发 + +在 HCCL 软件架构中,`Operator` 负责算法选择,`Executor` 负责算法编排。为简化流程,选手只需实现以下内容: + +1. [custom_all_reduce_operator.cc](src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc) 中编写算法选择逻辑 +2. [coll_custom_small_all_reduce_mesh_executor.cc](src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.cc) 中编写小数据量(512K)场景的 AllReduce 算法 +3. [coll_custom_medium_all_reduce_mesh_executor.cc](src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.cc) 中编写中等数据量(2M)场景的 AllReduce 算法 +4. [coll_custom_huge_all_reduce_mesh_executor.cc](src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.cc) 中编写大数据量(64M)场景的 AllReduce 算法 + +> 【注意】上述代码文件中,选手需要实现的内容已在代码注释中标明 + +## 5. 编译代码 + +编译所需的依赖项均已安装,在 HCCL 代码仓执行编译即可: + +```bash +cd /workspace/cann-hccl + +bash build.sh --nlohmann_path /workspace/nlohmann_json/include +``` + +## 6. 安装编译结果 + +编译生成的 HCCL 软件包在 `/workspace/cann-hccl/output` 目录下: + +```bash +cd /workspace/cann-hccl/output + +./CANN-hccl_alg-8.2.t12.0.b077-linux.aarch64.run +``` + +安装完成后,用户编译生成的 HCCL 软件包会替换已安装 CANN 开发套件包中的 HCCL 相关软件 + +## 7. 
测试代码 + +### 7.1 算法分析器验证 + +> 【注意】算法分析器能够在无昇腾 NPU 场景下离线测试算法逻辑,包括:死锁检测、资源校验、内存冲突校验等 + +编译并执行算法分析器用例: + +```bash +cd /workspace/cann-hccl + +# 编译测试用例 +bash build.sh --nlohmann_path /workspace/nlohmann_json/include --test --open_hccl_test + +# 执行测试用例 +./build/test/open_hccl_test +``` + +### 7.2 HCCLTest 工具验证 + +> 【注意】性能测试场景可使用 HCCL Test 工具进行验证,该工具基于真实 NPU 设备进行功能和性能测试 + +基于 HCCL Test 工具在 NPU 设备上执行验证: + +```bash +cd /usr/local/Ascend/ascend-toolkit/latest/tools/hccl_test + +# 编译 HCCL 性能测试工具 +make MPI_HOME=/workspace/mpich ASCEND_DIR=/usr/local/Ascend/ascend-toolkit/latest + +# 执行 HCCL Test +# 512K +mpirun -n 2 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 2 +# 2M +mpirun -n 2 ./bin/all_reduce_test -b 2m -e 2m -d fp32 -o sum -p 2 +# 64M +mpirun -n 2 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 2 +``` + +> 工具详细说明可参考:[昇腾文档中心-HCCL 性能测试工具使用指南](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha003/devaids/hccltool/HCCLpertest_16_0001.html) + +## 8. 提交代码 + +执行下列脚本,将选手代码拷贝到 `/result` 目录下 + +```bash +bash /workspace/cann-hccl/submit.sh +``` + +该脚本将选手编写的定制算法文件拷贝至 `/result` 目录下,用于后续评测: + +1. `custom_all_reduce_operator.h` +2. `custom_all_reduce_operator.cc` +3. `coll_custom_small_all_reduce_mesh_executor.h` +4. `coll_custom_small_all_reduce_mesh_executor.cc` +5. `coll_custom_medium_all_reduce_mesh_executor.h` +6. `coll_custom_medium_all_reduce_mesh_executor.cc` +7. `coll_custom_huge_all_reduce_mesh_executor.h` +8. `coll_custom_huge_all_reduce_mesh_executor.cc` + +## 9. 结果公布 + +赛程结束后统一公布成绩 + +> 【注意】选手开发环境与最终评测环境完全一致 + +## 10. 调试代码 + +### 10.1 日志 + +#### 10.1.1 日志打印 + +选手可通过调用日志宏保存日志到文件中,便于调试: + +```c++ +HCCL_DEBUG("[HCCL_CONTEST] Orchestrate start"); +HCCL_INFO("[HCCL_CONTEST] Total count: %u", totalCount); +HCCL_WARNING("[HCCL_CONTEST] Cost: %u ms", cost); +``` + +#### 10.1.2 日志设置 + +1. 日志级别 + +HCCL 日志级别默认为 Error,下面通过环境变量设置为 Info 级别: + +```bash +export ASCEND_GLOBAL_LOG_LEVEL=1 # 0: debug, 1: info, 2: warn, 3: error +``` + +2. 
日志目录 + +设置日志存储目录: + +```bash +export ASCEND_PROCESS_LOG_PATH=/workspace/log # 默认为:$HOME/ascend/log +``` + +设置日志输出到控制台: + +```bash +export ASCEND_SLOG_PRINT_TO_STDOUT=1 +``` + +3. 日志数量 + +设置每个进程最多保留的日志数量为较大数字,以防丢失: + +```bash +export ASCEND_HOST_LOG_FILE_NUM=1000 +``` + +### 10.2 Core dump 问题 + +使用 gdb 调试: + +> 【注意】选手本地开发编译 HCCL 代码时默认已开启 `-O0 -g` 编译选项,但最终评测时会开启 `-O3` + +```bash +cd /usr/local/Ascend/ascend-toolkit/latest/tools/hccl_test + +# 512K +gdb --args mpirun -n 2 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 2 +# 2M +gdb --args mpirun -n 2 ./bin/all_reduce_test -b 2m -e 2m -d fp32 -o sum -p 2 +# 64M +gdb --args mpirun -n 2 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 2 +``` + +### 10.3 Wrong answer 问题 + +请选手仔细排查定制算法是否符合 AllReduce 算法逻辑 diff --git a/faq.md b/faq.md new file mode 100644 index 0000000..f80fdef --- /dev/null +++ b/faq.md @@ -0,0 +1,118 @@ +# 一、开发环境 + +#### 1. 在我的开发环境中还需要自行安装工具包吗? + +选手环境已提前安装所有软件依赖,选手只需下载 cann-hccl 源码即可,下载方法详见 [参赛指导](./contest.md)。 + +#### 2. 我的开发环境是否有可能挂掉?代码会丢失吗? + +有可能,但概率非常小。开发环境遇到任何问题请及时寻求赛事工作组人员帮助。 + +#### 3. 开发环境中的 NPU 设备会存在多支队伍共用导致资源竞争吗? + +不会,每个队伍的开发环境中的 NPU 设备都是独占的,不会存在冲突。 + +# 二、算法开发 + +#### 1. execMem结构体和param结构体中都有count、inputPtr、outputPtr变量,有什么区别? + +* param中保存的是本次调用算子的数据,count是本次调用算子在一个rank上总共要存放的数据数量,inputPtr和outputPtr是起始输入输出内存块的指针。 +* 由于CCL_Output buffer的大小有限,需要循环多次中转UserInput数据。每次循环的起始位置都做了一个CCL_Output大小的偏移量。所以execMem维护的是当前循环已经经过偏移的指针位置,count也是本次循环要搬运的数据数量。 + +#### 2. 为什么execMem.outputPtr是已经偏移后的内存指针,在跨卡搬运远端CCL_Output至本地Output时计算目的内存地址还要再加一个偏移值? + +跨卡搬运远端CCL_Output至本地Output过程中计算目的内存地址公式: + +```c++ +dst = DeviceMem::create(execMem.outputPtr + dstRank * param.DataDes.count * unitSize, curSize); +``` + +* 因为execMem.outputPtr中已经加上的偏移是每个rank在output区域相对于自己上一次循环使用的地址的offset。(rank内偏移) +* 而dstRank * param.DataDes.count * unitSize是rank之间在OutputPtr区域上的相对偏移。(rank间偏移) + +![image](img/offset_calc.jpg) + +【例】在算到rank 1在rank 0上的第二个Output指针时要先偏移Rank 0的Output 1 + Output 2 + Output 3,再偏移Rank 1的Output 1。 + +#### 3. 
为什么allgather mesh算法实现中不需要ccl_input buffer但在rank本地搬运建链时src内存类型却是ccl input? + +```c++ +CHK_RET(CalcCommPlaneInfo(tag_, commParaLevel0, opTransport[COMM_LEVEL0], + TransportMemType::CCL_INPUT, + TransportMemType::CCL_OUTPUT)); +``` + +因为单算子模式时userinput mem就是CCL_Input buffer,图模式时这两个变量的值才有区别。 + +#### 4. 在跨rank搬运数据,循环遍历除本端rank外所有远端rank时如何得到每个远端rank编号? + +`u32 dstRank = (level0CommInfo.localRank + round + 1) % level0CommInfo.localRankSize;` + +以单机8卡,localRank是rank0为例演示计算过程,如下表所示。round从0\~6(共7次循环),恰好覆盖除自身(0)以外的所有节点(1\~7)。 + +| round | 计算过程 | dstRank | 含义 | +| :---: | :-----------------: | :-----: | :-----------: | +| 0 | (0 + 0 + 1) % 8 = 1 | 1 | 与节点 1 通信 | +| 1 | (0 + 1 + 1) % 8 = 2 | 2 | 与节点 2 通信 | +| 2 | (0 + 2 + 1) % 8 = 3 | 3 | 与节点 3 通信 | +| 3 | (0 + 3 + 1) % 8 = 4 | 4 | 与节点 4 通信 | +| 4 | (0 + 4 + 1) % 8 = 5 | 5 | 与节点 5 通信 | +| 5 | (0 + 5 + 1) % 8 = 6 | 6 | 与节点 6 通信 | +| 6 | (0 + 6 + 1) % 8 = 7 | 7 | 与节点 7 通信 | + +#### 5. 在跨卡传输数据时,从流上在传输前后要进行前同步和后同步,目的是什么? + +* 前同步:确保双方进入 “传输准备” 状态(避免一方已发送,另一方未就绪)。 +* 后同步:确保数据拷贝完成后,再执行后续操作(避免竞态条件)。 + +#### 6. 为什么跨卡传输数据前后需要本卡的主从流都同步一次? + +* 传输前:主流要通知每个从流准备开始工作。每条从流要回复主流准备好了。 +* 传输后:每条从流要通知主流数据搬运结束。主流要回复从流收到。 + +#### 7. 实现allreduce executor时可以继承非reduce相关类吗? + +可以,按照自己的实现思路按需继承即可。 + +#### 8. 由于在实现算法编排功能时会用到暂未开源的HCCL平台层接口,以下是比赛可能会用到的编排接口范围 + +| 接口名称 | +| ------------------ | +| HcclD2DMemcpyAsync | +| HcclReduceAsync | +| HcclReduceScatter | +| HcclAllGather | +| HcclReduce | +| HcclBroadcast | + +接口详细信息请参考: +[HCCL接口列表1](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/API/hcclapiref/hcclcpp_07_0001.html) +[HCCL接口列表2](https://gitee.com/ascend/cann-hccl/blob/master/docs/hccl_customized_dev/HcclD2DMemcpyAsync.md) + +#### 9. 在实现allreduce mesh算法时若用到HcclReduceAsync方法需注意跨rank搬运时要使用sdma协议,rdma协议暂不支持。 + +# 三、算法调试 + +#### 1. 
算法分析器在检查mesh结构下带reduce的算子是否有内存冲突时可能误报。 + +解决方法:确认无内存冲突后手动关闭内存冲突校验功能:`checker.CloseRankMemCheck();` + +详情见:[集合通信源码定制开发指南](https://gitee.com/ascend/cann-hccl/wikis/HCCL%E8%B5%84%E6%96%99%E4%B9%A6%E6%9E%B6%E6%80%BB%E8%A7%88) + +#### 2. 使用算法分析器调试如何获得更多算法执行信息?怎么看? + +可以在校验时把算法执行时的Task序列打印功能打开:`checker.EnableTaskPrint();` + +检查以下字段是否符合预期: + +* srcSlice、dstSlice:src是要被搬运的数据在哪,dst是要把数据搬到哪。上图蓝色划线表示rank 0在把本卡UserInput buffer中的数据往本卡CCL_Output buffer搬运的两次循环,绿色划线表示rank 0在把本卡CCL_Output buffer往本卡Output buffer搬运的两次循环。循环两次是由于 UserInput buffer中的数据量大于CCL_Output buffer的大小。 +* BufferType:内存类型,比如UserInput/CCL_Output/Output buffer。 +* offset: + * 偏移,用于表示内存指针指向的变化。实际调试过程中出现内存越界、内存冲突原因是偏移计算错误的概率很大。 + * 可以看出绿色划线的CCL_Output buffer的offset一直是0,这是由于用于中转的CCL_Output内存地址一直是固定的。而UserInput和Output buffer的offset一直在随循环递增,每次增加的大小就是CCL_Output的大小。 +* size:内存块大小,最后一次循环的size是尾块数据。大小取决于UserInput是否能被CCL_Output整除。 + +#### 3. AllReduce的算法实现注意输入输出的tensor shape要一致。 + +假设rank 0数据为:[1, 2, 3, 4],rank 1数据为:[5, 6, 7, 8], +则经过allreduce后两张卡上的数据都应该是:[6, 8, 10, 12]而非[36]。(注意看初赛题的说明图) diff --git a/img/offset_calc.jpg b/img/offset_calc.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5d94577d844b358aeb4b5a5e606a1da5410c2180 GIT binary patch literal 29402 zcmc$G2|Sc<+xBgjC6pygjVzHhdxViflI;6dse}+^%Q8h$Bq3y-vhT7^gehxCcCzpL zI@V#f@9zIR75cx=^Stl-eZNo5?>EbRU+a0D*LfW0aovRPgi(P0vZ}f&Ktx0YTm%0B zgaP0JKtfFX<3IRE3jQafBqJjwC8MICAg83EqM@OtqNb)je1x9%FvDSLYIr{HpC6n`A{j*?rs;4_nM`znvb4Hv&dRLm^LSlM__@$sJ)kd%^^IV&r7;i8JF z+9mbNdin;2M#d(l);Dk6wz*?#cmIK_o4beSL;oiMPXnJl4+@WX9T^q~yt<~guD;<*V^c?GS9eeE_rCtI@rlW)>6zKNdDPna#^%;GdS~}X zzlZ>me{}0t&kpp90qhqsDJcml#gBdw5xavg5(ZMTlVapY&g)QIcRtD`{+Nlj`{tMdiJYhzx&k>(2x*;lSje;KmnXyMl@bA zg#b9i;`B?75P*g*__FA_9S{}7|CZ`u1J+Lzx!_L#W>4iAC|Kjqvy5zhCIA~pjrf72 zAbw(A0uY)B9kiP67QTo?ZSrA)vHhuUiT}D6iuDtME%?D^xxiZ?`18!r&Ck#c2Jp^| zIJ{yqSS2#HtK=xGp$oPwvu^rVjl`L=1YnJ_fB@tjA^>lL%Jes2JEX=0pqG>Yw5b5r z-fHO3v5_ZWT7mLA~>BQLLsH(Z^f6U9%$3crnfwY`1h 
zl|lUVqc@hnsZ#l3b1(z#CenCy#2N>#-@CbW#jtP(;S{qqII_S=0K9(;XKMa)0$|@7 zVtEqnKE7nC0qfVRv7%MU$}zQhlIwl$M1i{y+~&w3QM0?@11oN~NY&n}O}%eEB47sV zhm0fY6fTT1A)YvGw9*Zsw`=4AhBF+6YI}MjE^?92(fchGi8BU&HMsrYa4pGYx&%vT z&r8cVmCa6#?|gTlo*T#sriHC&P1lmvBGHwjS_L1Ct z)#pF$hlCWL^J=ybG=84uclBdo6Ah{2_3#=Gt<8S|KjN)t zKhe184TU_D%%}gHrjq`p?GhVbknxQja*kWh5!=tLBMsK9`sl`CL3nN~BieQqF?Kse z(ssj4r5ptd_I{SmW-jV1sAlWLku}HrHR}-X>m2)5p?f7vEnfD%X5NEc`Lh%GY0>2s z>TAV`QPapF@+!Rbii03t!3zJeg#aX*Rz!?Lmcmx;$BaT6p2+J>d=!?bg?>C@`iw8; znuxECP*{Bo*bCI_Etno?-9jYrgfu5 z=x7T|J+!c{J~^jvz^*q@?qk=A`~`VbpiGCt*Y@`jvR|kGU{!YXW{-=*eW|Pcu{6!H?k$gjHM#z=Y;j1ipf^Uu%iroW2ZinK{Jk|uh3=4A!Fnk`r=V&6mde+F0AR5Lu+eic)B{f=02_pbd=@Ind5v;}4->&Q56~%Y?<;I-bEfL{v`PZNB9D61G z5;Nbsw=A|jZtvi*BvoN9!@yCohJk~U|6@>Ous&+|37FKsACvtfoSmxn@WDs%Xy*CE z%hx1nF0rfmY~~*Dm7eZPQmgPh#$dpuo-=ti^81#K_Gml|E+{DG188FQm7CcO*d zuWOy1>$>gRh9q`fc^;L>hN642=^&$m4jU7!t8ClUcysg8rL!iv=>jG%#M3SSDXZsh zLhdC6qz$dCcVYv5LmiA7m%Rg1eOqB1SkZAPyX`I{*qc4gL#=@=vc4c=q@<*_zaD;H zo=1bJpt^pRk}_O9oMpN%?EL~3f{U)WwYqJH=5>W-&FZE<#wujYv5XIXomO{br8-ua zU8!0VK0vuh>vR66(P!QXEwg`6xWmhzx~6+4Mg*MZw0;L{|CQA_#X}wm1mMmHqqh>< z%LrUV7FrofSHs-ry_sNROD*ox_U8IJ1s}9#xFY`(vH%%U#q2>n){{L(>r}IAR2NEZ z#!kJX6o-+pD75Vx?wP7~;yg{pv$lQOoTxo|(Ud>5Jlq2MO^X0HhWm8BmrPkE0J&Vq zt!~FwgUrWcpK~;mtGCnYdr&bgwe*6mZ0ePWC$C-$eJt`bP(SvCX1co4*x=`5GuAO#on_8*X@T;X4)!oJRJX$M5{Lc2Z{ePCI@H(oyH4+q=H@uh97^ zCSFGDzC}Bq8}FVrj&Z2_pm=ZS&9TWH+s>JZ7ltK!9nfoF|59w9vLXPjEeJdbf&j1> z@xP_PJj<^rh>N?wp0DACUj^HivfmeEF2lM*w>t@dGLtZ%xBs`~CL{rX$2nmsobV!f zI!dh4&ckuRO9vN$a+5%GpkL?wm0T7^v#ynwRL|M&V5dKRNy^NLV6HlD+4-H4BKvd zd+1I^RW}88D-^%W_^Tkm3cVYFtHV3!b{p)iE%kQ|#rjru{&+7n0^mcX__F{a@=yZ6 z84TGSh3`^kN5WIJ7#~{1%0J6{|5o5D5yhSHT--l-OV?CKSsy9fYv}6jg#4;&r*$hbk{6x3im}d~Q0ZAOp23^V zMJ8xHPV`kM3CQTRZXf_-x*qKKC#`g_(JVSVS14LPksREsBx**`0$>#SEH5u;T^4?S zz12Z)2R`fqU(J-K1+Bv-5V>jqZUzi$SOF~(SX<243YO_S0jM=HQsHR*;AIE@ieGl^3Bt}?u%Bl?RP~r&!@L3;%RdIcw^A!_V znu%>)k<~gl0?BwLzQ@SfE34ahs<+_oDVUw~(q>ShL~9!6*>tcZ$ieZkuwcBeIJT4p zlfZn_J6O^n%(_LqRi&-!q_J(!D`VpUbTWN^e|d?n 
zu6j%qqr*?A=n;TJ%JE_(`0J*w5851>^+p=)t{Oi-Esg8lNgnhUfR%S@{a{xU+X=vk@*~cDOSoup=DDM2@TEtR zpE8AHPngn#)2K}}PJVzP%D=@}QIDv8 z-2E>RkNI2Ts(ZLJ=w!AxXtnP}K+l=HQNFAAH(xg8e`-B{D{o{ z%a`Rl3?CRC!fW9eHqZ-dn!0-WJPcffgRvd#2yD^T3MXaT-5H3E*^q@`z9Op$fTULm zqmISQK$tlJ$cC(>6=bmjxyIH}wAS0H!}g5K?dE_@&W?dnDX387dYJILlB zY&IqPHSo)u>X-bCCiRJg8FiG%e!PAG$hZ3)eMXPqN-=U*v&u~r`w@{1LnyH^8tjWV z_wQD-Ko4}1!G{2tLkPfmDVIW5;}z9t56khPh(_;#$B(&fSE_RN?@VoEO#=HI_&#S( zB=-vzM-?4xjP8G@BmlJLz3A3NPTxIUqv*7=W9DQUiBj$3V)v1QI~MVj_Za31tkDn+ zdY)?*P6R-0&UFK_cf0Xk!CVrq#u}nwGtC@u6mJh|s$LI*?39~&X)k~zkzcIA^`yV6 z2NA+FB%ak$2*r!=uMo>xKom3!r^VUx4mJYhsuy)@`FV)%oeMLRv@kvHuJXQHxVvL` zZTQ@q9wm|hR~jN9EFNMyi9ZKxWI?s!9_N8HfSw_|MqStGX%$nrj0v~HsjlqIFWilm z=hL@%FlL(sfZLrkQ`N`Jei?M1V!p>c2!IwkZtRW}PK5o9&~%isfk?NZl66>EA=4Xc z0q54sbM{lBB(Q4Zk-}U49!euSD_Q0x6ShQ}a|IavD(BLn9&?|;`-$=6C3mZMO|f!l zzKL9?nnWQK;Kd>34?H9jP2cRvY-(F0&7$7GsdDD{ae( zhsp~BWVQy4qAPE?x9#2tLeJEkIE8xpd>nq*j?EmxDrL41{QXAHNz~P70&t}4TG*qQ zo6Mz5AWnR9{BVvy+e!R)04UB6ROXvx6PHLdixr4#C?G!2?mog`z=&qSZZ#co^@uAx_cAscJ4qLRjn%j#ohsN9ee*KGg-;g{J z{m8EN^^>*iR=y;@E6e;4MyrGEXd9ooJvukh|f zr3}k}qwweTr;DE&_?oZ6p&U2<^k{o_KL5S3a&lP^X0c;>dN3sLEW$}^dzM5wD1BdO zuC~{?0Om=V`@`gRC6?d8?H=Oq5$_L5FcEAlXYk7IC;`|#lpU!BZV#_iwsH4hIMVJW zfIle+cn+JG*V<;;Dbe^%3S{2-i%(447ldkoA?hoYP~1H&9H~JQAUH$UZ;iimBOm$Py)|;El`9&LNsaFD`*{o`YxN8(CQM9f8JRO&PqP7G*mi>X8yi*~og^t5t zA9|x5zvn1WZ+Rnpxa-={HJ0TlxBT%p)(DXLbUOZKZJbnP>SBz(;OACZZpEZZ#~??| zVVv^WpfJN=gk|(FW~J)mO6KG+En0oTfyA)$X)el4uUC>*!T)?3%;tiV&|?pk*wG_v z84wQ+$FqD)omg=+NY7AB57g3wU>J4>W^XSeSLzKHY+8S&Xf^Ecyr-ebZp0MWoK(5&q&d) zSRS^Tjz0>28s&Qg--Ti6>u;&*+k#2BzxpzA|6cu4Vuekm`-ZNrUCRRmUIQ_BxKec| zu4f55OaQ{x;m<4hGCk6_QhK2U@)z!pIoJ(UZ>>0Fe5SgZ9n)uWGXQVG__E<0)%h=Z z4K?thsp5xLL4m=p7$nO=)u~cx_ub+!;R53`ZVfc{kuPXwNH|{|K4+Ak7M!k@-I$dc zP&glcQFwUsx{ieEVP%&3@we+QCZt^8GUdsWy~6|2i6Y5`{)Oh1`AHV;CEZH}qv{w5 zbf9*JWQc?%I>sya`)ssf(VSaLni|c$9FfC3{g|{HLG%(b%~5B0UbrQmLu+Pv1do%4 zfFu?jy3mQnF3PaJdXk#4{N?=jFFaynU;16>T3YW~bL?s|;!^V9AnogmxG1k_xICh9 
zH`iv$9`?BlwEwSDf9G|&pdjmqUPng|0PD*HU@rS=UOcq6_1zZ&FnkNovXeZNkrcF| zIJhU^>e|EQ{Uen8zL)PS(_e&5ENFvq=>!2#;RPefB+wOynjibG|C>!V&pS)8w zCLsW4a^=cFJaSYqkedK_Mx(8!Lp_s{es)*q=Wyk%FIE4JuDi{G?-+_bd*wZURtGnx zyQgmgqQ83r`87^}-(E7ewaD+(hbme&`XbahEqbEG{$sK=j6+egb=Ox;tdnX2@1^&*l|HHo0b6{V%F!&Epyr12a-v0Rf zFVp4v8+4Ur`j=g%Cli1#Tn+dSe0V>BDZO?uoL~T_&HN}B-7~p7qWVEb^cJ_$f~g@- zuk3m0Sc8}j*9?`q#RZz%7o=mA9dD42Gd(+gFTJg|^LM%_MlHs;x{}xOb*79x{15@C z9sn`;Vc#DZJO_fcfUK5cL1y6s9Ap;kL1ux)%A?(U#K@u*j;~I~E~3xjnMXhvejhX_ z+G{^ZY>-?)faC%P7KCa3#!z%Qrv5-KOEK?b;1&~f#-_&VVBu<5VI)Xf%z(578)%hv zy~C0^o34Y<_n8FXoZ{g>%7%_PpxN<(P>?3+p~H(*Eym$)LRR27(yn433wOFj0ob=z zf8_p)LeXZvKe&`ukV}#B2Z{FliaW{k3m;(D_3&=q#|Z$n6UcAK{y-Vz??D%7GrPYHuG=Au|7CwF z8@A-T7B_RL$C>a6?qs--h9C*r?wnvAU7|GG`OAgjhgpp|+)3T?-|j`BxhK`n4@n>? zqG~8~x?aSU7q|rR=eAz^f9F-xr$<22vjl-Bl>zxFogBo7Eo{Tyd(mpg!^w~U+$i4T zc8MtBU<5$1X9P_@f>k*Ia@$yp8Eo_d$Toq5Vq^EsN;!*BFE3Xu-Zeuv9@_NMrjmW7 ztAak}cj)z+NUH@2Ql$1ySV||5O9MGzyiYE2^&}DPGS|FYXV5qK7(I`}TNhXhDleMI7of7o31cq@*qf)(`7L*QHq*^4?H zo{7U{`{QBYS`6D5**hr)9VwM;7H>4X+_ZSln7WaMB$qbrd^}icm*L1UykP%;3Y8t6 z_B0zQS%s{7JW=Q_rOwj3L1@W6SSuR3rug>25bbSt6&MW!bfawWe#11Nb8=iE08x^# z>Une#==Ge)VPi7xU}}v`q4-?jRvsP5S+)I%iRK zogMml!LS0fx}B|3FeUi(2dO!QfxUn+EhY1pBmFOlw6+F{>=rRY*XcK(I z0FuJ$M{$!bTF4{52OYB_m1E}=cc#6TS;Jk=r_NZ`djh> zhA=uRz=41yb3dQ^7dy%DD}9}5_bai}EewD}D{wF*Ik4}SDUtmhgXi-%Uq)IzPISAD zU(jnC2P5)zx^F+orc?515=w^k<{QacX;0|T8xezKYJrq^13s-Kp2m6u}9de(!uxg*+e-8J6;MnFCYBM zL{)l9CZ>^p$aJBlAG5o$k%Yk;cLcToaB*iZIAyT+2$L^~v)gd0bOv!RstriWs#4-1Quv>$9 zq{FZ}`;|B54fG8stQ#Vg@2@@OEbaMp&WwX@aT6t2eP}iNvz(wu_C*Wp#WUa{ut`vb z6nj=mdwL-pTaVS-^7Kdcl*5+lMdA}=_>Su>O+H-C&N7{SXfd7aBR(CJ? 
z-LDO^L{c-m%m@^9eLt7|u5+6=^t$IUxK^jsxzl-hJWn#jiaS!;bFS<(8mLPX%UXw7 z7}Y{)8z&${=wRii=NbE>;0h@&GAcGNGWZ5~j95okUih-M%R|zQJ>v9AS9_m5STJ77 zm$XnmfBbtSJuz4C%9R{don#sax1$!5ShwP?845Wo;OzTtHDqXW;Z|xD&UH;rlni^d zvqI}_gQxHHHm6=~AFlEG<+1z6=f(7>#2VgQ75@MfzPcw%VsDCadp_1xCYP9ReXXa5 z0N6!}IO>XrIreir<)$*5YMkIZZa=eRf3fQUjJyU^CN()o_b)WGd4x|NvL4`#d^z>p z(tsnAGGmcW@Wb+FQlslh=FdfTvv$Eg`e0Bp^$1b4W1Otcm!Gv4YiRMN~UY@Mi2K3z{;^1(|@ zwptdTzbD~G=H@yDEh5jp<+*BA6LrSov(#3c)0X|f%~z}rw`ls*S+ZmbKIa=|Wxc-L z<#u*KML8uYGUvOF3RBoKqYPhggF?-q`0(|S1fbI}!P3g|kG#i&G5jf?!)o|-(N$#2 zi6xLW^4yx-gRTEB>SRc*Zflu9S0-Q~7fV427Au%UT5GQ~8_w1Lol^oZ;t#=A!`QKR ze!a6Zh9An|V?jj@0y_{IPZixSySO^=bCje~z`)Rh|#l{6b9M%by{PRub zowrokj!0YQXiPAOZrQsl=dFtPa^Tx2WIf2>{#5Y+DqA-Z ztHb31jUop=lPBg@gU^!1S&KX9C0@zU=Qwu-C`N9QT^vDeLNFoNo;00l!!!BnMyUdt z{WI0De~&ICjbyre@6UkgvHuV*p|T$SR_Xl4BfbACJo2^@T`)Q)G-Ia4;bBLe^nxqO zL79~S0y%y2$mwf`#Yt#(UzTjg|HeV7KfqUynx_3X{G_k5?Z^@DY&3{I$Mb6z;y6cJ zXiM@VxbyGjISroP7VwSi`2&tksK}gGOpa2&dUe@pS?xbSVT0vH*oR1*dmI5!Eiq38 z@g@8}1&VgR6(bNMiy90Se7$H4dJs@1AHc*5D{TC`<;#%o`UD^WPI0gYUha#JeXR5E z!0276slVjs*q{`zmlwmm|B2t?ImF+eU8Y>JI^VrFr)a7i?q6=>ODFvFaB?QAjOz{9 zPb&`n`6e|9q2QNWSN^O)bh|ev1UI&#`^#7*_v9{tzg;CXn6n0XqK`$_z5+F2ARK60 zZ&k1x`DxSI;5z#)sVh@r@cFe2m|Dtni_=A`PZWwRZDt$%%5Va|E|L88PD+^Z_c~G0 zPyv{JOK<&`I$>>!Gx%L_od#$0Y&jK|VaOT=EoIc^N0!>i1DLe-UXZ-^!&{PlZlJN- z>6C*AN;9OIgxc?G`>1z%;)z_Ja;II2K`@oB1A=@am!R~mIpi$lKA6N+0hx#PeV^J$ z7k;L?vrJpf`l>o+7Fi(99zzf?4XgXFOX`v-@7;2Z-!@`4Vley6uE8C_+Mgw^MK{Y& z?65}-ELiOgN%!a+?+qx19~ttJ_C>9G73fuXoRM_7iKN7#PJ$)tuti-_oh=kr_iAalR!GEvbhBMNj_d{pQk%Ox?WzntxyLuV?Wf*mDd_4Zv)ogw z<|*REN9(_|Ym)lsxYUlyJ3`AxPn>U%`s@vIv6r17Rzu#qwZr_CPnSVQV9YF@Nqhoh zC>*DDMOw|Y234Q#JeL3Bgg|9`s`NJBr=sDZLe5)9YXqr{uDMWOYa&-WKc~MKjx>N_ zqvP;IBby#gE~${`w!WeIA8nEFajyWw(r%#@k>w?a<FwB3b<jcNzrtV6@ z@vmQwZ|fIzTJ||EkCxbHs#H)*20gW<X~uuf(0-cMaK#+8@>G#}K?cu@ zL{wGuV$3iU?*C^4F zdVOOR!hRXC6JV!US102|0OG9Pg{7irlzQb=GG^u6vK@S~ZNC&+nhjK~J4M~1T0Bb| zDXZ&#SRCL!7de?o-?{Vv-Vdo`Hk5Ab>-#)bDSg&qY2pxv9q$Js12ur?yHRe3t%+1G 
zwjW2COLl0j!JcnBg#c(L1qobHuDr0N1;*;o=g=+5e0*&2{Mzmx0*3t}1v~zvkF5x< zJ%T%?U%G^UoYQgJ%fJ}y4Yhup)cKhv%>11p`b+cnL%BxvE8pWuQBT*87!&;jWiXw! zJcqg&%be|MCwd)$VOpx6)hHw%qH#=~Rz1ci|6tAnS`7=XqC`);twN5%=xfGl`5~t# z15#dIif1Of8B*QII>9OT!jJL`XP8&h!ZqK8qP7)KM)(WqDE@i}=2`Y?3)Vw}kxUy_ z0X8+vrm0oQ&d?xNu_WU-rM$3jsE>jgmA!|a$bPb3v~T-e7IPK>5KRo-JYR%pT_*si zG*o`UpI%m!)%b{?F9q5fe+0te;U`Krr)uz+HzqY~>ea@j38il{tULnofchuW4hLc5 z4yTK5_P=AL_0}g)Hg5{lhUSo ze!{G7$)3ry#M+W(tR9%Dv<8#R7?NSwY;<1ePY9Q1H6G89UPH%HbdH}*VT?ER8pSds zU-IdNKRlhe(dUoaNJ?)S$iok9ITwJ*Yq$gjS~ti~@Cj&Go%QX6tQk~+h~qnp85qs% zTKF*iY)bD}SV=%yCSPrSU~sZ!|Mwx-O?#D8WNG*km1n=Bl0lUCVdrnl4gT>j=Dlgw zQS)fi;4jk|d3qR`2Iz*^4}~o0QIG_6fb><#f*Ge^Fyq7tCjb%lUlB1oGd!>aU+ARk z1EbTpxC!{~u?jF_=649>NTov;M>xPeu?|dw(Svs0SZwxHmK(c(A^=uErLoV-Apa@*`C=p7!B>N zlyygudP=HVfC=Ccy2Hf6XZIq$?%!6vYdSsnt!FD3HHWyGaE@as^pmD|mxtr&iVOK8 z#%P!MbGfIQ#yz~VEgl7EMe*9Q*iy;Y-v+LoY5W$IyE-p?P8;QhOdGMIcFyU0U*0jn zpio(xNfg3C!um?FjhyMzSyu95XpuJ}_L#fBP{oBTk4N9l`Ufnu#)5l7gnL}I2YAR&${Iw>to3WYeI&g^ zWdE^Ok-6Wlitl5ugH(c<^2e%D4bAhCja>Sb&kucvcsm+Ax=l2}|4D}89W(6TH7Czy zCc2q_O;wdL;!@UU6z$PA_4z zH^9^?B_EjV?T;F$9Nvv-9zmxeA!{NgXo33X2Da!RT{Hg-tG*g*p)1W(q|XyxXAHzo zUx*KvyC>*drghMgV7JjT_@|;^!qg9j*G!#qxy8iiE`{FcEa+}GPd`xM!D;D0#K?!? zUD04RsJr)1O6kzSS*t{BDV9`N935wycRw>^MaOtUJ!jm@BM#5|&{vE#?8^j6lq_eW z#N*EMUJc0bI-!fXJh*oIX~f zB)sj>s3h2R4`)sZGo{CXSYieCXKWV-YQ0m(Y6Ibp0V! zJh-H03=LZ84#Z{{)K6lRDkPQ%5RNVBtV8iz4N?=xqfqO7`i0o^I$`S1{nK$t9@OJx zDel|-KHt{1*uJ{t_<2}U8YIOvQ42)rt>3h{WBLqJ+~L)4-?8q5GV|OEl~v1ATdItLTDj6ghi@9;}N5IJrw_aYmM^aEp`r z@n}CgZm4#{ykL}Sfg8<)>TF09LR;lyl~H_tVcy5{cxSetPr1s1hRvKwPR1k}2(2$) zdk4IfvY6p{Pdzt6;!9@_rNKkcHOk#0cWplQNG+M^BfQj)CmZLb+Psjxse6o^{SoQm z+RlV7UZ>}$&3aayJHc%4Fb;l;-aX!_$Qg;(g5eN1%O%U<;@G?BIj4yc5QY{(>gbM? 
zR0T6NJRhy}T^IC32ztQBL|F(IL6Rtz-E_cA&VA z+marmP!?;Zd&q2{(M>X#=TWwhI;VlTjdP3V3#&qz=Zkn!?BUe_+_b+d^ij$EOnc5| z!GR&O$IfLZznZvQe0#mxzg4j+VfFrKZxEi$W5A$Yk{WkT8!h{=#q)Zq<>2M$ewsy- zl&6D~eYX&|IRk8z<6cVmS-o4!iagTSr0G_YyfbjM$H&w@QGW21ldthe)fxd%@S5wG z!-ROZk2}Z>)Pd?%FkzWf;(UB6{b>cQbWv*mR~aFh@Zo2ct`ulLfk(~_6nnHpI?Em&PKdWxk%%Z@J{Z2*Ti1O+wfkP3aqRu?bl@_7t>Q}digVaU)EdRCYR zX5fef^xOvy8n{aqLWuh?7}&gsvvRTZH9aySIVG}%<%%9}Gx`Rqj<60B&?ENlcnTEM z|K_Spjf0`5WHy`~Oa!Pra2$QLV^38_KXk{Dgz;#TlJaM@mE)AVTpz&FVtuM|SA*-f zjGTpf5NyXcbn8dyo7&ED`Zrv-WOL!|`Qv&1X*D$VjA8?-&)H}MGPD7b_e`KCezLu8 zhoqKg_qw+vT=y-+M)KthZ_Ha|b%k%5t`7jCZ;;{!Y@|~la*GU1Mr`;-TdDQ_@tuhQ znsw$jWVuA)Fjo+N{u%7(55d|O6w8YTbQE#YtTRbPu&spAD)upVskb~~6d-%lGIbXWWGbOb) zh5Ve))k|*QIyS9x zSB#W-t=^M0t5Dr43poaB-ko>DSzOiyk(f`*kZAABq?aqxGy-;{_iE-6*^2fL+ zhj*2g?YX(r>-cZq>3H199p1fnfA-ZTG};p?<(|lhYMDl&j@HLAh7APMvI<%D7)Mq# ztS8wxOcqeQ5_@!ZGfCUIa`&jfksy~@WDDO;a%obP9ZDuhUI#^T5q&N%cR3#LN$F*=Fg5Vj_T)B#>KhV`&4fe_|M<{OE zY77>u&>zGPO7gzs{XUYYavz@qtAg*+|5LrZ_q0pozQfdjz)yi)mg(Lk_K&A5{K)ZA z!TR4502h#~A-VQ`y z&4W!z*?Wav{?B9GnRkfcduy3ioZMz-hT9-)9B*F}Rg8=AppVSd}=U6*QJ5-Rk38s|}{WvC}%<>Gjf=>XE zE~vJ&$$_;6j6*O4ARV@5aWP-At9)TQe8@yyy~K6q#5VS;D3P~3N~;D(r{1Qd zaxIhfDdcb~M_~r^91F&leIVR8b8?0v^`Q!QF!tZfKxf43gueI**^iCcyUE%AfsxM( zsr@wZjnF>Jt#jZB8LEZTe05YXcWN*`cx9L{NMgzQN6%cBAq7K|{Z9i?hsV9BL&R-R zS}nE@+Ka;`G&(;GTDxCWYNheuJ}XO-=2sjFWAwl)f3Gvy@8rNEq!$z? 
z^q9}mVZ4uZivlaeS=#nw448+`z3PD|f)DKl< zQP(@O;BnoF={F3)wshE{^R3MdmrM5C61?|BBmaIU_i{!*AUBm=MD{{Y|M>x2f7>s@ z>@DTqHn;C757Z*&s~J6-3IT;M*O3_1DIgAV^m8nUZ(+|6|yN$D$swyCZnUY&V@y`DE$#e{MU zPBSwewk@xmbZa^NlGD$FI`ZfRPR;qbFh7IR@^aDSJFp`aC&#@u3)(9TlI(hOQtL`y zCew7#%Ozyhj%rh={~hlH`I|3q8I9T`yn&T>ZJZ6_PpYU{e*~T_BKayAVr_*^&$rJ{ zv+qP(*O)3SmYZu`cOs2;?`0U(;Fg@@;%#KeIqsHup@+BRGQ@uvF%IfS1(R*@-uW)z=4t|_D6u_@R>t=l!zUA&}q z`F$+LDgJ`GlhD27FE8>xJQXZ?9s1@>ok96(UOq~~0;Tq>ziHx9VO&k`2}t|WCaIW~ z?CYrTu?hLwlxNl1W2-qPMlS~3g6!>m<&oHEt+q{B&M96&ep{P#*WT=smG2w2-uII( zq|QDJ9v*Ksi#qmTM?M|5-t;l~*g)hL)}2QU-k?AT9&!d9SMMv;k>}vJB=o>UZ*g5Q z!kuBQA|U*Ngq(xnpqX7lYBVYNyoX20aV?WoWVHf0gn9vGb*DcqUh4LT&Qn2x&{NW? ziq`xljHUP5qqJptZrZ#p@SC}S%CAF<-W5&KH056r%Bzddbr7<@$$<1`X*Q%uakx#< z#+I2%CS8J%sxmKy!N0uG4k)`>V(VSG;A<#XDKnAo=jySsvm|AkT7Y~IG@>Kt)6`cS zx@|2(E_S5K*W$%zfA18MI#)JW(1S`ni|%4CvwRudjwh;X5aGbc>C<=cH_r3{iHE>m zek(tf>O7&<7g{q+)*&yO4a&>rOU(xAqMn>;Ipkt2N0%d>xE z6~=WdFeIVaC+}k_Of?RZ?{yU2I z46)1i#(p*KZe6|mc7@89S&zD~T3=IDV}CswdhecZeNBGCzs0s=Pos~^Vm-MYmczc9sBSf5ACz^x7!Q+0Roxm=-Yq1dU7nhBUf!^Va*YIM_*dQEYo9j7{S(@QAF zlH)Ih{KD)JJ62Pcq694W`z108T zbK(r1U}UN=iH*hz@IJ5VnD_?!P%-9X*1u0ewPuC)YTOY;>r5iC+YxJ4tDyXe24^U3 zA%c35H>y0DP2|6pc$Uej5w^3!H;Ek6w%>PzU+H%x{6CA*FV-eA*jr)h&tJ5EKR$@E zA{u;9!J4lIo{4O+G2Nuq-T49PFZIK#!`(XJ9#-d#1mB>OE@Q<}&J?7!EN{5+A1VP) zn?UTLJa5lc-_UVQuR6KFzz066-6F+Ey4jwdE1V?{$WJBOj>aV3w#ZXvy!hqH4+(_m zBdE;~bZq5WgmDQ<@nc3)ScR|3q8pd^F>7dfiE1mBs$A5s0?dg*aF2wb8;Np@v&5pj zpuWEX9uD#y-uajtwhZ$~`l{pS6}mU>NLJd`m<(&!?Hrj)W*@tGLw5G+SqcuS-O{RS zxEwHqO~rzd?t9e|@bLr1O)H=3C2Fh1jWpvwz(>EVj-b!yz-GC@EIWS*9UiKN{9+vL{JHXadv3$1r>;H zHF$6$g8+;QV*M6pkYnUv#Pl{4)G8!(LD4wK>W}PCn;NKu(6NJHec!)B-9}C*zzm94 zi)>5T3?t5&4BEv;Rz#ojiUcZjl*82mu8fxXnmtM<%W4R$xPPDgV;0)0cKlnbUgweu zx(adXoLRQAvFBUaQ(UR{noJ&zKIKGx%ea<+bY)k73lw#L0{kj_gLlbKBC*KBPxBA- zAj3)V#N2ji#6V*P`U%gMb_7;mX2$P~hXg&0Ugd6SSvqe?LX91Bw$sBeQH3$Dc@`)5 z>1Sw+U4U=)uU6spT6Abc-3hMt+|A=>r6akXrD~ujWz?rA!c}zIwZl{V)Ho(m^;bVI z?(klfai6Wa5d^P05ztD973!#vefQngusr&iW@Wq}sSU<=4H~IR(ae$MKbr+U#_Syd 
zpp**5k$Qup8o&&GYMAR8p_do+%rvvz!t-?SAPwUf0k$I9HR+Gvm z&$My%aT#tN7J2tHu0bBkr)olJxE~;k|tg%NMI|m4UJBfeL0%mZ_*tcA9!K-il)>@vNiAJ-HYeO?nQE?v+0x~ zZ#{5OOlV7qw-UtX<=r1SU}T~DE!~2aDZT4Ga(y0(rUG+Rj^DfvwyU=dS>PLAdV&>& zt1x2G|;eP*msbT{ZOe`Av8g! z`tpQ$I%&c#Pq{>A^T>)-`p$`+pAIegr|Z&fx8m@cGJEz$73w`RWBCqNp^#T5 zXsYS&z?bA_%hW(evp%fV2&+y_+Ece~`yMU++r4ChPF^Nmzxm>a9|~2G4)C#Yl0@%R z^!8q%wigrt`-qI31S8$2nj$D=bKrqe4im zCgi}+;L+)yN?4nNjgjKpS9zH^vJSd)N)g-LHYX3tRLK4_@&kNA$&UY45qNX->I(idTQB})r{ElP zADUT*E*U+Dh91G6STrFY;x=uKIm(AE?R@A+A~icTvho0T)!bZTx#Pi4~*9CS* zQs6ru1?%U$K=m%+yYfybm}XhoQ(EOE>vBX%xKg1eeIZbOdg?X(dG2={y+z#?h7A}k zjf$d?DW4>kKwk-jOIW=7f{qcLtg1`pZEhOsRu^-#ZOSJxkLZBNmd~Lp{KstNJEWIbzfrcru8(V;`vhaDxHo*VGVrQ{ zZb!A(QL%xoRuSot9(qrA+Q$VhBWFgZE7?DvU;~|>%;#O{fGv0_%*IsBek&wSz9oo! zsqE}Gxtl#hcUIS&Vl=EYg>RQ4;iF-S?PgK^4|mdceQjDyH8+?chwz8zd&7mJ672Im z5i%@1s!{XxlO!kQ`5Q_2Uc~NRW=;MO!27(%I7R!*tV<3Zr=MJq4^s)7Wu3a^Hpr|r z>f$If72V)Y>rR{Ut)S132tXO(9^Hj*+})getz&fg#5a!U1-|XZ=FI|$XvwllC~5R8 zXbE;oMhJWZY>gEC96W!ng&*J8MNw^q_EBw}bAXJql{cxQl%O9224P=BwU&wU6I!7| z$#~u!&@B;wARJWRhv8qWQ?&K$p*;_Bvl+B|^`@cGdAm2a7!DpU>Vh9wg#UVt0$>>M z`w^FgcQ^SB$I-7p?4*(2=m$G2)OLRj)sYLctHCwHv1DL4_WUh%S14r6YnAX^Ql!6r~1Knp{Ou2*t=1EI(Jw5~)FY z5keIRB>|+@fIxsG&h~z5uIM*wX4aZD|K{YZopbg+Pu}v|&*@DE1o@#eGY)ITcO#6S zj;6m*Et@&M3g5`U^M@2V%kUEO59p1mD>zEVK!^_;w-2Zhu<* zu?vrmo%J}&$P;~8DlSHZj*yDK_&4~1Os>D+ZEoR@MDZxsxI9nZhnviN z0b2F^Qt0;Cv{8j*>n@)4GbJ`|%2TqzLT-G?&~#`6@{GE&IdK<{SZO0%Z7y_3Z+SlL zE>j_L>zZu&vwM;S8IrE?+Ts^)p5RH`m%xjJ)C-WzhomWjaJModyus|XK8PZQ1Jp70 zkK+gZjCVF+5$8Gt#JgB^*`ZwHPG+L=^BlDg2lhnA*6qefu@^z{Q{P5Q0SA>JcSryZ zOz$A#k$X~$nT%h#>2?&$(hm3OSO-)Oda%fF^29Dne${v&*LmvL4gupr4}e-D0_vHF z{_I5+(-zF8@ao7v?VR_maPJc1G3TV`PawG6PdC=mH zLj#{$Z@mc7wylMa3(cJxnA3Q^qUNOP6)mQvo5MyAI-}cpC9;ZBA$Y*tY+8fa0}LeK zYWDA_`7pzb{>UAC&7s}beciV|`SE^cW0gZ_ILB0QI6X_AyjP<>5A{LtI_!q`Dgn}( zDo+qbA@EUbLap|oEvN2z@86Sfa-NBN8P zk@z>nWt*e}El+%fJh{GN8}#QrsIuYgKId);Ljqr}y`O%L zd`pN_UM1aL%#HKUgZ^+l_57(*xD!-WW=2JZt!}mU+4zX9t2FD@je3pPvDlQ%7uVmH 
z91Yffq>4}Gc^s#tB23vO!i(EcJ#kplNsFI{wtnl=R!O{xHst6jcR)A#| z`$7q#d8dQX#nS_QpB&sRB~Lop^FQ3;+Z9A4^GjuUd{hMyUdQi zjtW#RA?J)u8(<~TC76I?fA09K?;y+Ro77ipTvxEqQ${gQodBpQx1&s8s><^=?5*W3 zvZ)q%Q7>SogYbDbzAp2ipiR78BuF;UOsw{=_qmTCirTJ$#RWSUP<;^ukJJ;At&nLw z5NPEmpus`C?ckYYo$wI&?2smgA17G8PE5FD{tgt{GZXNI3pl6Q9LjIr&V>0eb&ln4 zR4lc6_)LrA@RuIssHu=5OH};0M*HI@ar&wUZ$vc&BcL7~k~I5n4LE-+rU?>Ek?pO} zPvTnJj_@)blA7$-vHmQPYdXVZ$`(W&gr`1Is<52&1QcFCGKwps$lD<)o;qM!F~otG zJBDEgP&6hU=-+i=_2n>_ltUaywzA~Dn4ljzoEjz35_zZS54w*nA*gNV4L@tfLs8Wz znin4LsNsLW5d=Gb>l+i=`Y+-$mD_aRQEoNJ^^kMYCF z_g(g%H0gQ*FY!D%iY{47znwM7;rwvZ;_NLy?E##E!&8t9^+OJJ5Uu4-_J+tUMO+W= zOW{c1(e#Yb^Yu_*jIDonJM7Y+eAu!{XF+TOLCpTe`&V+S-3FD2y+wZV0c1vd6x44r z5xXnjs3NQ`=eD%0bX$qgib)5@Q@U{UE*5_1qpD)~6vyeFWuv~8PjcPwYHAsN@Bc6Y zGYV;9%2Vq0tTQr7T9w(#c9q%SH{Q*4{V|zvn~t8F4z7<*G(}IDy4PIPiCXc(;2~Qrob``TZmuLNM1TG${q;_MQEG;Ze97QVeZC(2Brr;_umB+P5?i z{b08%-WxO3hb#Q;I9gC$aDw$bB79hGRsD*PbA*BIj6a0n*YdO_iBk6*L&z+ z0#FaD4S1unODj@R`C;?oKv)u6{Qk7=Di15dG2*PraH^5Uoj+eayWnK9!IwO&JXdOE zcM)zDn4~Edxpp@aAJQpIM1_pLXkh4ir7~!oVR+qz8@gL)XXl=koiaRaq||b`b>D`l z2)I=b2K^jsblF<$OfgY!y9Ukr@sXW-nL$qHMo)8I2p{95c@SfnkbaEY>4|Pn4u6HP zE@UW@rO>|UonmF5O~0m{d}bzfnq|w~WNF=bUfnzd1BHH(TP8qNvbsJ&d@0eJrAz2_ zLVuSHdw({!Ce^Et=<_KbeF;*@TQK{Yjd=|MUm^LOkBwNJY?#e?R88Wz0gKY+tjWZ+ z+!$XdgOVVpaK(n~rtw)1>!;s#s%CQb9j>qWe}k`rgr+OCfeI_s-hT&wDr5rl`) zC%IN6UW!JkKCLm~=vjGXpoFP|am!`U=3rMR5O^{87IZJvDMC-HE@2P#u7h&?&El4= z#Tgeev#Ggae=U9HM8uVjwLdwO3p(lA1Pu0`oRBoJ-%FakOhs6ZYhOguJFA^7^zBfu z;x=2|OjAEQ$Sp)4kzdktx0uRKkaLE!sJb%a_)RH&*EJG|bPF8>l{)k$o`PJ^{xSQ% zSEH`fmF=jr@^VZIw;J(pyk@QidlEj$80`KRsUWSE)_CfRzm&Lb$gl0>Lb85!rtM>osc?ghxBq^4k15o&UTi>Xz)w!_$%K!gG_iJOe_17X&FSUi3@2HdQEpJVt zBi1M@YZbF|znoj8D;uSeYM&ItW%VjH`FZ4gs;qBe@f!#nvL}XM0!hKiLd3%A#X5#M z3-%ix$s;ME?Hhg)uEqZf{(A@FMslxsz)p=6F23lI%5Z>n3p zbtQo{Fb64D1;~;HFFo*}aYAeg@niWefx#{$8IrTdR{T!P zoTn&s5}6!WYL^R|_HUlXapDI`m#frd ze9GS&pdMcn?1_E<`74${Rwf06Js{RaxGR0y&>e% za7n*OL3-WjG%CgO4@E2k4V_TKwz*D&H>&D*t38}6lUB&C)_2Oy23&mW5x!$2l+J~p 
zFIAM%OGqG7Im?iEyMc*M+(4hXqy8RlilXh^g_@^4`Q_^p3%HSN{SI%bncYDFKkNoE zb1&Nb=72VuAX(DN8%;ZNd@Y<+=W+)jaR)sA>l6*-Nyv)LH>&abNDD|xOM$9`FnM;Y zY*S70$j$tLcYFLAz`>cIQ~Luuw3Ya=kMwVw8n&M;AM<|N+;xlkHVn~LfLr6AHT;M3 z|Ca;K-!?Xs5s(EVmmLmB=$YF^#a25B5IYojSHTu^sBZbnKY)q`3*xB>0<(=GQ!EG> z2bN6~L?L>86U3zV(qCM3ff;v#Dm?4vL`vE-_D6<#SodXPbfqL@et!($U^2wM!&ZP) zcMcD#v(pj`G{qc&n+3Z6-J|T6U$JDc8pgawsRZdq7ezrC^d|oGwyO9SV&ABK{3y_{ zWJ!_x8*}N(G1uYUF0cGVu}n$p&Ap6Kg9GSi1LVFI+}=V{5Ne`V<9%5IU+k2@X=_iH zdZfoYI7gqke72~c#|DSEaA4y)^!FJRb9jFGzHk`9siFRS{b@6u2vxPb`|OfZD-|Zf zul^idg^JKr{L#0f5W&LQdin6kEN_VXs!F3LU$mpNq`aMaC!QwV-0N;c zgPOL+u@{nue{*p>;wgk}q1ph{LfoL(y1+Qp&iyj}l;@7o*;~@dSH2`a477@}TC^~Q z29q9wKEM!O=73`xLl*1LSAH`e|8aWIqe`K2Senx%EatC?s`9GMHj)%G*k#%~2IRYw zpCTSbG~)xcpfuEXb$<;Z8TsgGc_0a_;)q^x0F$B*?(}}6x|;zks?eJjM3FAUM!D)4 zkRscEmvT`AF%L3j?S6?8Gy$ak*e$?M8Ep%(A)0kTRa66{(lpTIW>bEpM78h5O`cJpev>$Msdup@X;CiAaMo+_;)R!kkF$TdNqjhkKNh~(hLDT?F}`M4v*x}(Y!q>(z+O83w28TRtX{nb(D zRqWzivpzoOVlg6d#UVhE!VH@ArOrE*z0Ic=(@dmm)58Uym9Ag2Owwb}W$@jhDvnn; z(|sD(f{m}p^uE$dQc=&hW30@w^0th>YyFX~<85qboKa{>=ZwH3_m5<~ddqkyLuUSo z5kKWfiFx#~^p+_`#*Nq}Q13&e)$CO-rsK@eF@2IX0 z6O*=2ejjeyAYiW4I;pjq2g=Ad@ccItRle1kIMX*w9#m_5RmFJ=#Ys^iF@Cm}jf!`1 zaMzXSHFb8wGM7rqI;M)iRvrJc7+$Tg{?b>--Rf3(-x}j=VSK7(1+#j8&Su#?U32Tl z4KRNZ5dmr#=9d**pP8P=cdm!7udxWgqhP|dq}jqNT>kgVl^t#eL_wUnPD-HA zZsL9&Du^7es+DCye9|^-j7yNAJ$nCz)f=(M!0o)BIaxI{*|X zkgN|YAXKzyCLHr&XvOS8*qGB+?zGYO=ooTHJ_mOXHFSScy?>M9r`zU2AL^w%?2r6T z7c0K0TF%9@@4shMd`Ls(f9dZ6F!AtMZpw9G=GT8)tpab#xL%Jm`w4G6^u<1MDR?R& WpK-f;h}M7cjfal{zLELc=zjnQ6Cf-A literal 0 HcmV?d00001 diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/CMakeLists.txt b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/CMakeLists.txt index 0c48f81..e8a1d66 100644 --- a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/CMakeLists.txt +++ b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/CMakeLists.txt @@ -24,7 
+24,9 @@ set(src_list ${CMAKE_CURRENT_SOURCE_DIR}/coll_all_reduce_order_preserved_executor.cc ${CMAKE_CURRENT_SOURCE_DIR}/coll_all_reduce_aiv_deter_executor.cc ${CMAKE_CURRENT_SOURCE_DIR}/coll_all_reduce_aiv_deter_small_executor.cc - + ${CMAKE_CURRENT_SOURCE_DIR}/coll_custom_small_all_reduce_mesh_executor.cc + ${CMAKE_CURRENT_SOURCE_DIR}/coll_custom_medium_all_reduce_mesh_executor.cc + ${CMAKE_CURRENT_SOURCE_DIR}/coll_custom_huge_all_reduce_mesh_executor.cc ) target_sources(hccl_alg PRIVATE diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.cc b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.cc new file mode 100644 index 0000000..a7f0de8 --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.cc @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "coll_custom_huge_all_reduce_mesh_executor.h" + +namespace hccl { +CollCustomHugeAllReduceMeshExecutor::CollCustomHugeAllReduceMeshExecutor(const HcclDispatcher dispatcher, + std::unique_ptr &topoMatcher) + : CollCommExecutor(dispatcher, topoMatcher) +{ +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::CalcScratchMemSize(u64 &scratchMemSize) +{ + // 计算所需要申请的 Scratch 内存大小 + // TODO: 选手可根据算法需要自行修改 + scratchMemSize = 0U; + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcScratchMemSize] scratchMemSize: %u", + scratchMemSize); + return HCCL_SUCCESS; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::CalcStreamNum(u32 &streamNum) +{ + // 计算所需要申请的 Stream 数量 + // TODO: 选手可根据算法需要自行修改 + u32 totalStreamNum = topoAttr_.deviceNumPerAggregation; + streamNum = totalStreamNum - 1U; + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcStreamNum] streamNum: %u", streamNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) +{ + // 计算所需要申请的 Notify 数量 + // TODO: 选手可根据算法需要自行修改 + notifyNum = 2U * streamNum; + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcNotifyNum] notifyNum: %u", notifyNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::CalcCommInfo(std::vector &opTransport) +{ + // 计算通信域信息 + // TODO: 选手可根据算法需要自行修改 + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcNotifyNum]"); + + // CCL_Input -> CCL_Output + TransportMemType inputType = TransportMemType::CCL_INPUT; + TransportMemType outputType = TransportMemType::CCL_OUTPUT; + // 建立 Mesh 链路 + CommParaInfo commParaLevel0(COMM_LEVEL0, CommType::COMM_TAG_MESH); + // 构造一级通信域资源请求 + // 最终将调用:CalcMeshTransportReq::CalcTransportRequest() + CHK_RET(CalcCommPlaneInfo(tag_, commParaLevel0, opTransport[COMM_LEVEL0], inputType, outputType)); + return HCCL_SUCCESS; +} + +u64 CollCustomHugeAllReduceMeshExecutor::CalcLoopMaxCount(const u64 
cclBuffSize, const u32 unitSize) +{ + // 计算循环处理的迭代次数 + // TODO: 选手可根据算法需要自行修改 + + u64 maxCountPerLoop = cclBuffSize / unitSize; + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcLoopMaxCount] maxCountPerLoop: %u", + maxCountPerLoop); + return maxCountPerLoop; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::Orchestrate(OpParam ¶m, AlgResourceResponse &algRes) +{ + // 算法编排总入口 + // TODO: 选手可根据算法需要自行修改 + + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][Orchestrate] count: %u", param.DataDes.count); + tag_ = param.tag; + algResResp_ = &algRes; + + // User_Input 和 User_Output 指针 + u8 *userInputPtr = static_cast(param.inputPtr); + u8 *userOutputPtr = static_cast(param.outputPtr); + CHK_PTR_NULL(userInputPtr); + CHK_PTR_NULL(userOutputPtr); + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; + u64 maxCountPerLoop = CalcLoopMaxCount(algRes.cclInputMem.size(), unitSize); + + // 循环处理数据 + for (u64 countLeft = param.DataDes.count, curCount = 0, inputOffset = 0, outputOffset = 0; countLeft > 0;) { + curCount = (countLeft > maxCountPerLoop) ? 
maxCountPerLoop : countLeft; + u64 curSize = curCount * unitSize; // curSize 为三种数据量:512K/2M/64M + + // 构造本次循环所使用的内存信息 + ExecMem execMem; + execMem.count = curCount; // 本次循环处理的数据量 + execMem.inputPtr = userInputPtr + inputOffset; // 本次循环使用的 User_Input 内存指针 + execMem.outputPtr = userOutputPtr + outputOffset; // 本次循环使用的 User_Output 内存指针 + execMem.inputMem = algRes.cclInputMem; // 本端的 CCL_Input 内存 + execMem.outputMem = algRes.cclOutputMem; // 本端的 CCL_Output 内存 + execMem.scratchMem = algRes.scratchMem; // 本端的 Scratch 内存 + + // 处理本次循环 + CHK_RET(KernelRun(param, execMem)); + + // 更新偏移量 + countLeft -= curCount; + inputOffset = curSize; + outputOffset = curSize; + } + return HCCL_SUCCESS; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::KernelRun(const OpParam ¶m, ExecMem &execMem) +{ + // 处理单次循环的数据 + // TODO: 选手可根据算法需要自行修改 + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; // 数据类型的字节数 + u64 curSize = execMem.count * unitSize; // 本次循环需要处理的数据大小,三种数据量:512K/2m/64m,单位:字节 + hccl::Stream &masterStream = const_cast(param.stream); // 主流 + + // TODO: 流同步 + + CHK_RET(CheckCommSize(COMM_LEVEL0, COMM_INDEX_0 + 1)); + SubCommInfo level0CommInfo = GetSubCommInfo(COMM_LEVEL0, COMM_INDEX_0); + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][KernelRun] localRank: %u, localRankSize: %u", + level0CommInfo.localRank, level0CommInfo.localRankSize); + + // TODO: 搬运数据 + + return HCCL_SUCCESS; +} + +REGISTER_EXEC("CustomHugeAllReduceMeshExecutor", CustomHugeAllReduceMesh, CollCustomHugeAllReduceMeshExecutor); +} // namespace hccl diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.h b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.h new file mode 100644 index 0000000..707a5d6 --- /dev/null +++ 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef COLL_CUSTOM_HUGE_ALLREDUCE_MESH_EXECUTOR_H +#define COLL_CUSTOM_HUGE_ALLREDUCE_MESH_EXECUTOR_H + +#include "coll_comm_executor.h" + +namespace hccl { +class CollCustomHugeAllReduceMeshExecutor : public CollCommExecutor { +public: + CollCustomHugeAllReduceMeshExecutor(const HcclDispatcher dispatcher, std::unique_ptr &topoMatcher); + ~CollCustomHugeAllReduceMeshExecutor() = default; + +private: + /* *************** 资源计算 *************** */ + HcclResult CalcScratchMemSize(u64 &scratchMemSize) override; + HcclResult CalcStreamNum(u32 &streamNum) override; + HcclResult CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) override; + HcclResult CalcCommInfo(std::vector &opTransport) override; + u64 CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize); + + /* *************** 算法编排 *************** */ + HcclResult Orchestrate(OpParam ¶m, AlgResourceResponse &algRes); + HcclResult KernelRun(const OpParam ¶m, ExecMem &execMem) override; +}; +} // namespace hccl + +#endif diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.cc 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.cc new file mode 100644 index 0000000..6647cfc --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.cc @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "coll_custom_medium_all_reduce_mesh_executor.h" + +namespace hccl { +CollCustomMediumAllReduceMeshExecutor::CollCustomMediumAllReduceMeshExecutor(const HcclDispatcher dispatcher, + std::unique_ptr &topoMatcher) + : CollCommExecutor(dispatcher, topoMatcher) +{ +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::CalcScratchMemSize(u64 &scratchMemSize) +{ + // 计算所需要申请的 Scratch 内存大小 + // TODO: 选手可根据算法需要自行修改 + scratchMemSize = 0U; + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcScratchMemSize] scratchMemSize: %u", + scratchMemSize); + return HCCL_SUCCESS; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::CalcStreamNum(u32 &streamNum) +{ + // 计算所需要申请的 Stream 数量 + // TODO: 选手可根据算法需要自行修改 + u32 totalStreamNum = topoAttr_.deviceNumPerAggregation; + streamNum = totalStreamNum - 1U; + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcStreamNum] streamNum: %u", streamNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::CalcNotifyNum(u32 
streamNum, u32 ¬ifyNum) +{ + // 计算所需要申请的 Notify 数量 + // TODO: 选手可根据算法需要自行修改 + notifyNum = 2U * streamNum; + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcNotifyNum] notifyNum: %u", notifyNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::CalcCommInfo(std::vector &opTransport) +{ + // 计算通信域信息 + // TODO: 选手可根据算法需要自行修改 + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcNotifyNum]"); + + // CCL_Input -> CCL_Output + TransportMemType inputType = TransportMemType::CCL_INPUT; + TransportMemType outputType = TransportMemType::CCL_OUTPUT; + // 建立 Mesh 链路 + CommParaInfo commParaLevel0(COMM_LEVEL0, CommType::COMM_TAG_MESH); + // 构造一级通信域资源请求 + // 最终将调用:CalcMeshTransportReq::CalcTransportRequest() + CHK_RET(CalcCommPlaneInfo(tag_, commParaLevel0, opTransport[COMM_LEVEL0], inputType, outputType)); + return HCCL_SUCCESS; +} + +u64 CollCustomMediumAllReduceMeshExecutor::CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize) +{ + // 计算循环处理的迭代次数 + // TODO: 选手可根据算法需要自行修改 + + u64 maxCountPerLoop = cclBuffSize / unitSize; + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcLoopMaxCount] maxCountPerLoop: %u", + maxCountPerLoop); + return maxCountPerLoop; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::Orchestrate(OpParam ¶m, AlgResourceResponse &algRes) +{ + // 算法编排总入口 + // TODO: 选手可根据算法需要自行修改 + + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][Orchestrate] count: %u", param.DataDes.count); + tag_ = param.tag; + algResResp_ = &algRes; + + // User_Input 和 User_Output 指针 + u8 *userInputPtr = static_cast(param.inputPtr); + u8 *userOutputPtr = static_cast(param.outputPtr); + CHK_PTR_NULL(userInputPtr); + CHK_PTR_NULL(userOutputPtr); + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; + u64 maxCountPerLoop = CalcLoopMaxCount(algRes.cclInputMem.size(), unitSize); + + // 循环处理数据 + for (u64 countLeft = param.DataDes.count, curCount = 0, inputOffset = 0, 
outputOffset = 0; countLeft > 0;) { + curCount = (countLeft > maxCountPerLoop) ? maxCountPerLoop : countLeft; + u64 curSize = curCount * unitSize; // curSize 为三种数据量:512K/2M/64M + + // 构造本次循环所使用的内存信息 + ExecMem execMem; + execMem.count = curCount; // 本次循环处理的数据量 + execMem.inputPtr = userInputPtr + inputOffset; // 本次循环使用的 User_Input 内存指针 + execMem.outputPtr = userOutputPtr + outputOffset; // 本次循环使用的 User_Output 内存指针 + execMem.inputMem = algRes.cclInputMem; // 本端的 CCL_Input 内存 + execMem.outputMem = algRes.cclOutputMem; // 本端的 CCL_Output 内存 + execMem.scratchMem = algRes.scratchMem; // 本端的 Scratch 内存 + + // 处理本次循环 + CHK_RET(KernelRun(param, execMem)); + + // 更新偏移量 + countLeft -= curCount; + inputOffset = curSize; + outputOffset = curSize; + } + return HCCL_SUCCESS; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::KernelRun(const OpParam ¶m, ExecMem &execMem) +{ + // 处理单次循环的数据 + // TODO: 选手可根据算法需要自行修改 + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; // 数据类型的字节数 + u64 curSize = execMem.count * unitSize; // 本次循环需要处理的数据大小,三种数据量:512K/2m/64m,单位:字节 + hccl::Stream &masterStream = const_cast(param.stream); // 主流 + + // TODO: 流同步 + + CHK_RET(CheckCommSize(COMM_LEVEL0, COMM_INDEX_0 + 1)); + SubCommInfo level0CommInfo = GetSubCommInfo(COMM_LEVEL0, COMM_INDEX_0); + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][KernelRun] localRank: %u, localRankSize: %u", + level0CommInfo.localRank, level0CommInfo.localRankSize); + + // TODO: 搬运数据 + + return HCCL_SUCCESS; +} + +REGISTER_EXEC("CustomMediumAllReduceMeshExecutor", CustomMediumAllReduceMesh, CollCustomMediumAllReduceMeshExecutor); +} // namespace hccl diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.h b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.h new file mode 100644 index 0000000..cfa4d4b --- /dev/null +++ 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef COLL_CUSTOM_MEDIUM_ALLREDUCE_MESH_EXECUTOR_H +#define COLL_CUSTOM_MEDIUM_ALLREDUCE_MESH_EXECUTOR_H + +#include "coll_comm_executor.h" + +namespace hccl { +class CollCustomMediumAllReduceMeshExecutor : public CollCommExecutor { +public: + CollCustomMediumAllReduceMeshExecutor(const HcclDispatcher dispatcher, std::unique_ptr &topoMatcher); + ~CollCustomMediumAllReduceMeshExecutor() = default; + +private: + /* *************** 资源计算 *************** */ + HcclResult CalcScratchMemSize(u64 &scratchMemSize) override; + HcclResult CalcStreamNum(u32 &streamNum) override; + HcclResult CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) override; + HcclResult CalcCommInfo(std::vector &opTransport) override; + u64 CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize); + + /* *************** 算法编排 *************** */ + HcclResult Orchestrate(OpParam ¶m, AlgResourceResponse &algRes); + HcclResult KernelRun(const OpParam ¶m, ExecMem &execMem) override; +}; +} // namespace hccl + +#endif diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.cc 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.cc new file mode 100644 index 0000000..7aebc74 --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.cc @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "coll_custom_small_all_reduce_mesh_executor.h" + +namespace hccl { +CollCustomSmallAllReduceMeshExecutor::CollCustomSmallAllReduceMeshExecutor(const HcclDispatcher dispatcher, + std::unique_ptr &topoMatcher) + : CollCommExecutor(dispatcher, topoMatcher) +{ +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::CalcScratchMemSize(u64 &scratchMemSize) +{ + // 计算所需要申请的 Scratch 内存大小 + // TODO: 选手可根据算法需要自行修改 + scratchMemSize = 0U; + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcScratchMemSize] scratchMemSize: %u", + scratchMemSize); + return HCCL_SUCCESS; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::CalcStreamNum(u32 &streamNum) +{ + // 计算所需要申请的 Stream 数量 + // TODO: 选手可根据算法需要自行修改 + u32 totalStreamNum = topoAttr_.deviceNumPerAggregation; + streamNum = totalStreamNum - 1U; + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcStreamNum] streamNum: %u", streamNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::CalcNotifyNum(u32 streamNum, u32 
¬ifyNum) +{ + // 计算所需要申请的 Notify 数量 + // TODO: 选手可根据算法需要自行修改 + notifyNum = 2U * streamNum; + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcNotifyNum] notifyNum: %u", notifyNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::CalcCommInfo(std::vector &opTransport) +{ + // 计算通信域信息 + // TODO: 选手可根据算法需要自行修改 + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcNotifyNum]"); + + // CCL_Input -> CCL_Output + TransportMemType inputType = TransportMemType::CCL_INPUT; + TransportMemType outputType = TransportMemType::CCL_OUTPUT; + // 建立 Mesh 链路 + CommParaInfo commParaLevel0(COMM_LEVEL0, CommType::COMM_TAG_MESH); + // 构造一级通信域资源请求 + // 最终将调用:CalcMeshTransportReq::CalcTransportRequest() + CHK_RET(CalcCommPlaneInfo(tag_, commParaLevel0, opTransport[COMM_LEVEL0], inputType, outputType)); + return HCCL_SUCCESS; +} + +u64 CollCustomSmallAllReduceMeshExecutor::CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize) +{ + // 计算循环处理的迭代次数 + // TODO: 选手可根据算法需要自行修改 + + u64 maxCountPerLoop = cclBuffSize / unitSize; + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcLoopMaxCount] maxCountPerLoop: %u", + maxCountPerLoop); + return maxCountPerLoop; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::Orchestrate(OpParam ¶m, AlgResourceResponse &algRes) +{ + // 算法编排总入口 + // TODO: 选手可根据算法需要自行修改 + + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][Orchestrate] count: %u", param.DataDes.count); + tag_ = param.tag; + algResResp_ = &algRes; + + // User_Input 和 User_Output 指针 + u8 *userInputPtr = static_cast(param.inputPtr); + u8 *userOutputPtr = static_cast(param.outputPtr); + CHK_PTR_NULL(userInputPtr); + CHK_PTR_NULL(userOutputPtr); + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; + u64 maxCountPerLoop = CalcLoopMaxCount(algRes.cclInputMem.size(), unitSize); + + // 循环处理数据 + for (u64 countLeft = param.DataDes.count, curCount = 0, inputOffset = 0, outputOffset = 0; countLeft > 
0;) { + curCount = (countLeft > maxCountPerLoop) ? maxCountPerLoop : countLeft; + u64 curSize = curCount * unitSize; // curSize 为三种数据量:512K/2M/64M + + // 构造本次循环所使用的内存信息 + ExecMem execMem; + execMem.count = curCount; // 本次循环处理的数据量 + execMem.inputPtr = userInputPtr + inputOffset; // 本次循环使用的 User_Input 内存指针 + execMem.outputPtr = userOutputPtr + outputOffset; // 本次循环使用的 User_Output 内存指针 + execMem.inputMem = algRes.cclInputMem; // 本端的 CCL_Input 内存 + execMem.outputMem = algRes.cclOutputMem; // 本端的 CCL_Output 内存 + execMem.scratchMem = algRes.scratchMem; // 本端的 Scratch 内存 + + // 处理本次循环 + CHK_RET(KernelRun(param, execMem)); + + // 更新偏移量 + countLeft -= curCount; + inputOffset = curSize; + outputOffset = curSize; + } + return HCCL_SUCCESS; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::KernelRun(const OpParam ¶m, ExecMem &execMem) +{ + // 处理单次循环的数据 + // TODO: 选手可根据算法需要自行修改 + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; // 数据类型的字节数 + u64 curSize = execMem.count * unitSize; // 本次循环需要处理的数据大小,三种数据量:512K/2m/64m,单位:字节 + hccl::Stream &masterStream = const_cast(param.stream); // 主流 + + // TODO: 流同步 + + CHK_RET(CheckCommSize(COMM_LEVEL0, COMM_INDEX_0 + 1)); + SubCommInfo level0CommInfo = GetSubCommInfo(COMM_LEVEL0, COMM_INDEX_0); + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][KernelRun] localRank: %u, localRankSize: %u", + level0CommInfo.localRank, level0CommInfo.localRankSize); + + // TODO: 搬运数据 + + return HCCL_SUCCESS; +} + +REGISTER_EXEC("CustomSmallAllReduceMeshExecutor", CustomSmallAllReduceMesh, CollCustomSmallAllReduceMeshExecutor); +} // namespace hccl diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.h b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.h new file mode 100644 index 0000000..f373a0e --- /dev/null +++ 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef COLL_CUSTOM_SMALL_ALLREDUCE_MESH_EXECUTOR_H +#define COLL_CUSTOM_SMALL_ALLREDUCE_MESH_EXECUTOR_H + +#include "coll_comm_executor.h" + +namespace hccl { +class CollCustomSmallAllReduceMeshExecutor : public CollCommExecutor { +public: + CollCustomSmallAllReduceMeshExecutor(const HcclDispatcher dispatcher, std::unique_ptr &topoMatcher); + ~CollCustomSmallAllReduceMeshExecutor() = default; + +private: + /* *************** 资源计算 *************** */ + HcclResult CalcScratchMemSize(u64 &scratchMemSize) override; + HcclResult CalcStreamNum(u32 &streamNum) override; + HcclResult CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) override; + HcclResult CalcCommInfo(std::vector &opTransport) override; + u64 CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize); + + /* *************** 算法编排 *************** */ + HcclResult Orchestrate(OpParam ¶m, AlgResourceResponse &algRes); + HcclResult KernelRun(const OpParam ¶m, ExecMem &execMem) override; +}; +} // namespace hccl + +#endif diff --git a/src/domain/collective_communication/algorithm/impl/operator/CMakeLists.txt b/src/domain/collective_communication/algorithm/impl/operator/CMakeLists.txt index ccf812f..b1726cd 100644 --- 
a/src/domain/collective_communication/algorithm/impl/operator/CMakeLists.txt +++ b/src/domain/collective_communication/algorithm/impl/operator/CMakeLists.txt @@ -13,6 +13,7 @@ set(src_list ${CMAKE_CURRENT_SOURCE_DIR}/send_operator.cc ${CMAKE_CURRENT_SOURCE_DIR}/receive_operator.cc ${CMAKE_CURRENT_SOURCE_DIR}/batch_write_operator.cc + ${CMAKE_CURRENT_SOURCE_DIR}/custom_all_reduce_operator.cc ) target_sources(hccl_alg PRIVATE diff --git a/src/domain/collective_communication/algorithm/impl/operator/all_reduce_operator.cc b/src/domain/collective_communication/algorithm/impl/operator/all_reduce_operator.cc index 626018d..76bfd01 100644 --- a/src/domain/collective_communication/algorithm/impl/operator/all_reduce_operator.cc +++ b/src/domain/collective_communication/algorithm/impl/operator/all_reduce_operator.cc @@ -599,6 +599,6 @@ HcclResult AllReduceOperator::SelectAlgfor91093(const OpParam& param, std::strin return HCCL_SUCCESS; } -REGISTER_OP(HcclCMDType::HCCL_CMD_ALLREDUCE, AllReduce, AllReduceOperator); +// REGISTER_OP(HcclCMDType::HCCL_CMD_ALLREDUCE, AllReduce, AllReduceOperator); } diff --git a/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc new file mode 100644 index 0000000..cd73e75 --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "custom_all_reduce_operator.h" +#include "coll_alg_op_registry.h" + +namespace hccl { + +CustomAllReduceOperator::CustomAllReduceOperator(AlgConfigurator *algConfigurator, CCLBufferManager &cclBufferManager, + HcclDispatcher dispatcher, std::unique_ptr &topoMatcher) + : CollAlgOperator(algConfigurator, cclBufferManager, dispatcher, topoMatcher, HcclCMDType::HCCL_CMD_ALLREDUCE) +{ +} + +CustomAllReduceOperator::~CustomAllReduceOperator() {} + +HcclResult CustomAllReduceOperator::SelectAlg(const std::string &tag, const OpParam &param, std::string &algName, + std::string &newTag) +{ + constexpr u64 HCCL_CONTEST_SMALL_COUNT_KB = 512 * 1024; // 512KB + constexpr u64 HCCL_CONTEST_MEDIUM_COUNT_KB = 2 * 1024 * 1024; // 2MB + constexpr u64 HCCL_CONTEST_HUGE_COUNT_KB = 64 * 1024 * 1024; // 64MB + + // 算法选择逻辑 + // TODO: 选手可根据数据量大小选择合适的 Executor + // 注意: + // 1. 相同算法在不同数据量下的性能不同 + // 2. 选手可以先只实现一个 Executor,算法选择时直接设置 algName 为该 Executor 的名字 + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; + u64 dataSize = param.DataDes.count * unitSize; // 单位:字节,三种数据量:512K/2M/64M + if (dataSize <= HCCL_CONTEST_SMALL_COUNT_KB) { + algName = "CustomSmallAllReduceMeshExecutor"; + } else if (dataSize <= HCCL_CONTEST_MEDIUM_COUNT_KB) { + algName = "CustomMediumAllReduceMeshExecutor"; + } else { + algName = "CustomHugeAllReduceMeshExecutor"; + } + return HCCL_SUCCESS; +} + +// 注册算子 +REGISTER_OP(HcclCMDType::HCCL_CMD_ALLREDUCE, AllReduce, CustomAllReduceOperator); +} // namespace hccl diff --git a/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.h b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.h new file mode 100644 index 0000000..41ae73d --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co.,
Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef CUSTOM_ALL_REDUCE_OPERATOR_H +#define CUSTOM_ALL_REDUCE_OPERATOR_H + +#include "coll_alg_operator.h" + +namespace hccl { +// 数据规模分类 +enum class HcclDataCountType { HCCL_COUNT_SMALL = 0, HCCL_COUNT_MEDIUM, HCCL_COUNT_HUGE, HCCL_COUNT_RESERVED }; + +class CustomAllReduceOperator : public CollAlgOperator { +public: + CustomAllReduceOperator(AlgConfigurator *algConfigurator, CCLBufferManager &cclBufferManager, + HcclDispatcher dispatcher, std::unique_ptr &topoMatcher); + + ~CustomAllReduceOperator(); + + HcclResult SelectAlg(const std::string &tag, const OpParam &param, std::string &algName, + std::string &newTag) override; +}; +} // namespace hccl +#endif diff --git a/submit.sh b/submit.sh new file mode 100755 index 0000000..655e94b --- /dev/null +++ b/submit.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e + +# bash fonts colors +red='\e[31m' +yellow='\e[33m' +green='\e[92m' +none='\e[0m' + +error() { echo -e "${red}$*${none}" && exit 1; } +warning() { echo -e "${yellow}$*${none}"; } +info() { echo -e "${green}$*${none}"; } + +src_dir="/workspace/cann-hccl" +dst_dir="/result" + +operator_dir="src/domain/collective_communication/algorithm/impl/operator" +executor_dir="src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce" + +files=( + "${operator_dir}/custom_all_reduce_operator.h" + "${operator_dir}/custom_all_reduce_operator.cc" +
"${executor_dir}/coll_custom_small_all_reduce_mesh_executor.h" + "${executor_dir}/coll_custom_small_all_reduce_mesh_executor.cc" + "${executor_dir}/coll_custom_medium_all_reduce_mesh_executor.h" + "${executor_dir}/coll_custom_medium_all_reduce_mesh_executor.cc" + "${executor_dir}/coll_custom_huge_all_reduce_mesh_executor.h" + "${executor_dir}/coll_custom_huge_all_reduce_mesh_executor.cc" +) + +for file in "${files[@]}"; do + file_path="${src_dir}/${file}" + if [ -f "${file_path}" ]; then + cp -i "${file_path}" "${dst_dir}" + info "Copied: ${file_path} to ${dst_dir}" + else + error "No such file: ${file_path}" + fi +done + +info "All files copied successfully to ${dst_dir}" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bbd859a..8475b05 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -418,6 +418,7 @@ set(src_list_alg ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/registry/coll_alg_op_registry.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/coll_alg_operator.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/all_reduce_operator.cc + ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/custom_all_reduce_operator.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/all_gather_operator.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/all_gather_v_operator.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/reduce_scatter_operator.cc @@ -477,8 +478,7 @@ target_compile_options(hccl_alg_test PRIVATE -fno-strict-aliasing -pipe -std=c++14 - -Os - -O2 + -O0 -g -fstack-protector-all $<$:-fsanitize=address -fsanitize-recover=address,all -fno-omit-frame-pointer -g> ) @@ -521,9 +521,10 @@ add_custom_target(hccl_alg_test_lib COMMAND cd ${CMAKE_INSTALL_PREFIX}/hccl_lib ) -add_custom_command(TARGET hccl_alg_test POST_BUILD - COMMAND ${CMAKE_STRIP} $ -) +# 禁用 strip +# add_custom_command(TARGET hccl_alg_test POST_BUILD +# COMMAND ${CMAKE_STRIP} $ +# ) install(TARGETS hccl_alg_test LIBRARY DESTINATION lib OPTIONAL diff --git a/test/algorithm/testcase/main.cc b/test/algorithm/testcase/main.cc index 
db03489..74d7e92 100644 --- a/test/algorithm/testcase/main.cc +++ b/test/algorithm/testcase/main.cc @@ -2,7 +2,7 @@ GTEST_API_ int main(int argc, char **argv) { // testcase调试代码,只跑特定的用例 - //testing::GTEST_FLAG(filter) = "AllReduceTest.allreduce_cyw_test"; + testing::GTEST_FLAG(filter) = "AllReduceTest.allreduce_contest_test*"; testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/test/algorithm/testcase/testcase_all_reduce.cc b/test/algorithm/testcase/testcase_all_reduce.cc index 7dc31ad..9f87d77 100644 --- a/test/algorithm/testcase/testcase_all_reduce.cc +++ b/test/algorithm/testcase/testcase_all_reduce.cc @@ -1751,4 +1751,394 @@ TEST_F(AllReduceTest, allreduce_aiv_determinstic_test) ret = checker.Check(checkerOpParam, topoMeta); // EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); -} \ No newline at end of file +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_512k_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 512k + int8 + u64 size = 512 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_2m_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 2m + int8 + u64 size = 2 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + 
checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_64m_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 64m + int8 + u64 size = 64 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_1g_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 1g + int8 + u64 size = 1 * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + 
EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_4g_int8) +{ + // Topology: single server, 8 devices + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 4g + int8 + u64 size = 4ULL * 1024 * 1024 * 1024; // ULL: the 4 GiB product overflows 32-bit int arithmetic + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_512k_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 512k + fp16 + u64 size = 512 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_2m_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 2m + fp16 + u64 size = 2 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; +
checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_64m_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 64m + fp16 + u64 size = 64 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_1g_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 1g + fp16 + u64 size = 1 * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + 
EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_4g_fp16) +{ + // Topology: single server, 8 devices + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 4g + fp16 + u64 size = 4ULL * 1024 * 1024 * 1024; // ULL: the 4 GiB product overflows 32-bit int arithmetic + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_512k_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 512k + fp32 + u64 size = 512 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_2m_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 2m + fp32 + u64 size = 2 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; +
checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_64m_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 64m + fp32 + u64 size = 64 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_1g_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 1g + fp32 + u64 size = 1 * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + 
EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_4g_fp32) +{ + // Topology: single server, 8 devices + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 4g + fp32 + u64 size = 4ULL * 1024 * 1024 * 1024; // ULL: the 4 GiB product overflows 32-bit int arithmetic + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} -- Gitee From 0de11bc67d74e56eca10eb633ad2906ac3290c83 Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Fri, 18 Jul 2025 14:20:48 +0000 Subject: [PATCH 3/7] =?UTF-8?q?!71=20[HCCL=E7=AB=9E=E8=B5=9B]=20=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E5=8F=82=E8=B5=9B=E6=8C=87=E5=AF=BC=E6=96=87=E6=A1=A3?= =?UTF-8?q?=20*=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contest.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contest.md b/contest.md index 26b6e9f..ed14660 100644 --- a/contest.md +++ b/contest.md @@ -13,7 +13,8 @@ HCCL 资料: -- [昇腾社区官网][1腾社区][2] +- [昇腾社区官网][1] +- [HCCL主页——昇腾社区][2] - [HCCL概述——昇腾社区][3] - [集合通信原语——昇腾社区][4] - [HCCL代码仓][5] -- Gitee From 66d90510a04e37122036bac7fac5e4f43d23ea70 Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Sun, 20 Jul 2025 10:58:27 +0000 Subject: [PATCH 4/7] =?UTF-8?q?!72=20[HCCL=E7=AB=9E=E8=B5=9B]=20=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E8=B5=9B=E9=A2=98=E8=B5=84=E6=96=99=20Merge=20pull=20?= =?UTF-8?q?request=20!72=20from=20Yuanhao=20Ji/contest-4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ---
Dockerfile | 142 ++++++------------ contest.md | 48 +++--- .../operator/custom_all_reduce_operator.cc | 10 +- submit.sh | 2 +- 4 files changed, 76 insertions(+), 126 deletions(-) diff --git a/Dockerfile b/Dockerfile index ae431ab..f853e97 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,20 @@ # NOTE: Building this image requires docker version >= 18.0 -ARG TARGETPLATFORM=linux/arm64 ARG BASE_IMAGE=ubuntu:22.04 -ARG PYTHON_VERSION=3.10 -# 阶段 1:安装依赖 -FROM ${BASE_IMAGE} AS base +FROM ${BASE_IMAGE} AS official + +ARG TARGETPLATFORM=linux/arm64 + +ENV USER_PASSWD=change_me + +SHELL [ "/bin/bash", "-c" ] + +RUN cp /etc/apt/sources.list /etc/apt/sources.list.backup && \ + case ${TARGETPLATFORM} in \ + "linux/arm64") sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list ;; \ + *) sed -i 's|archive.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list ;; \ + esac RUN apt-get update \ && apt-get install --no-install-recommends -y \ @@ -20,6 +29,13 @@ RUN apt-get update \ g++ \ make \ cmake \ + python3 \ + python3-pip \ + gdb \ + vim \ + file \ + man \ + sudo \ zlib1g \ openssl \ unzip \ @@ -40,39 +56,25 @@ RUN apt-get update \ libgdbm-dev \ liblzma-dev \ libev-dev \ + openssh-server \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* \ && rm -rf /var/tmp/* \ && rm -rf /tmp/* -# 阶段 2:安装 Conda -FROM base AS conda-installer +# 创建 hccl 用户 +RUN groupadd -g 1000 hcclgroup && \ + useradd -u 1000 -g hcclgroup -ms /bin/bash hccluser && \ + usermod -aG sudo hccluser && \ + echo "hccluser ALL=(ALL) NOPASSWD:/usr/bin/apt-get,/usr/bin/apt" >> /etc/sudoers -ARG TARGETPLATFORM -ARG PYTHON_VERSION - -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") ARCH=aarch64 ;; \ - *) ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -o /tmp/miniconda.sh -O "https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-latest-Linux-${ARCH}.sh" - -RUN chmod +x /tmp/miniconda.sh && \ - bash /tmp/miniconda.sh -b -p /opt/conda && \ - rm /tmp/miniconda.sh && \ - 
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} && \ - /opt/conda/bin/conda clean -ya - -# 阶段 3:安装 CANN 8.2.RC1.alpha003 -FROM conda-installer AS cann-installer - -ARG TARGETPLATFORM -ENV PATH=/opt/conda/bin:${PATH} +USER hccluser RUN pip install --no-cache-dir -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple \ attrs cython numpy==1.24.0 decorator sympy cffi pyyaml pathlib2 \ psutil protobuf==3.20 scipy requests absl-py +# 安装 CANN 8.2.RC1.alpha003 RUN case ${TARGETPLATFORM} in \ "linux/arm64") ARCH=aarch64 ;; \ *) ARCH=x86_64 ;; \ @@ -82,112 +84,58 @@ RUN case ${TARGETPLATFORM} in \ CANN_COMMUNITY_SDK_URL="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-communitysdk_8.2.RC1.alpha003_linux-${ARCH}.run" && \ curl -fsSL -o /tmp/Ascend-cann-communitysdk.run -O "${CANN_COMMUNITY_SDK_URL}" -# 安装 CANN Toolkit RUN chmod +x /tmp/Ascend-cann-toolkit.run && \ /tmp/Ascend-cann-toolkit.run --quiet --install && \ rm /tmp/Ascend-cann-toolkit.run -# 安装 Community SDK RUN chmod +x /tmp/Ascend-cann-communitysdk.run && \ /tmp/Ascend-cann-communitysdk.run --quiet --full && \ rm /tmp/Ascend-cann-communitysdk.run -# 阶段 4:下载 HCCL 仓库及其依赖 -FROM cann-installer AS hccl-installer - -WORKDIR /workspace - +# 安装 HCCL 依赖 RUN curl -fsSL -o /tmp/include.zip -O https://github.com/nlohmann/json/releases/download/v3.11.2/include.zip && \ - unzip -d /workspace/nlohmann_json /tmp/include.zip && \ + unzip -d ${HOME}/nlohmann_json /tmp/include.zip && \ rm /tmp/include.zip -# 安装 MPI RUN curl -fsSL -o /tmp/mpich.tar.gz -O https://www.mpich.org/static/downloads/3.2.1/mpich-3.2.1.tar.gz && \ - tar -zxf /tmp/mpich.tar.gz -C /workspace && \ - cd /workspace/mpich-3.2.1 && \ - ./configure --disable-fortran --prefix=/workspace/mpich --with-device=ch3:nemesis && \ + tar -zxf /tmp/mpich.tar.gz -C /tmp && \ + cd /tmp/mpich-3.2.1 && \ + ./configure --disable-fortran --prefix=${HOME}/mpich --with-device=ch3:nemesis && \ make && make install && \ 
- rm -r /workspace/mpich-3.2.1 && \ + rm -r /tmp/mpich-3.2.1 && \ rm /tmp/mpich.tar.gz # 设置环境变量 RUN \ - # Conda 环境变量 - echo 'export PATH=/opt/conda/bin:${PATH}' >> /root/.bashrc && \ # NPU 驱动环境变量 - echo 'export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:${LD_LIBRARY_PATH}' >> /root/.bashrc && \ - echo 'export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/common:${LD_LIBRARY_PATH}' >> /root/.bashrc && \ + echo 'export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:${LD_LIBRARY_PATH}' >> ${HOME}/.bashrc && \ + echo 'export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/common:${LD_LIBRARY_PATH}' >> ${HOME}/.bashrc && \ # CANN Toolkit 环境变量 - echo 'source /usr/local/Ascend/ascend-toolkit/set_env.sh' >> /root/.bashrc && \ + echo 'source ${HOME}/Ascend/ascend-toolkit/set_env.sh' >> ${HOME}/.bashrc && \ # MPICH 环境变量 - echo 'export PATH=/workspace/mpich/bin:${PATH}' >> /root/.bashrc && \ - echo 'export LD_LIBRARY_PATH=/workspace/mpich/lib:${LD_LIBRARY_PATH}' >> /root/.bashrc - -# 阶段 5:安装 SSH -FROM base AS ssh-installer + echo 'export PATH=${HOME}/mpich/bin:${PATH}' >> ${HOME}/.bashrc && \ + echo 'export LD_LIBRARY_PATH=${HOME}/mpich/lib:${LD_LIBRARY_PATH}' >> ${HOME}/.bashrc -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - openssh-server +USER root # SSH 配置 RUN echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \ - echo "PermitRootLogin yes" >> /etc/ssh/sshd_config && \ + echo "PermitRootLogin no" >> /etc/ssh/sshd_config && \ echo "PermitUserEnvironment yes" >> /etc/ssh/sshd_config && \ echo "ClientAliveInterval 60" >> /etc/ssh/sshd_config && \ - echo "ClientAliveCountMax 3" >> /etc/ssh/sshd_config + echo "ClientAliveCountMax 3" >> /etc/ssh/sshd_config && \ + echo "AllowUsers hccluser" >> /etc/ssh/sshd_config # SSH 启动脚本 RUN echo '#!/bin/bash' > /start.sh && \ - echo 'if [ -n "${ROOT_PASSWD}" ]; then' >> /start.sh && \ - echo ' echo "root:${ROOT_PASSWD}" | chpasswd' >> /start.sh && \ + echo 'if [ -n "${USER_PASSWD}" ]; 
then' >> /start.sh && \ + echo ' echo "hccluser:${USER_PASSWD}" | chpasswd' >> /start.sh && \ echo 'fi' >> /start.sh && \ echo 'mkdir -p /var/run/sshd' >> /start.sh && \ echo 'ssh-keygen -A' >> /start.sh && \ echo '/usr/sbin/sshd -D -e' >> /start.sh && \ chmod +x /start.sh -# 最终阶段:安装运行所需依赖,复制前面阶段结果 -FROM ${BASE_IMAGE} AS official - -ENV ROOT_PASSWD=change_me - -SHELL [ "/bin/bash", "-c" ] - -RUN apt-get update \ - && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ - apt-transport-https \ - ca-certificates \ - bash \ - libc6 \ - libsqlite3-dev \ - git \ - gcc \ - g++ \ - gdb \ - make \ - cmake \ - file \ - vim \ - netcat \ - curl \ - wget \ - openssh-server \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /var/tmp/* \ - && rm -rf /tmp/* - -COPY --from=cann-installer /opt/conda /opt/conda -COPY --from=cann-installer /etc/Ascend /etc/Ascend -COPY --from=cann-installer /usr/local/Ascend /usr/local/Ascend -COPY --from=hccl-installer /root/.bashrc /root/.bashrc -COPY --from=hccl-installer /workspace /workspace -COPY --from=ssh-installer /etc/ssh/sshd_config /etc/ssh/sshd_config -COPY --from=ssh-installer /start.sh /start.sh - EXPOSE 22 -WORKDIR /workspace - CMD [ "/start.sh" ] diff --git a/contest.md b/contest.md index ed14660..d4a35e8 100644 --- a/contest.md +++ b/contest.md @@ -69,13 +69,13 @@ ssh root@ip -p port |-- /dev | |-- davinci1 # NPU1 | `-- davinci2 # NPU2 -|-- /etc/Ascend -| `-- ascend_cann_install.info # CANN 安装信息 |-- /usr/local/Ascend -| |-- ascend-toolkit # CANN Toolkit 安装目录 | `-- driver # NPU 驱动安装目录 -`-- /workspace - |-- cann-hccl # HCCL 代码仓,选手需自行下载 +|-- /home/hccluser/Ascend +| |-- ascend-toolkit # CANN Toolkit 安装目录 +| `-- ascend_cann_install.info # CANN 安装信息 +`-- /home/hccluser + |-- cann-hccl # HCCL 代码仓(选手需自行下载) |-- mpich # MPICH 安装目录 `-- nlohmann_json # nlohmann json inclue 目录 ``` @@ -102,7 +102,7 @@ ssh root@ip -p port > 【注意】选手只需下载 [ascend/cann-hccl](https://gitee.com/ascend/cann-hccl.git) 
代码仓即可,编译运行所需全部依赖已提前安装 ```bash -cd /workspace +cd /home/hccluser git clone https://gitee.com/ascend/cann-hccl.git -b r1.5.1 ``` @@ -130,17 +130,17 @@ git clone https://gitee.com/ascend/cann-hccl.git -b r1.5.1 编译所需的依赖项均已安装,在 HCCL 代码仓执行编译即可: ```bash -cd /workspace/cann-hccl +cd /home/hccluser/cann-hccl -bash build.sh --nlohmann_path /workspace/nlohmann_json/include +bash build.sh --nlohmann_path /home/hccluser/nlohmann_json/include ``` ## 6. 安装编译结果 -编译生成的 HCCL 软件包在 `/workspace/cann-hccl/output` 目录下: +编译生成的 HCCL 软件包在 `/home/hccluser/cann-hccl/output` 目录下: ```bash -cd /workspace/cann-hccl/output +cd /home/hccluser/cann-hccl/output ./CANN-hccl_alg-8.2.t12.0.b077-linux.aarch64.run ``` @@ -156,10 +156,10 @@ cd /workspace/cann-hccl/output 编译并执行算法分析器用例: ```bash -cd /workspace/cann-hccl +cd /home/hccluser/cann-hccl # 编译测试用例 -bash build.sh --nlohmann_path /workspace/nlohmann_json/include --test --open_hccl_test +bash build.sh --nlohmann_path /home/hccluser/nlohmann_json/include --test --open_hccl_test # 执行测试用例 ./build/test/open_hccl_test @@ -172,18 +172,18 @@ bash build.sh --nlohmann_path /workspace/nlohmann_json/include --test --open_hcc 基于 HCCL Test 工具在 NPU 设备上执行验证: ```bash -cd /usr/local/Ascend/ascend-toolkit/latest/tools/hccl_test +cd /home/hccluser/Ascend/ascend-toolkit/latest/tools/hccl_test # 编译 HCCL 性能测试工具 -make MPI_HOME=/workspace/mpich ASCEND_DIR=/usr/local/Ascend/ascend-toolkit/latest +make MPI_HOME=/home/hccluser/mpich ASCEND_DIR=/home/hccluser/Ascend/ascend-toolkit/latest # 执行 HCCL Test # 512K -mpirun -n 2 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 2 +mpirun -n 4 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 4 # 2M -mpirun -n 2 ./bin/all_reduce_test -b 2m -e 2m -d fp32 -o sum -p 2 +mpirun -n 4 ./bin/all_reduce_test -b 2m -e 2m -d fp32 -o sum -p 4 # 64M -mpirun -n 2 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 2 +mpirun -n 4 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 4 ``` > 工具详细说明可参考:[昇腾文档中心-HCCL 
性能测试工具使用指南](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha003/devaids/hccltool/HCCLpertest_16_0001.html) @@ -193,7 +193,9 @@ mpirun -n 2 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 2 执行下列脚本,将选手代码拷贝到 `/result` 目录下 ```bash -bash /workspace/cann-hccl/submit.sh +cd /home/hccluser/cann-hccl + +bash submit.sh ``` 该脚本将选手编写的定制算法文件拷贝至 `/result` 目录下,用于后续评测: @@ -242,7 +244,7 @@ export ASCEND_GLOBAL_LOG_LEVEL=1 # 0: debug, 1: info, 2: warn, 3: error 设置日志存储目录: ```bash -export ASCEND_PROCESS_LOG_PATH=/workspace/log # 默认为:$HOME/ascend/log +export ASCEND_PROCESS_LOG_PATH=/home/hccluser/log # 默认为:$HOME/ascend/log ``` 设置日志输出到控制台: @@ -266,14 +268,14 @@ export ASCEND_HOST_LOG_FILE_NUM=1000 > 【注意】选手本地开发编译 HCCL 代码时默认已开启 `-O0 -g` 编译选项,但最终评测时会开启 `-O3` ```bash -cd /usr/local/Ascend/ascend-toolkit/latest/tools/hccl_test +cd /home/hccluser/Ascend/ascend-toolkit/latest/tools/hccl_test # 512K -gdb --args mpirun -n 2 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 2 +gdb --args mpirun -n 4 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 4 # 2M -gdb --args mpirun -n 2 ./bin/all_reduce_test -b 2m -e 2m -d fp32 -o sum -p 2 +gdb --args mpirun -n 4 ./bin/all_reduce_test -b 2m -e 2m -d fp32 -o sum -p 4 # 64M -gdb --args mpirun -n 2 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 2 +gdb --args mpirun -n 4 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 4 ``` ### 10.3 Wrong answer 问题 diff --git a/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc index cd73e75..66dface 100644 --- a/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc +++ b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc @@ -24,9 +24,9 @@ CustomAllReduceOperator::~CustomAllReduceOperator() {} HcclResult CustomAllReduceOperator::SelectAlg(const std::string &tag, 
const OpParam &param, std::string &algName, std::string &newTag) { - constexpr u64 HCCL_CONTEST_SMALL_COUNT_KB = 512 * 1024; // 512KB - constexpr u64 HCCL_CONTEST_MEDIUM_COUNT_KB = 2 * 1024 * 1024; // 2MB - constexpr u64 HCCL_CONTEST_HUGE_COUNT_KB = 64 * 1024 * 1024; // 64MB + constexpr u64 HCCL_CONTEST_SMALL_COUNT = 512 * 1024; // 512KB + constexpr u64 HCCL_CONTEST_MEDIUM_COUNT = 2 * 1024 * 1024; // 2MB + constexpr u64 HCCL_CONTEST_HUGE_COUNT = 64 * 1024 * 1024; // 64MB // 算法选择逻辑 // TODO: 选手可根据数据量大小选择合适的 Executor @@ -36,9 +36,9 @@ HcclResult CustomAllReduceOperator::SelectAlg(const std::string &tag, const OpPa u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; u64 dataSize = param.DataDes.count * unitSize; // 单位:字节,三种数据量:512K/2M/64M - if (dataSize <= HCCL_CONTEST_SMALL_COUNT_KB) { + if (dataSize <= HCCL_CONTEST_SMALL_COUNT) { algName = "CustomSmallAllReduceMeshExecutor"; - } else if (dataSize <= HCCL_CONTEST_MEDIUM_COUNT_KB) { + } else if (dataSize <= HCCL_CONTEST_MEDIUM_COUNT) { algName = "CustomMediumAllReduceMeshExecutor"; } else { algName = "CustomHugeAllReduceMeshExecutor"; diff --git a/submit.sh b/submit.sh index 655e94b..b7b0d0a 100755 --- a/submit.sh +++ b/submit.sh @@ -12,7 +12,7 @@ error() { echo -e "${red}$*${none}" && exit 1; } warning() { echo -e "${yellow}$*${none}"; } info() { echo -e "${green}$*${none}"; } -src_dir="/workspace/cann-hccl" +src_dir="/home/hccluser/cann-hccl" dst_dir="/result" operator_dir="src/domain/collective_communication/algorithm/impl/operator" -- Gitee From cf6051654aaca24e2c0053f34e05c22aa2299df4 Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Mon, 21 Jul 2025 06:36:44 +0000 Subject: [PATCH 5/7] =?UTF-8?q?!73=20[HCCL=E7=AB=9E=E8=B5=9B]=20=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E8=B5=9B=E9=A2=98=E6=96=87=E6=A1=A3=20*=20update=20do?= =?UTF-8?q?c?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contest.md | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff
--git a/contest.md b/contest.md index d4a35e8..6ee60ee 100644 --- a/contest.md +++ b/contest.md @@ -67,8 +67,10 @@ ssh root@ip -p port ``` |-- /dev -| |-- davinci1 # NPU1 -| `-- davinci2 # NPU2 +| |-- davinci0 # NPU1 +| |-- davinci1 # NPU2 +| |-- davinci2 # NPU3 +| `-- davinci3 # NPU4 |-- /usr/local/Ascend | `-- driver # NPU 驱动安装目录 |-- /home/hccluser/Ascend @@ -158,10 +160,12 @@ cd /home/hccluser/cann-hccl/output ```bash cd /home/hccluser/cann-hccl -# 编译测试用例 +# 编译测试用例,并自动执行 bash build.sh --nlohmann_path /home/hccluser/nlohmann_json/include --test --open_hccl_test -# 执行测试用例 +# 手动执行测试用例 +export BUILD_TEST_DIR="/home/hccluser/cann-hccl/build/test/" +export LD_LIBRARY_PATH="${BUILD_TEST_DIR}:${LD_LIBRARY_PATH}" ./build/test/open_hccl_test ``` @@ -265,17 +269,15 @@ export ASCEND_HOST_LOG_FILE_NUM=1000 使用 gdb 调试: -> 【注意】选手本地开发编译 HCCL 代码时默认已开启 `-O0 -g` 编译选项,但最终评测时会开启 `-O3` +> 【注意】编译算法分析器依赖的 HCCL 代码时默认已开启 `-O0 -g` 编译选项 ```bash -cd /home/hccluser/Ascend/ascend-toolkit/latest/tools/hccl_test +cd /home/hccluser/cann-hccl -# 512K -gdb --args mpirun -n 4 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 4 -# 2M -gdb --args mpirun -n 4 ./bin/all_reduce_test -b 2m -e 2m -d fp32 -o sum -p 4 -# 64M -gdb --args mpirun -n 4 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 4 +# 基于算法分析器调试 HCCL 定制算法 +export BUILD_TEST_DIR="/home/hccluser/cann-hccl/build/test/" +export LD_LIBRARY_PATH="${BUILD_TEST_DIR}:${LD_LIBRARY_PATH}" +gdb --args ./build/test/open_hccl_test ``` ### 10.3 Wrong answer 问题 -- Gitee From 867c82cb17e2b48ba2faa78c0e883994f6e724fb Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Wed, 23 Jul 2025 02:37:19 +0000 Subject: [PATCH 6/7] =?UTF-8?q?!74=20[HCCL=E7=AB=9E=E8=B5=9B]=20=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E8=B5=9B=E9=A2=98=E8=B5=84=E6=96=99=EF=BC=8C=E5=AE=8C?= =?UTF-8?q?=E5=96=84=E6=80=A7=E8=83=BD=E8=AF=84=E6=B5=8B=E5=91=BD=E4=BB=A4?= =?UTF-8?q?=20*=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contest.md | 
25 ++++++++++++++++--- .../algorithm/testcase/testcase_all_reduce.cc | 6 ++--- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/contest.md b/contest.md index 6ee60ee..2179668 100644 --- a/contest.md +++ b/contest.md @@ -183,14 +183,31 @@ make MPI_HOME=/home/hccluser/mpich ASCEND_DIR=/home/hccluser/Ascend/ascend-toolk # 执行 HCCL Test # 512K -mpirun -n 4 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 4 +mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 4 -w 100 -n 500 # 2M -mpirun -n 4 ./bin/all_reduce_test -b 2m -e 2m -d fp32 -o sum -p 4 +mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 2m -e 2m -d fp32 -o sum -p 4 -w 100 -n 500 # 64M -mpirun -n 4 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 4 +mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 4 -w 100 -n 500 ``` -> 工具详细说明可参考:[昇腾文档中心-HCCL 性能测试工具使用指南](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha003/devaids/hccltool/HCCLpertest_16_0001.html) +各参数解释如下,详细说明可参考:[昇腾文档中心-HCCL 性能测试工具使用指南][9] + +[9]: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha003/devaids/hccltool/HCCLpertest_16_0001.html + +```bash +mpirun -np 4 \ # MPI 进程数量 + taskset -c 0,2,4,6 \ # 将 MPI 进程绑定到 0,2,4,6 CPU 核(设置 CPU 亲和性,避免操作系统调度干扰,降低波动) + ./bin/all_reduce_test \ # 可执行文件路径 + -b 512k \ # 测试数据大小的最小值,单位:Byte + -e 512k \ # 测试数据大小的最大值,单位:Byte + -d fp32 \ # 测试数据的数据类型 + -o sum \ # Reduce 操作类型 + -p 4 \ # NPU 数量 + -w 100 \ # 预热迭代次数,不计入性能统计 + -n 500 # 迭代次数 +``` + +> 【注意】赛事工作组评测选手代码时会执行 10 次上述命令,取带宽的均值作为性能得分 ## 8. 
提交代码 diff --git a/test/algorithm/testcase/testcase_all_reduce.cc b/test/algorithm/testcase/testcase_all_reduce.cc index 9f87d77..e3f10e3 100644 --- a/test/algorithm/testcase/testcase_all_reduce.cc +++ b/test/algorithm/testcase/testcase_all_reduce.cc @@ -1865,7 +1865,7 @@ TEST_F(AllReduceTest, allreduce_contest_test_910b_4g_int8) gen.GenTopoMeta(topoMeta, 1, 1, 8); // 4g + int8 - u64 size = 4 * 1024 * 1024 * 1024; + u64 size = 4LLU * 1024 * 1024 * 1024; auto dataType = CheckerDataType::DATA_TYPE_INT8; CheckerOpParam checkerOpParam; checkerOpParam.opType = CheckerOpType::ALLREDUCE; @@ -1995,7 +1995,7 @@ TEST_F(AllReduceTest, allreduce_contest_test_910b_4g_fp16) gen.GenTopoMeta(topoMeta, 1, 1, 8); // 4g + fp16 - u64 size = 4 * 1024 * 1024 * 1024; + u64 size = 4LLU * 1024 * 1024 * 1024; auto dataType = CheckerDataType::DATA_TYPE_FP16; CheckerOpParam checkerOpParam; checkerOpParam.opType = CheckerOpType::ALLREDUCE; @@ -2125,7 +2125,7 @@ TEST_F(AllReduceTest, allreduce_contest_test_910b_4g_fp32) gen.GenTopoMeta(topoMeta, 1, 1, 8); // 4g + fp32 - u64 size = 4 * 1024 * 1024 * 1024; + u64 size = 4LLU * 1024 * 1024 * 1024; auto dataType = CheckerDataType::DATA_TYPE_FP32; CheckerOpParam checkerOpParam; checkerOpParam.opType = CheckerOpType::ALLREDUCE; -- Gitee From 56b95d255f4ae41ecc931d6798fdd4788c936045 Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Thu, 24 Jul 2025 11:36:25 +0000 Subject: [PATCH 7/7] =?UTF-8?q?!75=20[HCCL=E7=AB=9E=E8=B5=9B]=20=E6=96=B0?= =?UTF-8?q?=E5=A2=9E=E8=AF=84=E6=B5=8B=E8=84=9A=E6=9C=AC=E3=80=81profiling?= =?UTF-8?q?=20=E6=96=87=E6=A1=A3=20*=20eval?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contest.md | 52 +++++++++++ eval.py | 264 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 316 insertions(+) create mode 100644 eval.py diff --git a/contest.md b/contest.md index 2179668..9a69ca4 100644 --- a/contest.md +++ b/contest.md @@ -151,6 +151,19 @@ cd 
/home/hccluser/cann-hccl/output ## 7. 测试代码 +> 【注意】选手可使用评测脚本进行验证: + +```bash +cd /home/hccluser/cann-hccl + +# 查看使用方法(脚本作用:解析测试工具输出的字符串) +python3 eval.py --help +# 执行算法分析器用例 +python3 eval.py --llt +# 执行 HCCLTest 工具用例(3 种数据量的用例各执行 10 次,每次执行间隔 5s) +python3 eval.py --hccltest -n 10 -i 5 +``` + ### 7.1 算法分析器验证 > 【注意】算法分析器能够在无昇腾 NPU 场景下离线测试算法逻辑,包括:死锁检测、资源校验、内存冲突校验等 @@ -209,6 +222,45 @@ mpirun -np 4 \ # MPI 进程数量 > 【注意】赛事工作组评测选手代码时会执行 10 次上述命令,取带宽的均值作为性能得分 +### 7.3 使用 Profiling 工具分析程序性能 + +> 【注意】开启 profiling 后性能会有所下降 + +1. 生成 profiling 数据 + +```bash +# 开启 Profiling 开关 +export HCCL_TEST_PROFILING=1 +export HCCL_TEST_PROFILING_PATH=/home/hccluser/prof + +# 执行 HCCLTest 用例 +# 会在 /home/hccluser/prof 目录下生成 4 个文件夹,对应每张 NPU 卡 +cd /home/hccluser/Ascend/ascend-toolkit/latest/tools/hccl_test +mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 4 -w 100 -n 500 + +# 导出 Profiling 结果 +cd /home/hccluser/prof +msprof --export=on --output=./ + +# 把每张 NPU 的 Profiling 结果复制到 timeline 目录,包含 4 个 json 文件 +mkdir -p timeline +cp -i PROF*/mindstudio_profiler_output/msprof*.json timeline/ +``` + +2. 复制 profiling 结果到本地 + +在选手本地 PC 终端中使用 `scp` 命令将 profiling 结果复制到本地桌面: + +```bash +scp -P PORT hccluser@IP:/home/hccluser/prof/timeline/*.json ~/Desktop +``` + +3. 使用 Chrome 浏览器打开 profiling 结果 + +浏览器打开:`chrome://tracing`,将 json 文件拖拽到浏览器中,即可打开 + +使用方法:通过键盘上的快捷键(w:放大,s:缩小,a:左移,d:右移)进行查看 + ## 8. 
提交代码 执行下列脚本,将选手代码拷贝到 `/result` 目录下 diff --git a/eval.py b/eval.py new file mode 100644 index 0000000..5b85020 --- /dev/null +++ b/eval.py @@ -0,0 +1,264 @@ +import argparse +import subprocess +import csv +import time +import math +import os +import logging +import re + +from typing import List, Optional, Union, Dict, Tuple + + +# 日志 +logger = logging.getLogger("hccl_eval_logger") +logger.setLevel(logging.DEBUG) +# 日志文件打印 +file_fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") +file_handler = logging.FileHandler("hccl_contest_eval.log") +file_handler.setLevel(logging.DEBUG) +file_handler.setFormatter(file_fmt) +# 控制台打印 +console_fmt = logging.Formatter("%(message)s") +console_handler = logging.StreamHandler() +console_handler.setLevel(logging.INFO) +console_handler.setFormatter(console_fmt) +logger.addHandler(file_handler) +logger.addHandler(console_handler) + +cmd_t = Union[List[str], str] + +ascend_home_path: str = os.getenv("ASCEND_HOME_PATH", default="") + + +def exec( + cmd: cmd_t, + /, + pwd: Optional[str] = None, + env: Optional[Dict[str, str]] = None, +) -> Tuple[int, str, str]: + """执行命令并获取输出""" + result = subprocess.run( + cmd, + cwd=pwd, + env=env, + shell=True, + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + return result.returncode, result.stdout, result.stderr + + +class HcclTestResult: + data_size: int = 0 # 数据大小(Bytes) + aveg_time: float = 0.0 # 平均时间(us) + alg_bandwidth: float = 0.0 # 算法带宽(GB/s) + check_result: str = "failed" # 检查结果 + + @property + def headers(self) -> List[str]: + return [ + "data_size(Bytes)", + "aveg_time(us)", + "alg_bandwidth(GB/s)", + "check_result", + ] + + def __str__(self): + return f"alg_bandwidth: {self.alg_bandwidth}, check_result: {self.check_result}" + + @classmethod + def parse(cls, output: str): + """ + 解析 HCCLTest 输出结果 + + 结果正确输出样例: + + $ mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 4 -w 100 -n 500 + the minbytes is 
524288, maxbytes is 524288, iters is 500, warmup_iters is 100 + data_size(Bytes): | aveg_time(us): | alg_bandwidth(GB/s): | check_result: + 524288 | 102.29 | 5.12530 | success + + 结果错误输出样例: + + $ mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 4 -w 100 -n 500 + the minbytes is 67108864, maxbytes is 67108864, iters is 500, warmup_iters is 100 + check buf[14783552] error, exp:8.000000, act:6.000000 + total err is 192 + rank id 0, check result failed, 67108864 | 3665.90 | 18.30623 | failed + data_size(Bytes): | aveg_time(us): | alg_bandwidth(GB/s): | check_result: + 67108864 | 3665.90 | 18.30623 | failed + """ + + headers = [ + "data_size(Bytes)", + "aveg_time(us)", + "alg_bandwidth(GB/s)", + "check_result", + ] + + lines = output.splitlines() + test_rst = HcclTestResult() + + def parse_line(line: str) -> HcclTestResult: + parts = [p.strip() for p in line.split("|")] + try: + rst = HcclTestResult() + rst.data_size = int(parts[0]) + rst.aveg_time = float(parts[1]) + rst.alg_bandwidth = float(parts[2]) + rst.check_result = parts[3] + except (ValueError, IndexError) as e: + logger.error("Failed to parse: %s", line) + logger.exception("Error: %s", e) + raise e + return rst + + for idx, line in enumerate(lines): + # 标题行 + if all(header in line for header in headers): + # 解析标题行的下一行 + assert idx < len(line) + test_rst = parse_line(lines[idx + 1]) + + # 结果错误,带宽设为 0,不得分 + failed_pos = line.find("check result failed") + if failed_pos >= 0: + logger.debug("Check result failed") + # 解析错误行结果 + last_comma_pos = line.find(",", failed_pos) + test_rst = parse_line(line[last_comma_pos + 1 :]) + test_rst.alg_bandwidth = 0.0 + return test_rst + + return test_rst + + +def eval_hccl_test( + *, + npus: int = 4, + iters: int = 10, + interval: int = 5, +): + """ + 评测 HCCLTest + + 分别执行 3 种数据量 10 次,取带宽均值: + mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 4 -w 100 -n 500 + mpirun -np 4 taskset -c 0,2,4,6 
./bin/all_reduce_test -b 2m -e 2m -d fp32 -o sum -p 4 -w 100 -n 500 + mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 4 -w 100 -n 500 + """ + + data_sizes = ["512k", "2m", "64m"] + pwd = os.path.join(ascend_home_path, "tools", "hccl_test") + + # 3 种数据量 + for size in data_sizes: + cores = ",".join(str(2 * i) for i in range(npus)) + cmd = f"mpirun -np {npus} taskset -c {cores} ./bin/all_reduce_test -b {size} -e {size} -d fp32 -o sum -p {npus} -w 100 -n 500" + + # 跑 10 次测试 + results: List[HcclTestResult] = [] + for i in range(iters): + logger.debug("[%s][%d/%d] Evaluating with cmd: %s", size, i + 1, iters, cmd) + # 执行命令 + _, output, _ = exec(cmd, pwd=pwd) + logger.debug("[%s][%d/%d] Output:\n%s", size, i + 1, iters, output) + # 解析输出 + rst = HcclTestResult.parse(output) + results.append(rst) + logger.info("[%s][%d/%d] %s", size, i + 1, iters, rst) + + if i < iters - 1 and interval > 0: + time.sleep(interval) + + total_bw = math.fsum(rst.alg_bandwidth for rst in results) + aveg_bw = total_bw / iters + logger.warning("Data size: %s, average bandwidth: %f(GB/s)", size, aveg_bw) + + +def eval_gtest(): + """ + 评测算法分析器用例,执行 5 种数据量、3 种数据类型共 15 个用例 + + 正确结果样例: + + [----------] 15 tests from AllReduceTest (503 ms total) + + [----------] Global test environment tear-down + [==========] 15 tests from 1 test suite ran. (503 ms total) + [ PASSED ] 15 tests. + + 错误结果样例: + + [----------] 15 tests from AllReduceTest (233 ms total) + + [----------] Global test environment tear-down + [==========] 15 tests from 1 test suite ran. (234 ms total) + [ PASSED ] 14 tests. 
+ [ FAILED ] 1 tests, listed below: + [ FAILED ] AllReduceTest.allreduce_contest_test_910b_512k_int8 + + 1 FAILED TESTS + """ + ld_library_path = os.getenv("LD_LIBRARY_PATH", "") + build_test_path = f"/home/hccluser/cann-hccl/build/test" + env = None + if build_test_path not in ld_library_path: + env = {"LD_LIBRARY_PATH": f"{build_test_path}:{ld_library_path}"} + + cmd = "./open_hccl_test" + logger.debug("Evaluating with cmd: %s", cmd) + _, output, _ = exec(cmd, env=env, pwd=build_test_path) + logger.debug("Output:\n%s", output) + + # 通过数量 + passed_match = re.search(r"\[ PASSED \] (\d+) tests?\.", output) + passed_count = int(passed_match.group(1)) if passed_match else 0 + + # 失败数量 + failed_match = re.search(r"\[ FAILED \] (\d+) tests?", output) + failed_count = int(failed_match.group(1)) if failed_match else 0 + + # 失败用例列表 + failed_tests = [] + if failed_count > 0: + failed_tests = set(re.findall(r"\[ FAILED \] (\w+\.\w+)", output)) + + logger.info("[ PASSED ] %d tests.", passed_count) + if failed_count > 0: + logger.info("[ FAILED ] %d tests, listed below:", failed_count) + for failed_test in failed_tests: + logger.info("[ FAILED ] %s", failed_test) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Evaluation tool") + parser.add_argument("--llt", action="store_true", help="LLT tests") + parser.add_argument("--hccltest", action="store_true", help="HCCLTest tests") + parser.add_argument("-p", "--npus", type=int, default=4, help="HCCLTest tests - NPU count") + parser.add_argument("-n", "--iters", type=int, default=10, help="HCCLTest tests - iterations") + parser.add_argument("-i", "--interval", type=int, default=5, help="HCCLTest tests - interval") + return parser.parse_args() + + +def main(): + args = parse_args() + + if args.hccltest: + logger.info("Evaluating by HcclTest") + eval_hccl_test( + npus=args.npus, + iters=args.iters, + interval=args.interval, + ) + + if args.llt: + logger.info("Evaluating LLT tests") + eval_gtest() + + +if __name__ 
== "__main__": + main() -- Gitee