diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..4d70f070b7a9bbcc528b2ad811d83412765de22a --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.vscode +.idea +build +output +__pycache__ +*.log diff --git a/CMakeLists.txt b/CMakeLists.txt index c45e7ae54859247e2cfd1b5fe90bfa8f0df662d1..e1fe63f618e1a608d63220f1ed14c4f66ab15a26 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,8 @@ project(hccl) option(BUILD_OPEN_PROJECT "Build open hccl project." ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + if(BUILD_OPEN_PROJECT) include(cmake/config.cmake) add_subdirectory(src/domain/collective_communication) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..f853e97d7b90c9468c71bd48c99c1950b7821892 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,141 @@ +# NOTE: Building this image requires docker version >= 18.0 + +ARG BASE_IMAGE=ubuntu:22.04 + +FROM ${BASE_IMAGE} AS official + +ARG TARGETPLATFORM=linux/arm64 + +ENV USER_PASSWD=change_me + +SHELL [ "/bin/bash", "-c" ] + +RUN cp /etc/apt/sources.list /etc/apt/sources.list.backup && \ + case ${TARGETPLATFORM} in \ + "linux/arm64") sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list ;; \ + *) sed -i 's|archive.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list ;; \ + esac + +RUN apt-get update \ + && apt-get install --no-install-recommends -y \ + apt-transport-https \ + ca-certificates \ + build-essential \ + bash \ + curl \ + git \ + wget \ + gcc \ + g++ \ + make \ + cmake \ + python3 \ + python3-pip \ + gdb \ + vim \ + file \ + man \ + sudo \ + zlib1g \ + openssl \ + unzip \ + pciutils \ + net-tools \ + gfortran \ + patchelf \ + libblas3 \ + libblas-dev \ + libssl-dev \ + zlib1g-dev \ + libncurses5-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + libffi-dev \ + libnss3-dev \ + libgdbm-dev \ + liblzma-dev \ + libev-dev \ + openssh-server \ + && apt-get clean \ + && rm 
-rf /var/lib/apt/lists/* \ + && rm -rf /var/tmp/* \ + && rm -rf /tmp/* + +# 创建 hccl 用户 +RUN groupadd -g 1000 hcclgroup && \ + useradd -u 1000 -g hcclgroup -ms /bin/bash hccluser && \ + usermod -aG sudo hccluser && \ + echo "hccluser ALL=(ALL) NOPASSWD:/usr/bin/apt-get,/usr/bin/apt" >> /etc/sudoers + +USER hccluser + +RUN pip install --no-cache-dir -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple \ + attrs cython numpy==1.24.0 decorator sympy cffi pyyaml pathlib2 \ + psutil protobuf==3.20 scipy requests absl-py + +# 安装 CANN 8.2.RC1.alpha003 +RUN case ${TARGETPLATFORM} in \ + "linux/arm64") ARCH=aarch64 ;; \ + *) ARCH=x86_64 ;; \ + esac && \ + CANN_TOOLKIT_URL="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-toolkit_8.2.RC1.alpha003_linux-${ARCH}.run" && \ + curl -fsSL -o /tmp/Ascend-cann-toolkit.run -O "${CANN_TOOLKIT_URL}" && \ + CANN_COMMUNITY_SDK_URL="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-communitysdk_8.2.RC1.alpha003_linux-${ARCH}.run" && \ + curl -fsSL -o /tmp/Ascend-cann-communitysdk.run -O "${CANN_COMMUNITY_SDK_URL}" + +RUN chmod +x /tmp/Ascend-cann-toolkit.run && \ + /tmp/Ascend-cann-toolkit.run --quiet --install && \ + rm /tmp/Ascend-cann-toolkit.run + +RUN chmod +x /tmp/Ascend-cann-communitysdk.run && \ + /tmp/Ascend-cann-communitysdk.run --quiet --full && \ + rm /tmp/Ascend-cann-communitysdk.run + +# 安装 HCCL 依赖 +RUN curl -fsSL -o /tmp/include.zip -O https://github.com/nlohmann/json/releases/download/v3.11.2/include.zip && \ + unzip -d ${HOME}/nlohmann_json /tmp/include.zip && \ + rm /tmp/include.zip + +RUN curl -fsSL -o /tmp/mpich.tar.gz -O https://www.mpich.org/static/downloads/3.2.1/mpich-3.2.1.tar.gz && \ + tar -zxf /tmp/mpich.tar.gz -C /tmp && \ + cd /tmp/mpich-3.2.1 && \ + ./configure --disable-fortran --prefix=${HOME}/mpich --with-device=ch3:nemesis && \ + make && make install && \ + rm -r /tmp/mpich-3.2.1 && \ + rm 
/tmp/mpich.tar.gz + +# 设置环境变量 +RUN \ + # NPU 驱动环境变量 + echo 'export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:${LD_LIBRARY_PATH}' >> ${HOME}/.bashrc && \ + echo 'export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/common:${LD_LIBRARY_PATH}' >> ${HOME}/.bashrc && \ + # CANN Toolkit 环境变量 + echo 'source ${HOME}/Ascend/ascend-toolkit/set_env.sh' >> ${HOME}/.bashrc && \ + # MPICH 环境变量 + echo 'export PATH=${HOME}/mpich/bin:${PATH}' >> ${HOME}/.bashrc && \ + echo 'export LD_LIBRARY_PATH=${HOME}/mpich/lib:${LD_LIBRARY_PATH}' >> ${HOME}/.bashrc + +USER root + +# SSH 配置 +RUN echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \ + echo "PermitRootLogin no" >> /etc/ssh/sshd_config && \ + echo "PermitUserEnvironment yes" >> /etc/ssh/sshd_config && \ + echo "ClientAliveInterval 60" >> /etc/ssh/sshd_config && \ + echo "ClientAliveCountMax 3" >> /etc/ssh/sshd_config && \ + echo "AllowUsers hccluser" >> /etc/ssh/sshd_config + +# SSH 启动脚本 +RUN echo '#!/bin/bash' > /start.sh && \ + echo 'if [ -n "${USER_PASSWD}" ]; then' >> /start.sh && \ + echo ' echo "hccluser:${USER_PASSWD}" | chpasswd' >> /start.sh && \ + echo 'fi' >> /start.sh && \ + echo 'mkdir -p /var/run/sshd' >> /start.sh && \ + echo 'ssh-keygen -A' >> /start.sh && \ + echo '/usr/sbin/sshd -D -e' >> /start.sh && \ + chmod +x /start.sh + +EXPOSE 22 + +CMD [ "/start.sh" ] diff --git a/README.md b/README.md index 6f5894402c8cb296cf9c92b00523b27f93454b52..04bb336e9e2ec9dd42354e2eb6df11f21ddbbc98 100644 --- a/README.md +++ b/README.md @@ -232,7 +232,7 @@ HCCL软件包安装完成后,开发者可通过HCCL Test工具进行集合通 ## 相关文档 -HCCL提供了使用指南、环境变量参考、基于本源码仓进行定制的开发指南、算法分析工具使用指导等,详细可参见[HCCL资料书架总览](https://gitee.com/ascend/cann-hccl/wikis/HCCL%E8%B5%84%E6%96%99%E4%B9%A6%E6%9E%B6%E6%80%BB%E8%A7%88)。 +HCCL提供了用户指南、环境变量参考、基于源码仓进行算法与算子定制的开发指南等,详细可参见[HCCL资料书架总览](https://gitee.com/ascend/cann-hccl/wikis/HCCL%E8%B5%84%E6%96%99%E4%B9%A6%E6%9E%B6%E6%80%BB%E8%A7%88)。 ## 贡献指南 diff --git a/contest.md b/contest.md new file mode 100644 index 
0000000000000000000000000000000000000000..3135574fcf199f3e028f91fe4d4a5dd6c48d1b06 --- /dev/null +++ b/contest.md @@ -0,0 +1,364 @@ +# HCCL 通信库创新大赛操作指导 + +## 0. 赛前须知 + +### 0.0 决赛题目 + +![image](img/final_round_question.jpg) + +- 由于0-1卡间已断链,则无需返回此条链路间的建链请求 + +### 0.1 技能要求 + +1. 熟悉 C++14 编程语言 +2. 了解 GDB、LLDB 等调试工具 +3. 了解 VSCode、CLion 等 IDE 开发工具 +4. 了解 AllReduce 等集合通信原语 + +### 0.2 资料 + +HCCL 资料: + +- [昇腾社区官网][1] +- [HCCL主页——昇腾社区][2] +- [HCCL概述——昇腾社区][3] +- [集合通信原语——昇腾社区][4] +- [HCCL代码仓][5] +- [HCCL Wiki][6] + +定制算法开发指南: + +1. [HCCL源码定制开发指南][7] +2. [AllGather 定制算法实现][8] +3. [HCCL 通信库创新大赛参赛 FAQ](./faq.md) + +[1]: https://www.hiascend.com +[2]: https://www.hiascend.com/hccl +[3]: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha003/hccl/hcclug/hcclug_000001.html +[4]: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha003/hccl/hcclug/hcclug_000004.html +[5]: https://gitee.com/ascend/cann-hccl +[6]: https://gitee.com/ascend/cann-hccl/wikis +[7]: https://gitee.com/ascend/cann-hccl/blob/master/docs/hccl_customized_dev/README.md +[8]: https://gitee.com/ascend/cann-hccl/pulls/64 + +### 0.3 评分标准 + +组委会将从功能、性能、代码风格 3 个维度对参赛代码进行综合评测,评测公式: + +- 15 分功能分:15 个算法分析器用例,每个 1 分,通过得 1 分,不通过得 0 分 + + > 5 种数据量:1k/1m/64m/1g/4g,3 种数据格式:int8/fp16/fp32 + +- 75 分性能分:3 个 HCCLTest 用例,每个 25 分,不通过得 0 分,通过则按照性能计分,性能最佳得满分,按照排名依次递减 + + > 3 种数据量:1k/1m/1g,1 种数据格式:fp32 + > + > 性能标准:基于 HCCLTest 工具测试的带宽使用量(字段:`alg_bandwidth(GB/s)`)作为评判标准,数值越高越好 + +- 10 分主观分:代码风格 + +> 【注意】验证方法详见 [算法分析器验证](#71-算法分析器验证)、[HCCLTest工具验证](#72-hccltest-工具验证) + +## 1. 登录环境 + +选手开发环境信息将通过邮件的方式发送至队长邮箱,队伍成员可通过 SSH 进入选手开发环境: + +```bash +ssh root@ip -p port +``` + +## 2. 
环境目录 + +选手开发环境是运行在物理机上的 Docker 容器,目录结构如下: + +``` +|-- /dev +| |-- davinci0 # NPU1 +| |-- davinci1 # NPU2 +| |-- davinci2 # NPU3 +| `-- davinci3 # NPU4 +| |-- davinci4 # NPU5 +| |-- davinci5 # NPU6 +| |-- davinci6 # NPU7 +| `-- davinci7 # NPU8 +|-- /usr/local/Ascend +| `-- driver # NPU 驱动安装目录 +|-- /home/hccluser/Ascend +| |-- ascend-toolkit # CANN Toolkit 安装目录 +| `-- ascend_cann_install.info # CANN 安装信息 +`-- /home/hccluser + |-- cann-hccl # HCCL 代码仓(选手需自行下载) + |-- mpich # MPICH 安装目录 + `-- nlohmann_json # nlohmann json inclue 目录 +``` + +## 3. 软件版本 + +> 【注意】 +> +> 1. 选手开发环境中已安装下列软件依赖 +> 2. 最终评测环境的软件版本与选手开发环境一致 + +- gcc 11.4.0 +- g++ 11.4.0 +- make 4.3 +- cmake 3.22.1 +- mpich 3.2.1 +- CANN Toolkit 8.2.RC1.alpha003 +- CANN Community SDK 8.2.RC1.alpha003 + +## 4. 代码开发 + +### 4.1 下载代码 + +> 【注意】选手只需下载 [ascend/cann-hccl](https://gitee.com/ascend/cann-hccl.git) 代码仓即可,编译运行所需全部依赖已提前安装 + +```bash +cd /home/hccluser + +git clone https://gitee.com/ascend/cann-hccl.git -b r1.5.2 +``` + +### 4.2 IDE 远程开发 + +推荐选手基于 VSCode、CLion 等 IDE,通过 SSH 连接开发环境进行远程开发,参考文档: + +- [VSCode 使用 SSH 远程开发](https://code.visualstudio.com/docs/remote/ssh) +- [CLion 使用 SSH 远程开发](https://www.jetbrains.com/help/clion/remote-development.html) + +### 4.3 定制算法开发 + +在 HCCL 软件架构中,`Operator` 负责算法选择,`Exeutor` 负责算法编排。为简化流程,选手只需实现以下内容: + +1. [custom_all_reduce_operator.cc](src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc) 中编写算法选择逻辑 +2. [coll_custom_small_all_reduce_mesh_executor.cc](src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.cc) 中编写小数据量(1K)场景的 AllReduce 算法 +3. [coll_custom_medium_all_reduce_mesh_executor.cc](src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.cc) 中编写中等数据量(1M)场景的 AllReduce 算法 +4. 
[coll_custom_huge_all_reduce_mesh_executor.cc](src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.cc) 中编写大数据量(1G)场景的 AllReduce 算法 + +> 【注意】上述代码文件中,选手需要实现的内容已在代码注释中标明 + +## 5. 编译代码 + +编译所需的依赖项均已安装,在 HCCL 代码仓执行编译即可: + +```bash +cd /home/hccluser/cann-hccl + +bash build.sh --nlohmann_path /home/hccluser/nlohmann_json/include +``` + +## 6. 安装编译结果 + +编译生成的 HCCL 软件包在 `/home/hccluser/cann-hccl/output` 目录下: + +```bash +cd /home/hccluser/cann-hccl/output + +./CANN-hccl_alg-8.2.t12.0.b077-linux.aarch64.run +``` + +安装完成后,用户编译生成的 HCCL 软件包会替换已安装 CANN 开发套件包中的 HCCL 相关软件 + +## 7. 测试代码 + +> 【注意】选手可使用评测脚本进行验证: + +```bash +cd /home/hccluser/cann-hccl + +# 查看使用方法(脚本作用:解析测试工具输出的字符串) +python3 eval.py --help +# 执行算法分析器用例 +python3 eval.py --llt +# 执行 HCCLTest 工具用例(3 种数据量的用例各执行 10 次,每次执行间隔 5s) +python3 eval.py --hccltest -n 10 -i 5 +``` + +### 7.1 算法分析器验证 + +> 【注意】算法分析器能够在无昇腾 NPU 场景下离线测试算法逻辑,包括:死锁检测、资源校验、内存冲突校验等 + +编译并执行算法分析器用例: + +```bash +cd /home/hccluser/cann-hccl + +# 编译测试用例,并自动执行 +bash build.sh --nlohmann_path /home/hccluser/nlohmann_json/include --test --open_hccl_test + +# 手动执行测试用例 +export BUILD_TEST_DIR="/home/hccluser/cann-hccl/build/test/" +export LD_LIBRARY_PATH="${BUILD_TEST_DIR}:${LD_LIBRARY_PATH}" +./build/test/open_hccl_test +``` + +### 7.2 HCCLTest 工具验证 + +> 【注意】性能测试场景可使用 HCCL Test 工具进行验证,该工具基于真实 NPU 设备进行功能和性能测试 + +基于 HCCL Test 工具在 NPU 设备上执行验证: + +```bash +cd /home/hccluser/Ascend/ascend-toolkit/latest/tools/hccl_test + +# 编译 HCCL 性能测试工具 +make MPI_HOME=/home/hccluser/mpich ASCEND_DIR=/home/hccluser/Ascend/ascend-toolkit/latest + +# 执行 HCCL Test +# 1K +mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 1k -e 1k -d fp32 -o sum -p 4 -w 100 -n 500 +# 1M +mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 1m -e 1m -d fp32 -o sum -p 4 -w 100 -n 500 +# 1G +mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 1g -e 1g -d fp32 -o sum -p 4 -w 100 -n 500 +``` + +各参数解释如下,详细说明可参考:[昇腾文档中心-HCCL 
性能测试工具使用指南][9] + +[9]: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha003/devaids/hccltool/HCCLpertest_16_0001.html + +```bash +mpirun -np 8 \ # MPI 进程数量 + taskset -c 0,2,4,6,8,10,12,14\ # 将 MPI 进程绑定到 0,2,4,6,8,10,12,14 CPU 核(设置 CPU 亲和性,避免操作系统调度干扰,降低波动) + ./bin/all_reduce_test \ # 可执行文件路径 + -b 1k \ # 测试数据大小的最小值,单位:Byte + -e 1k \ # 测试数据大小的最大值,单位:Byte + -d fp32 \ # 测试数据的数据类型 + -o sum \ # Reduce 操作类型 + -p 8 \ # NPU 数量 + -w 100 \ # 预热迭代次数,不计入性能统计 + -n 500 # 迭代次数 +``` + +> 【注意】赛事工作组评测选手代码时会执行 10 次上述命令,取带宽的均值作为性能得分 + +### 7.3 使用 Profiling 工具分析程序性能 + +> 【注意】开启 profiling 后性能会有所下降 + +1. 生成 profiling 数据 + +```bash +# 开启 Profiling 开关 +export HCCL_TEST_PROFILING=1 +export HCCL_TEST_PROFILING_PATH=/home/hccluser/prof + +# 执行 HCCLTest 用例 +# 会在 /home/hccluser/prof 目录下生成 4 个文件夹,对应每张 NPU 卡 +cd /home/hccluser/Ascend/ascend-toolkit/latest/tools/hccl_test +mpirun -np 8 taskset -c 0,2,4,6,8,10,12,14 ./bin/all_reduce_test -b 1k -e 1k -d fp32 -o sum -p 8 -w 100 -n 500 + +# 导出 Profiling 结果 +cd /home/hccluser/prof +msprof --export=on --output=./ + +# 把每张 NPU 的 Profiling 结果复制到 timeline 目录,包含 8 个 json 文件 +mkdir -p timeline +cp -i PROF*/mindstudio_profiler_output/msprof*.json timeline/ +``` + +2. 复制 profiling 结果到本地 + +在选手本地 PC 终端中使用 `scp` 命令将 profiling 结果复制到本地桌面: + +```bash +scp -P PORT hccluser@IP:/home/hccluser/prof/timeline/*.json ~/Desktop +``` + +3. 使用 Chrome 浏览器打开 profiling 结果 + +浏览器打开:`chrome://tracing`,将 json 文件拖拽到浏览器中,即可打开 + +使用方法:通过键盘上的快捷键(w:放大,s:缩小,a:左移,d:右移)进行查看 + +## 8. 提交代码 + +执行下列脚本,将选手代码拷贝到 `/result` 目录下 + +```bash +cd /home/hccluser/cann-hccl + +bash submit.sh +``` + +该脚本将选手编写的定制算法文件拷贝至 `/result` 目录下,用于后续评测: + +1. `custom_all_reduce_operator.h` +2. `custom_all_reduce_operator.cc` +3. `coll_custom_small_all_reduce_mesh_executor.h` +4. `coll_custom_small_all_reduce_mesh_executor.cc` +5. `coll_custom_medium_all_reduce_mesh_executor.h` +6. `coll_custom_medium_all_reduce_mesh_executor.cc` +7. `coll_custom_huge_all_reduce_mesh_executor.h` +8. 
`coll_custom_huge_all_reduce_mesh_executor.cc` + +## 9. 结果公布 + +赛程结束后统一公布成绩 + +> 【注意】选手开发环境与最终评测环境完全一致 + +## 10. 调试代码 + +### 10.1 日志 + +#### 10.1.1 日志打印 + +选手可通过调用日志宏保存日志到文件中,便于调试: + +```c++ +HCCL_DEBUG("[HCCL_CONTEST] Orchestrate start"); +HCCL_INFO("[HCCL_CONTEST] Total count: %u", totalCount); +HCCL_WARNING("[HCCL_CONTEST] Cost: %u ms", cost); +``` + +#### 10.1.2 日志设置 + +1. 日志级别 + +HCCL 日志级别默认为 Error,下面通过环境变量设置为 Info 级别: + +```bash +export ASCEND_GLOBAL_LOG_LEVEL=1 # 0: debug, 1: info, 2: warn, 3: error +``` + +2. 日志目录 + +设置日志存储目录: + +```bash +export ASCEND_PROCESS_LOG_PATH=/home/hccluser/log # 默认为:$HOME/ascend/log +``` + +设置日志输出到控制台: + +```bash +export ASCEND_SLOG_PRINT_TO_STDOUT=1 +``` + +3. 日志数量 + +设置每个进程最多保留的日志数量为较大数字,以防丢失: + +```bash +export ASCEND_HOST_LOG_FILE_NUM=1000 +``` + +### 10.2 Core dump 问题 + +使用 gdb 调试: + +> 【注意】编译算法分析器依赖的 HCCL 代码时默认已开启 `-O0 -g` 编译选项 + +```bash +cd /home/hccluser/cann-hccl + +# 基于算法分析器调试 HCCL 定制算法 +export BUILD_TEST_DIR="/home/hccluser/cann-hccl/build/test/" +export LD_LIBRARY_PATH="${BUILD_TEST_DIR}:${LD_LIBRARY_PATH}" +gdb --args ./build/test/open_hccl_test +``` + +### 10.3 Wrong answer 问题 + +请选手仔细排查定制算法是否符合 AllReduce 算法逻辑 diff --git a/eval.py b/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..5b85020b986e6e1624a5916caadf597c14061d3a --- /dev/null +++ b/eval.py @@ -0,0 +1,264 @@ +import argparse +import subprocess +import csv +import time +import math +import os +import logging +import re + +from typing import List, Optional, Union, Dict, Tuple + + +# 日志 +logger = logging.getLogger("hccl_eval_logger") +logger.setLevel(logging.DEBUG) +# 日志文件打印 +file_fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") +file_handler = logging.FileHandler("hccl_contest_eval.log") +file_handler.setLevel(logging.DEBUG) +file_handler.setFormatter(file_fmt) +# 控制台打印 +console_fmt = logging.Formatter("%(message)s") +console_handler = logging.StreamHandler() +console_handler.setLevel(logging.INFO) 
+console_handler.setFormatter(console_fmt) +logger.addHandler(file_handler) +logger.addHandler(console_handler) + +cmd_t = Union[List[str], str] + +ascend_home_path: str = os.getenv("ASCEND_HOME_PATH", default="") + + +def exec( + cmd: cmd_t, + /, + pwd: Optional[str] = None, + env: Optional[Dict[str, str]] = None, +) -> Tuple[int, str, str]: + """执行命令并获取输出""" + result = subprocess.run( + cmd, + cwd=pwd, + env=env, + shell=True, + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + return result.returncode, result.stdout, result.stderr + + +class HcclTestResult: + data_size: int = 0 # 数据大小(Bytes) + aveg_time: float = 0.0 # 平均时间(us) + alg_bandwidth: float = 0.0 # 算法带宽(GB/s) + check_result: str = "failed" # 检查结果 + + @property + def headers(self) -> List[str]: + return [ + "data_size(Bytes)", + "aveg_time(us)", + "alg_bandwidth(GB/s)", + "check_result", + ] + + def __str__(self): + return f"alg_bandwidth: {self.alg_bandwidth}, check_result: {self.check_result}" + + @classmethod + def parse(cls, output: str): + """ + 解析 HCCLTest 输出结果 + + 结果正确输出样例: + + $ mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 4 -w 100 -n 500 + the minbytes is 524288, maxbytes is 524288, iters is 500, warmup_iters is 100 + data_size(Bytes): | aveg_time(us): | alg_bandwidth(GB/s): | check_result: + 524288 | 102.29 | 5.12530 | success + + 结果错误输出样例: + + $ mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 4 -w 100 -n 500 + the minbytes is 67108864, maxbytes is 67108864, iters is 500, warmup_iters is 100 + check buf[14783552] error, exp:8.000000, act:6.000000 + total err is 192 + rank id 0, check result failed, 67108864 | 3665.90 | 18.30623 | failed + data_size(Bytes): | aveg_time(us): | alg_bandwidth(GB/s): | check_result: + 67108864 | 3665.90 | 18.30623 | failed + """ + + headers = [ + "data_size(Bytes)", + "aveg_time(us)", + "alg_bandwidth(GB/s)", + "check_result", + ] + + lines = 
output.splitlines() + test_rst = HcclTestResult() + + def parse_line(line: str) -> HcclTestResult: + parts = [p.strip() for p in line.split("|")] + try: + rst = HcclTestResult() + rst.data_size = int(parts[0]) + rst.aveg_time = float(parts[1]) + rst.alg_bandwidth = float(parts[2]) + rst.check_result = parts[3] + except (ValueError, IndexError) as e: + logger.error("Failed to parse: %s", line) + logger.exception("Error: %s", e) + raise e + return rst + + for idx, line in enumerate(lines): + # 标题行 + if all(header in line for header in headers): + # 解析标题行的下一行 + assert idx + 1 < len(lines) + test_rst = parse_line(lines[idx + 1]) + + # 结果错误,带宽设为 0,不得分 + failed_pos = line.find("check result failed") + if failed_pos >= 0: + logger.debug("Check result failed") + # 解析错误行结果 + last_comma_pos = line.find(",", failed_pos) + test_rst = parse_line(line[last_comma_pos + 1 :]) + test_rst.alg_bandwidth = 0.0 + return test_rst + + return test_rst + + +def eval_hccl_test( + *, + npus: int = 4, + iters: int = 10, + interval: int = 5, +): + """ + 评测 HCCLTest + + 分别执行 3 种数据量 10 次,取带宽均值: + mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 512k -e 512k -d fp32 -o sum -p 4 -w 100 -n 500 + mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 2m -e 2m -d fp32 -o sum -p 4 -w 100 -n 500 + mpirun -np 4 taskset -c 0,2,4,6 ./bin/all_reduce_test -b 64m -e 64m -d fp32 -o sum -p 4 -w 100 -n 500 + """ + + data_sizes = ["512k", "2m", "64m"] + pwd = os.path.join(ascend_home_path, "tools", "hccl_test") + + # 3 种数据量 + for size in data_sizes: + cores = ",".join(str(2 * i) for i in range(npus)) + cmd = f"mpirun -np {npus} taskset -c {cores} ./bin/all_reduce_test -b {size} -e {size} -d fp32 -o sum -p {npus} -w 100 -n 500" + + # 跑 10 次测试 + results: List[HcclTestResult] = [] + for i in range(iters): + logger.debug("[%s][%d/%d] Evaluating with cmd: %s", size, i + 1, iters, cmd) + # 执行命令 + _, output, _ = exec(cmd, pwd=pwd) + logger.debug("[%s][%d/%d] Output:\n%s", size, i + 1, iters, output) + # 解析输出 + rst 
= HcclTestResult.parse(output) + results.append(rst) + logger.info("[%s][%d/%d] %s", size, i + 1, iters, rst) + + if i < iters - 1 and interval > 0: + time.sleep(interval) + + total_bw = math.fsum(rst.alg_bandwidth for rst in results) + aveg_bw = total_bw / iters + logger.warning("Data size: %s, average bandwidth: %f(GB/s)", size, aveg_bw) + + +def eval_gtest(): + """ + 评测算法分析器用例,执行 5 种数据量、3 种数据类型共 15 个用例 + + 正确结果样例: + + [----------] 15 tests from AllReduceTest (503 ms total) + + [----------] Global test environment tear-down + [==========] 15 tests from 1 test suite ran. (503 ms total) + [ PASSED ] 15 tests. + + 错误结果样例: + + [----------] 15 tests from AllReduceTest (233 ms total) + + [----------] Global test environment tear-down + [==========] 15 tests from 1 test suite ran. (234 ms total) + [ PASSED ] 14 tests. + [ FAILED ] 1 tests, listed below: + [ FAILED ] AllReduceTest.allreduce_contest_test_910b_512k_int8 + + 1 FAILED TESTS + """ + ld_library_path = os.getenv("LD_LIBRARY_PATH", "") + build_test_path = f"/home/hccluser/cann-hccl/build/test" + env = None + if build_test_path not in ld_library_path: + env = {"LD_LIBRARY_PATH": f"{build_test_path}:{ld_library_path}"} + + cmd = "./open_hccl_test" + logger.debug("Evaluating with cmd: %s", cmd) + _, output, _ = exec(cmd, env=env, pwd=build_test_path) + logger.debug("Output:\n%s", output) + + # 通过数量 + passed_match = re.search(r"\[ PASSED \] (\d+) tests?\.", output) + passed_count = int(passed_match.group(1)) if passed_match else 0 + + # 失败数量 + failed_match = re.search(r"\[ FAILED \] (\d+) tests?", output) + failed_count = int(failed_match.group(1)) if failed_match else 0 + + # 失败用例列表 + failed_tests = [] + if failed_count > 0: + failed_tests = set(re.findall(r"\[ FAILED \] (\w+\.\w+)", output)) + + logger.info("[ PASSED ] %d tests.", passed_count) + if failed_count > 0: + logger.info("[ FAILED ] %d tests, listed below:", failed_count) + for failed_test in failed_tests: + logger.info("[ FAILED ] %s", failed_test) + + 
+def parse_args(): + parser = argparse.ArgumentParser(description="Evaluation tool") + parser.add_argument("--llt", action="store_true", help="LLT tests") + parser.add_argument("--hccltest", action="store_true", help="HCCLTest tests") + parser.add_argument("-p", "--npus", type=int, default=4, help="HCCLTest tests - NPU count") + parser.add_argument("-n", "--iters", type=int, default=10, help="HCCLTest tests - iterations") + parser.add_argument("-i", "--interval", type=int, default=5, help="HCCLTest tests - interval") + return parser.parse_args() + + +def main(): + args = parse_args() + + if args.hccltest: + logger.info("Evaluating by HcclTest") + eval_hccl_test( + npus=args.npus, + iters=args.iters, + interval=args.interval, + ) + + if args.llt: + logger.info("Evaluating LLT tests") + eval_gtest() + + +if __name__ == "__main__": + main() diff --git a/faq.md b/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..f80fdef4e87ef4b0c2b0b35d1e0ba803921da47b --- /dev/null +++ b/faq.md @@ -0,0 +1,118 @@ +# 一、开发环境 + +#### 1. 在我的开发环境中还需要自行安装工具包吗? + +选手环境已提前安装所有软件依赖,选手只需下载 cann-hccl 源码即可,下载方法详见 [参赛指导](./contest.md)。 + +#### 2. 我的开发环境是否有可能挂掉?代码会丢失吗? + +有可能,但概率非常小。开发环境遇到任何问题请及时寻求赛事工作组人员帮助。 + +#### 3. 开发环境中的 NPU 设备会存在多支队伍共用导致资源竞争吗? + +不会,每个队伍的开发环境中的 NPU 设备都是独占的,不会存在冲突。 + +# 二、算法开发 + +#### 1. execMem结构体和param结构体中都有count、inputPtr、ouputPtr变量,有什么区别? + +* param中保存的是本次调用算子的数据,count是本次调用算子在一个rank上总共要存放的数据数量,inputPtr和outputPtr是起始输入输出内存块的指针。 +* 由于CCL_Output buffer的大小有限,需要循环多次中转UserInput数据。每次循环的起始位置都做了一个CCL_Output大小的偏移量。所以execMem维护的是当前循环已经经过偏移的指针位置,count也是本次循环要搬运的数据数量。 + +#### 2. 为什么execMem.outputPtr是已经偏移后的内存指针,在跨卡搬运远端CCL_Output至本地Output时计算目的内存地址还要再加一个偏移值? 
+ +跨卡搬运远端CCL_Output至本地Output过程中计算目的内存地址公式: + +```c++ +dst = DeviceMem::create(execMem.outputPtr + dstRank * param.DataDes.count * unitSize, curSize); +``` + +* 因为execMem.outputPtr中已经加上的偏移是每个rank在output区域相对于自己上一次循环使用的地址的offset。(rank内偏移) +* 而dstRank * param.DataDes.count * unitSize是rank之间在OutputPtr区域上的相对偏移。(rank间偏移) + +![image](img/offset_calc.jpg) + +【例】在算到rank 1在rank 0上的第二个Output指针时要先偏移Rank 0的Output 1 + Output 2 + Output 3,再偏移Rank 1的Output 1。 + +#### 3. 为什么allgather mesh算法实现中不需要ccl_input buffer但在rank本地搬运建链时src内存类型却是ccl input? + +```c++ +CHK_RET(CalcCommPlaneInfo(tag_, commParaLevel0, opTransport[COMM_LEVEL0], + TransportMemType::CCL_INPUT, + TransportMemType::CCL_OUTPUT)); +``` + +因为单算子模式时userinput mem就是CCL_Input buffer,图模式时这两个变量的值才有区别。 + +#### 4. 在跨rank搬运数据,循环遍历除本端rank外所有远端rank时如何得到每个远端rank编号? + +`u32 dstRank = (level0CommInfo.localRank + round + 1) % level0CommInfo.localRankSize;` + +以单机8卡,localRank是rank0为例演示计算过程,如下表所示。round从0\~6(共7次循环),恰好覆盖除自身(0)以外的所有节点(1\~7)。 + +| round | 计算过程 | dstRank | 含义 | +| :---: | :-----------------: | :-----: | :-----------: | +| 0 | (0 + 0 + 1) % 8 = 1 | 1 | 与节点 1 通信 | +| 1 | (0 + 1 + 1) % 8 = 2 | 2 | 与节点 2 通信 | +| 2 | (0 + 2 + 1) % 8 = 3 | 3 | 与节点 3 通信 | +| 3 | (0 + 3 + 1) % 8 = 4 | 4 | 与节点 4 通信 | +| 4 | (0 + 4 + 1) % 8 = 5 | 5 | 与节点 5 通信 | +| 5 | (0 + 5 + 1) % 8 = 6 | 6 | 与节点 6 通信 | +| 6 | (0 + 6 + 1) % 8 = 7 | 7 | 与节点 7 通信 | + +#### 5. 在跨卡传输数据时,从流上在传输前后要进行前同步和后同步,目的是什么? + +* 前同步:确保双方进入 “传输准备” 状态(避免一方已发送,另一方未就绪)。 +* 后同步:确保数据拷贝完成后,再执行后续操作(避免竞态条件)。 + +#### 6. 为什么跨卡传输数据前后需要本卡的主从流都同步一次? + +* 传输前:主流要通知每个从流准备开始工作。每条从流要回复主流准备好了。 +* 传输后:每条从流要通知主流数据搬运结束。主流要恢复从流收到。 + +#### 7. 实现allreduce executor时可以继承非reduce相关类吗? + +可以,按照自己的实现思路按需继承即可。 + +#### 8. 
由于在实现算法编排功能时会用到暂未开源的HCCL平台层接口,以下是比赛可能会用到的编排接口范围 + +| 接口名称 | +| ------------------ | +| HcclD2DMemcpyAsync | +| HcclReduceAsync | +| HcclReduceScatter | +| HcclAllGather | +| HcclReduce | +| HcclBroadcast | + +接口详细信息请参考: +[HCCL接口列表1](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/API/hcclapiref/hcclcpp_07_0001.html) +[HCCL接口列表2](https://gitee.com/ascend/cann-hccl/blob/master/docs/hccl_customized_dev/HcclD2DMemcpyAsync.md) + +#### 9. 在实现allreduce mesh算法时若用到HcclReduceAsync方法需注意跨rank搬运时要使用sdma协议,rdma协议暂不支持。 + +# 三、算法调试 + +#### 1. 算法分析器在检查mesh结构下带reduce的算子是否有内存冲突时可能误报。 + +解决方法:确认无内存冲突后手动关闭内存冲突校验功能:`checker.CloseRankMemCheck();` + +详情见:[集合通信源码定制开发指南](https://gitee.com/ascend/cann-hccl/wikis/HCCL%E8%B5%84%E6%96%99%E4%B9%A6%E6%9E%B6%E6%80%BB%E8%A7%88) + +#### 2. 使用算法分析器调试如何获得更多算法执行信息?怎么看? + +可以在校验时把算法执行时的Task序列打印功能打开:`checker.EnableTaskPrint();` + +检查以下字段是否符合预期: + +* srcSlice、dstSlice:src是要被搬运的数据在哪,dst是要把数据搬到哪。上图蓝色划线表示rank 0在把本卡UserInput buffer中的数据往本卡CCL_Output buffer搬运的两次循环,绿色划线表示rank 0在把本卡CCL_Output buffer往本卡Output buffer搬运的两次循环。循环两次是由于 UserInput buffer中的数据量大于CCL_Output buffer的大小。 +* BufferType:内存类型,比如UserInput/CCL_Output/Output buffer。 +* offset: + * 偏移,用于表示内存指针指向的变化。实际调试过程中出现内存越界、内存冲突原因是偏移计算错误的概率很大。 + * 可以看出绿色划线的CCL_Output buffer的offset一直是0,这是由于用于中转的CCL_Output内存地址一直是固定的。而UserInput和Output buffer的offset一直在随循环递增,每次增加的大小就是CCL_Output的大小。 +* size:内存块大小,最后一次循环的size是尾块数据。大小取决于UserInput是否能被CCL_Output整除。 + +#### 3. 
AllReduce的算法实现注意输入输出的tensor shape要一致。 + +假设rank 0数据为:[1, 2, 3, 4],rank 1数据为:[5, 6, 7, 8], +则经过allreduce后两张卡上的数据都应该是:[6, 8, 10, 12]而非[36]。(注意看初赛题的说明图) diff --git a/img/final_round_question.jpg b/img/final_round_question.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a6bc6582edd81b2a8cc20ac9b2f97aa1c0e81e0d Binary files /dev/null and b/img/final_round_question.jpg differ diff --git a/img/offset_calc.jpg b/img/offset_calc.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5d94577d844b358aeb4b5a5e606a1da5410c2180 Binary files /dev/null and b/img/offset_calc.jpg differ diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/CMakeLists.txt b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/CMakeLists.txt index 0c48f81fd9c5f0f4a78af9386fdaea215614eb97..e8a1d66727c8817b18ba1e6d72b4334b89835a18 100644 --- a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/CMakeLists.txt +++ b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/CMakeLists.txt @@ -24,7 +24,9 @@ set(src_list ${CMAKE_CURRENT_SOURCE_DIR}/coll_all_reduce_order_preserved_executor.cc ${CMAKE_CURRENT_SOURCE_DIR}/coll_all_reduce_aiv_deter_executor.cc ${CMAKE_CURRENT_SOURCE_DIR}/coll_all_reduce_aiv_deter_small_executor.cc - + ${CMAKE_CURRENT_SOURCE_DIR}/coll_custom_small_all_reduce_mesh_executor.cc + ${CMAKE_CURRENT_SOURCE_DIR}/coll_custom_medium_all_reduce_mesh_executor.cc + ${CMAKE_CURRENT_SOURCE_DIR}/coll_custom_huge_all_reduce_mesh_executor.cc ) target_sources(hccl_alg PRIVATE diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.cc b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.cc new file mode 100644 index 
0000000000000000000000000000000000000000..a7f0de8eb828db7b82854ef4912dec9e4831e7c3 --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.cc @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "coll_custom_huge_all_reduce_mesh_executor.h" + +namespace hccl { +CollCustomHugeAllReduceMeshExecutor::CollCustomHugeAllReduceMeshExecutor(const HcclDispatcher dispatcher, + std::unique_ptr &topoMatcher) + : CollCommExecutor(dispatcher, topoMatcher) +{ +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::CalcScratchMemSize(u64 &scratchMemSize) +{ + // 计算所需要申请的 Scratch 内存大小 + // TODO: 选手可根据算法需要自行修改 + scratchMemSize = 0U; + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcScratchMemSize] scratchMemSize: %u", + scratchMemSize); + return HCCL_SUCCESS; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::CalcStreamNum(u32 &streamNum) +{ + // 计算所需要申请的 Stream 数量 + // TODO: 选手可根据算法需要自行修改 + u32 totalStreamNum = topoAttr_.deviceNumPerAggregation; + streamNum = totalStreamNum - 1U; + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcStreamNum] streamNum: %u", streamNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) +{ + // 计算所需要申请的 Notify 数量 + // TODO: 选手可根据算法需要自行修改 + notifyNum = 2U * streamNum; + 
HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcNotifyNum] notifyNum: %u", notifyNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::CalcCommInfo(std::vector &opTransport) +{ + // 计算通信域信息 + // TODO: 选手可根据算法需要自行修改 + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcNotifyNum]"); + + // CCL_Input -> CCL_Output + TransportMemType inputType = TransportMemType::CCL_INPUT; + TransportMemType outputType = TransportMemType::CCL_OUTPUT; + // 建立 Mesh 链路 + CommParaInfo commParaLevel0(COMM_LEVEL0, CommType::COMM_TAG_MESH); + // 构造一级通信域资源请求 + // 最终将调用:CalcMeshTransportReq::CalcTransportRequest() + CHK_RET(CalcCommPlaneInfo(tag_, commParaLevel0, opTransport[COMM_LEVEL0], inputType, outputType)); + return HCCL_SUCCESS; +} + +u64 CollCustomHugeAllReduceMeshExecutor::CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize) +{ + // 计算循环处理的迭代次数 + // TODO: 选手可根据算法需要自行修改 + + u64 maxCountPerLoop = cclBuffSize / unitSize; + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][CalcLoopMaxCount] maxCountPerLoop: %u", + maxCountPerLoop); + return maxCountPerLoop; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::Orchestrate(OpParam ¶m, AlgResourceResponse &algRes) +{ + // 算法编排总入口 + // TODO: 选手可根据算法需要自行修改 + + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][Orchestrate] count: %u", param.DataDes.count); + tag_ = param.tag; + algResResp_ = &algRes; + + // User_Input 和 User_Output 指针 + u8 *userInputPtr = static_cast(param.inputPtr); + u8 *userOutputPtr = static_cast(param.outputPtr); + CHK_PTR_NULL(userInputPtr); + CHK_PTR_NULL(userOutputPtr); + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; + u64 maxCountPerLoop = CalcLoopMaxCount(algRes.cclInputMem.size(), unitSize); + + // 循环处理数据 + for (u64 countLeft = param.DataDes.count, curCount = 0, inputOffset = 0, outputOffset = 0; countLeft > 0;) { + curCount = (countLeft > maxCountPerLoop) ? 
maxCountPerLoop : countLeft; + u64 curSize = curCount * unitSize; // curSize 为三种数据量:512K/2M/64M + + // 构造本次循环所使用的内存信息 + ExecMem execMem; + execMem.count = curCount; // 本次循环处理的数据量 + execMem.inputPtr = userInputPtr + inputOffset; // 本次循环使用的 User_Input 内存指针 + execMem.outputPtr = userOutputPtr + outputOffset; // 本次循环使用的 User_Output 内存指针 + execMem.inputMem = algRes.cclInputMem; // 本端的 CCL_Input 内存 + execMem.outputMem = algRes.cclOutputMem; // 本端的 CCL_Output 内存 + execMem.scratchMem = algRes.scratchMem; // 本端的 Scratch 内存 + + // 处理本次循环 + CHK_RET(KernelRun(param, execMem)); + + // 更新偏移量 + countLeft -= curCount; + inputOffset = curSize; + outputOffset = curSize; + } + return HCCL_SUCCESS; +} + +HcclResult CollCustomHugeAllReduceMeshExecutor::KernelRun(const OpParam ¶m, ExecMem &execMem) +{ + // 处理单次循环的数据 + // TODO: 选手可根据算法需要自行修改 + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; // 数据类型的字节数 + u64 curSize = execMem.count * unitSize; // 本次循环需要处理的数据大小,三种数据量:512K/2m/64m,单位:字节 + hccl::Stream &masterStream = const_cast(param.stream); // 主流 + + // TODO: 流同步 + + CHK_RET(CheckCommSize(COMM_LEVEL0, COMM_INDEX_0 + 1)); + SubCommInfo level0CommInfo = GetSubCommInfo(COMM_LEVEL0, COMM_INDEX_0); + HCCL_WARNING("[HCCLContest][CollCustomHugeAllReduceMeshExecutor][KernelRun] localRank: %u, localRankSize: %u", + level0CommInfo.localRank, level0CommInfo.localRankSize); + + // TODO: 搬运数据 + + return HCCL_SUCCESS; +} + +REGISTER_EXEC("CustomHugeAllReduceMeshExecutor", CustomHugeAllReduceMesh, CollCustomHugeAllReduceMeshExecutor); +} // namespace hccl diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.h b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..707a5d66739c699d543659bd15487553596a7da7 --- /dev/null +++ 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_huge_all_reduce_mesh_executor.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef COLL_CUSTOM_HUGE_ALLREDUCE_MESH_EXECUTOR_H +#define COLL_CUSTOM_HUGE_ALLREDUCE_MESH_EXECUTOR_H + +#include "coll_comm_executor.h" + +namespace hccl { +class CollCustomHugeAllReduceMeshExecutor : public CollCommExecutor { +public: + CollCustomHugeAllReduceMeshExecutor(const HcclDispatcher dispatcher, std::unique_ptr &topoMatcher); + ~CollCustomHugeAllReduceMeshExecutor() = default; + +private: + /* *************** 资源计算 *************** */ + HcclResult CalcScratchMemSize(u64 &scratchMemSize) override; + HcclResult CalcStreamNum(u32 &streamNum) override; + HcclResult CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) override; + HcclResult CalcCommInfo(std::vector &opTransport) override; + u64 CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize); + + /* *************** 算法编排 *************** */ + HcclResult Orchestrate(OpParam ¶m, AlgResourceResponse &algRes); + HcclResult KernelRun(const OpParam ¶m, ExecMem &execMem) override; +}; +} // namespace hccl + +#endif diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.cc 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..6647cfc91f95b43790688404c770a135be0ad646 --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.cc @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "coll_custom_medium_all_reduce_mesh_executor.h" + +namespace hccl { +CollCustomMediumAllReduceMeshExecutor::CollCustomMediumAllReduceMeshExecutor(const HcclDispatcher dispatcher, + std::unique_ptr &topoMatcher) + : CollCommExecutor(dispatcher, topoMatcher) +{ +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::CalcScratchMemSize(u64 &scratchMemSize) +{ + // 计算所需要申请的 Scratch 内存大小 + // TODO: 选手可根据算法需要自行修改 + scratchMemSize = 0U; + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcScratchMemSize] scratchMemSize: %u", + scratchMemSize); + return HCCL_SUCCESS; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::CalcStreamNum(u32 &streamNum) +{ + // 计算所需要申请的 Stream 数量 + // TODO: 选手可根据算法需要自行修改 + u32 totalStreamNum = topoAttr_.deviceNumPerAggregation; + streamNum = totalStreamNum - 1U; + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcStreamNum] streamNum: %u", streamNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) +{ + // 计算所需要申请的 Notify 数量 + // TODO: 选手可根据算法需要自行修改 + notifyNum = 2U * streamNum; + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcNotifyNum] notifyNum: %u", notifyNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::CalcCommInfo(std::vector &opTransport) +{ + // 计算通信域信息 + // TODO: 选手可根据算法需要自行修改 + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcNotifyNum]"); + + // CCL_Input -> CCL_Output + TransportMemType inputType = TransportMemType::CCL_INPUT; + TransportMemType outputType = TransportMemType::CCL_OUTPUT; + // 建立 Mesh 链路 + CommParaInfo commParaLevel0(COMM_LEVEL0, CommType::COMM_TAG_MESH); + // 构造一级通信域资源请求 + // 最终将调用:CalcMeshTransportReq::CalcTransportRequest() + CHK_RET(CalcCommPlaneInfo(tag_, commParaLevel0, opTransport[COMM_LEVEL0], inputType, outputType)); + return HCCL_SUCCESS; +} + +u64 
CollCustomMediumAllReduceMeshExecutor::CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize) +{ + // 计算循环处理的迭代次数 + // TODO: 选手可根据算法需要自行修改 + + u64 maxCountPerLoop = cclBuffSize / unitSize; + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][CalcLoopMaxCount] maxCountPerLoop: %u", + maxCountPerLoop); + return maxCountPerLoop; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::Orchestrate(OpParam ¶m, AlgResourceResponse &algRes) +{ + // 算法编排总入口 + // TODO: 选手可根据算法需要自行修改 + + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][Orchestrate] count: %u", param.DataDes.count); + tag_ = param.tag; + algResResp_ = &algRes; + + // User_Input 和 User_Output 指针 + u8 *userInputPtr = static_cast(param.inputPtr); + u8 *userOutputPtr = static_cast(param.outputPtr); + CHK_PTR_NULL(userInputPtr); + CHK_PTR_NULL(userOutputPtr); + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; + u64 maxCountPerLoop = CalcLoopMaxCount(algRes.cclInputMem.size(), unitSize); + + // 循环处理数据 + for (u64 countLeft = param.DataDes.count, curCount = 0, inputOffset = 0, outputOffset = 0; countLeft > 0;) { + curCount = (countLeft > maxCountPerLoop) ? 
maxCountPerLoop : countLeft; + u64 curSize = curCount * unitSize; // curSize 为三种数据量:512K/2M/64M + + // 构造本次循环所使用的内存信息 + ExecMem execMem; + execMem.count = curCount; // 本次循环处理的数据量 + execMem.inputPtr = userInputPtr + inputOffset; // 本次循环使用的 User_Input 内存指针 + execMem.outputPtr = userOutputPtr + outputOffset; // 本次循环使用的 User_Output 内存指针 + execMem.inputMem = algRes.cclInputMem; // 本端的 CCL_Input 内存 + execMem.outputMem = algRes.cclOutputMem; // 本端的 CCL_Output 内存 + execMem.scratchMem = algRes.scratchMem; // 本端的 Scratch 内存 + + // 处理本次循环 + CHK_RET(KernelRun(param, execMem)); + + // 更新偏移量 + countLeft -= curCount; + inputOffset = curSize; + outputOffset = curSize; + } + return HCCL_SUCCESS; +} + +HcclResult CollCustomMediumAllReduceMeshExecutor::KernelRun(const OpParam ¶m, ExecMem &execMem) +{ + // 处理单次循环的数据 + // TODO: 选手可根据算法需要自行修改 + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; // 数据类型的字节数 + u64 curSize = execMem.count * unitSize; // 本次循环需要处理的数据大小,三种数据量:512K/2m/64m,单位:字节 + hccl::Stream &masterStream = const_cast(param.stream); // 主流 + + // TODO: 流同步 + + CHK_RET(CheckCommSize(COMM_LEVEL0, COMM_INDEX_0 + 1)); + SubCommInfo level0CommInfo = GetSubCommInfo(COMM_LEVEL0, COMM_INDEX_0); + HCCL_WARNING("[HCCLContest][CollCustomMediumAllReduceMeshExecutor][KernelRun] localRank: %u, localRankSize: %u", + level0CommInfo.localRank, level0CommInfo.localRankSize); + + // TODO: 搬运数据 + + return HCCL_SUCCESS; +} + +REGISTER_EXEC("CustomMediumAllReduceMeshExecutor", CustomMediumAllReduceMesh, CollCustomMediumAllReduceMeshExecutor); +} // namespace hccl diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.h b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..cfa4d4b328a589683d1547a02970f21a8e88af20 --- /dev/null +++ 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_medium_all_reduce_mesh_executor.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef COLL_CUSTOM_MEDIUM_ALLREDUCE_MESH_EXECUTOR_H +#define COLL_CUSTOM_MEDIUM_ALLREDUCE_MESH_EXECUTOR_H + +#include "coll_comm_executor.h" + +namespace hccl { +class CollCustomMediumAllReduceMeshExecutor : public CollCommExecutor { +public: + CollCustomMediumAllReduceMeshExecutor(const HcclDispatcher dispatcher, std::unique_ptr &topoMatcher); + ~CollCustomMediumAllReduceMeshExecutor() = default; + +private: + /* *************** 资源计算 *************** */ + HcclResult CalcScratchMemSize(u64 &scratchMemSize) override; + HcclResult CalcStreamNum(u32 &streamNum) override; + HcclResult CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) override; + HcclResult CalcCommInfo(std::vector &opTransport) override; + u64 CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize); + + /* *************** 算法编排 *************** */ + HcclResult Orchestrate(OpParam ¶m, AlgResourceResponse &algRes); + HcclResult KernelRun(const OpParam ¶m, ExecMem &execMem) override; +}; +} // namespace hccl + +#endif diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.cc 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..7aebc74774810dea0138225f493241a014b1d612 --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.cc @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "coll_custom_small_all_reduce_mesh_executor.h" + +namespace hccl { +CollCustomSmallAllReduceMeshExecutor::CollCustomSmallAllReduceMeshExecutor(const HcclDispatcher dispatcher, + std::unique_ptr &topoMatcher) + : CollCommExecutor(dispatcher, topoMatcher) +{ +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::CalcScratchMemSize(u64 &scratchMemSize) +{ + // 计算所需要申请的 Scratch 内存大小 + // TODO: 选手可根据算法需要自行修改 + scratchMemSize = 0U; + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcScratchMemSize] scratchMemSize: %u", + scratchMemSize); + return HCCL_SUCCESS; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::CalcStreamNum(u32 &streamNum) +{ + // 计算所需要申请的 Stream 数量 + // TODO: 选手可根据算法需要自行修改 + u32 totalStreamNum = topoAttr_.deviceNumPerAggregation; + streamNum = totalStreamNum - 1U; + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcStreamNum] streamNum: %u", streamNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) +{ + // 计算所需要申请的 Notify 数量 + // TODO: 选手可根据算法需要自行修改 + notifyNum = 2U * streamNum; + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcNotifyNum] notifyNum: %u", notifyNum); + return HCCL_SUCCESS; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::CalcCommInfo(std::vector &opTransport) +{ + // 计算通信域信息 + // TODO: 选手可根据算法需要自行修改 + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcNotifyNum]"); + + // CCL_Input -> CCL_Output + TransportMemType inputType = TransportMemType::CCL_INPUT; + TransportMemType outputType = TransportMemType::CCL_OUTPUT; + // 建立 Mesh 链路 + CommParaInfo commParaLevel0(COMM_LEVEL0, CommType::COMM_TAG_MESH); + // 构造一级通信域资源请求 + // 最终将调用:CalcMeshTransportReq::CalcTransportRequest() + CHK_RET(CalcCommPlaneInfo(tag_, commParaLevel0, opTransport[COMM_LEVEL0], inputType, outputType)); + return HCCL_SUCCESS; +} + +u64 
CollCustomSmallAllReduceMeshExecutor::CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize) +{ + // 计算循环处理的迭代次数 + // TODO: 选手可根据算法需要自行修改 + + u64 maxCountPerLoop = cclBuffSize / unitSize; + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][CalcLoopMaxCount] maxCountPerLoop: %u", + maxCountPerLoop); + return maxCountPerLoop; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::Orchestrate(OpParam ¶m, AlgResourceResponse &algRes) +{ + // 算法编排总入口 + // TODO: 选手可根据算法需要自行修改 + + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][Orchestrate] count: %u", param.DataDes.count); + tag_ = param.tag; + algResResp_ = &algRes; + + // User_Input 和 User_Output 指针 + u8 *userInputPtr = static_cast(param.inputPtr); + u8 *userOutputPtr = static_cast(param.outputPtr); + CHK_PTR_NULL(userInputPtr); + CHK_PTR_NULL(userOutputPtr); + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; + u64 maxCountPerLoop = CalcLoopMaxCount(algRes.cclInputMem.size(), unitSize); + + // 循环处理数据 + for (u64 countLeft = param.DataDes.count, curCount = 0, inputOffset = 0, outputOffset = 0; countLeft > 0;) { + curCount = (countLeft > maxCountPerLoop) ? 
maxCountPerLoop : countLeft; + u64 curSize = curCount * unitSize; // curSize 为三种数据量:512K/2M/64M + + // 构造本次循环所使用的内存信息 + ExecMem execMem; + execMem.count = curCount; // 本次循环处理的数据量 + execMem.inputPtr = userInputPtr + inputOffset; // 本次循环使用的 User_Input 内存指针 + execMem.outputPtr = userOutputPtr + outputOffset; // 本次循环使用的 User_Output 内存指针 + execMem.inputMem = algRes.cclInputMem; // 本端的 CCL_Input 内存 + execMem.outputMem = algRes.cclOutputMem; // 本端的 CCL_Output 内存 + execMem.scratchMem = algRes.scratchMem; // 本端的 Scratch 内存 + + // 处理本次循环 + CHK_RET(KernelRun(param, execMem)); + + // 更新偏移量 + countLeft -= curCount; + inputOffset = curSize; + outputOffset = curSize; + } + return HCCL_SUCCESS; +} + +HcclResult CollCustomSmallAllReduceMeshExecutor::KernelRun(const OpParam ¶m, ExecMem &execMem) +{ + // 处理单次循环的数据 + // TODO: 选手可根据算法需要自行修改 + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; // 数据类型的字节数 + u64 curSize = execMem.count * unitSize; // 本次循环需要处理的数据大小,三种数据量:512K/2m/64m,单位:字节 + hccl::Stream &masterStream = const_cast(param.stream); // 主流 + + // TODO: 流同步 + + CHK_RET(CheckCommSize(COMM_LEVEL0, COMM_INDEX_0 + 1)); + SubCommInfo level0CommInfo = GetSubCommInfo(COMM_LEVEL0, COMM_INDEX_0); + HCCL_WARNING("[HCCLContest][CollCustomSmallAllReduceMeshExecutor][KernelRun] localRank: %u, localRankSize: %u", + level0CommInfo.localRank, level0CommInfo.localRankSize); + + // TODO: 搬运数据 + + return HCCL_SUCCESS; +} + +REGISTER_EXEC("CustomSmallAllReduceMeshExecutor", CustomSmallAllReduceMesh, CollCustomSmallAllReduceMeshExecutor); +} // namespace hccl diff --git a/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.h b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..f373a0e2552bfa697f6a34d91d8d4933d02454b1 --- /dev/null +++ 
b/src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce/coll_custom_small_all_reduce_mesh_executor.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef COLL_CUSTOM_SMALL_ALLREDUCE_MESH_EXECUTOR_H +#define COLL_CUSTOM_SMALL_ALLREDUCE_MESH_EXECUTOR_H + +#include "coll_comm_executor.h" + +namespace hccl { +class CollCustomSmallAllReduceMeshExecutor : public CollCommExecutor { +public: + CollCustomSmallAllReduceMeshExecutor(const HcclDispatcher dispatcher, std::unique_ptr &topoMatcher); + ~CollCustomSmallAllReduceMeshExecutor() = default; + +private: + /* *************** 资源计算 *************** */ + HcclResult CalcScratchMemSize(u64 &scratchMemSize) override; + HcclResult CalcStreamNum(u32 &streamNum) override; + HcclResult CalcNotifyNum(u32 streamNum, u32 ¬ifyNum) override; + HcclResult CalcCommInfo(std::vector &opTransport) override; + u64 CalcLoopMaxCount(const u64 cclBuffSize, const u32 unitSize); + + /* *************** 算法编排 *************** */ + HcclResult Orchestrate(OpParam ¶m, AlgResourceResponse &algRes); + HcclResult KernelRun(const OpParam ¶m, ExecMem &execMem) override; +}; +} // namespace hccl + +#endif diff --git a/src/domain/collective_communication/algorithm/impl/operator/CMakeLists.txt b/src/domain/collective_communication/algorithm/impl/operator/CMakeLists.txt index ccf812fc1fcb2faf7b662bdca2e88af2d7a22fc6..b1726cdc33ad5a3b00f7599a09ace3dc80e20c7a 
100644 --- a/src/domain/collective_communication/algorithm/impl/operator/CMakeLists.txt +++ b/src/domain/collective_communication/algorithm/impl/operator/CMakeLists.txt @@ -13,6 +13,7 @@ set(src_list ${CMAKE_CURRENT_SOURCE_DIR}/send_operator.cc ${CMAKE_CURRENT_SOURCE_DIR}/receive_operator.cc ${CMAKE_CURRENT_SOURCE_DIR}/batch_write_operator.cc + ${CMAKE_CURRENT_SOURCE_DIR}/custom_all_reduce_operator.cc ) target_sources(hccl_alg PRIVATE diff --git a/src/domain/collective_communication/algorithm/impl/operator/all_reduce_operator.cc b/src/domain/collective_communication/algorithm/impl/operator/all_reduce_operator.cc index 626018dcfae059a434d159012a11adfc29a8901d..76bfd01ca32e1478b6ac2288d5e432fb543cb410 100644 --- a/src/domain/collective_communication/algorithm/impl/operator/all_reduce_operator.cc +++ b/src/domain/collective_communication/algorithm/impl/operator/all_reduce_operator.cc @@ -599,6 +599,6 @@ HcclResult AllReduceOperator::SelectAlgfor91093(const OpParam& param, std::strin return HCCL_SUCCESS; } -REGISTER_OP(HcclCMDType::HCCL_CMD_ALLREDUCE, AllReduce, AllReduceOperator); +// REGISTER_OP(HcclCMDType::HCCL_CMD_ALLREDUCE, AllReduce, AllReduceOperator); } diff --git a/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc new file mode 100644 index 0000000000000000000000000000000000000000..66dface0e95568ae535d3fe3202695de8ae29b7e --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.cc @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "custom_all_reduce_operator.h" +#include "coll_alg_op_registry.h" + +namespace hccl { + +CustomAllReduceOperator::CustomAllReduceOperator(AlgConfigurator *algConfigurator, CCLBufferManager &cclBufferManager, + HcclDispatcher dispatcher, std::unique_ptr &topoMatcher) + : CollAlgOperator(algConfigurator, cclBufferManager, dispatcher, topoMatcher, HcclCMDType::HCCL_CMD_ALLREDUCE) +{ +} + +CustomAllReduceOperator::~CustomAllReduceOperator() {} + +HcclResult CustomAllReduceOperator::SelectAlg(const std::string &tag, const OpParam ¶m, std::string &algName, + std::string &newTag) +{ + constexpr u64 HCCL_CONTEST_SMALL_COUNT = 512 * 1024; // 512KB + constexpr u64 HCCL_CONTEST_MEDIUM_COUNT = 2 * 1024 * 1024; // 2MB + constexpr u64 HCCL_CONTEST_HUGE_COUNT = 64 * 1024 * 1024; // 64MB + + // 算法选择逻辑 + // TODO: 选手可根据数据量大小选择合适的 Executor + // 注意: + // 1. 相同算法在不同数据量下的性能不同 + // 2. 
选手可以先只实现一个 Executor,算法选择时直接设置 algName 为该 Executor 的名字 + + u32 unitSize = SIZE_TABLE[param.DataDes.dataType]; + u64 dataSize = param.DataDes.count * unitSize; // 单位:字节,三种数据量:512K/2M/64M + if (dataSize <= HCCL_CONTEST_SMALL_COUNT) { + algName = "CustomSmallAllReduceMeshExecutor"; + } else if (dataSize <= HCCL_CONTEST_MEDIUM_COUNT) { + algName = "CustomMediumAllReduceMeshExecutor"; + } else { + algName = "CustomHugeAllReduceMeshExecutor"; + } + return HCCL_SUCCESS; +} + +// 注册算子 +REGISTER_OP(HcclCMDType::HCCL_CMD_ALLREDUCE, AllReduce, CustomAllReduceOperator); +} // namespace hccl diff --git a/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.h b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.h new file mode 100644 index 0000000000000000000000000000000000000000..41ae73d0b325fbfcc8d0fb6e41073f0fb2e9e9da --- /dev/null +++ b/src/domain/collective_communication/algorithm/impl/operator/custom_all_reduce_operator.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef CUSTOM_ALL_REDUCE_OPERATOR_H +#define CUSTOM_ALL_REDUCE_OPERATOR_H + +#include "coll_alg_operator.h" + +namespace hccl { +// 数据规模分类 +enum class HcclDataCountType { HCCL_COUNT_SMALL = 0, HCCL_COUNT_MEDIUM, HCCL_COUNT_HUGE, HCCL_COUNT_RESERVED }; + +class CustomAllReduceOperator : public CollAlgOperator { +public: + CustomAllReduceOperator(AlgConfigurator *algConfigurator, CCLBufferManager &cclBufferManager, + HcclDispatcher dispatcher, std::unique_ptr &topoMatcher); + + ~CustomAllReduceOperator(); + + HcclResult SelectAlg(const std::string &tag, const OpParam ¶m, std::string &algName, + std::string &newTag) override; +}; +} // namespace hccl +#endif diff --git a/submit.sh b/submit.sh new file mode 100755 index 0000000000000000000000000000000000000000..b7b0d0ab5dcdf15b34c94e8e4b9997d974b64511 --- /dev/null +++ b/submit.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e + +# bash fonts colors +red='\e[31m' +yellow='\e[33m' +green='\e[92m' +none='\e[0m' + +error() { echo -e "${red}$*${none}" && exit 1; } +warning() { echo -e "${yellow}$*${none}"; } +info() { echo -e "${green}$*${none}"; } + +src_dir="/home/hccluser/cann-hccl" +dst_dir="/result" + +operator_dir="src/domain/collective_communication/algorithm/impl/operator" +executor_dir="src/domain/collective_communication/algorithm/impl/coll_executor/coll_all_reduce" + +files=( + "${operator_dir}/custom_all_reduce_operator.h" + "${operator_dir}/custom_all_reduce_operator.cc" + "${executor_dir}/coll_custom_small_all_reduce_mesh_executor.h" + "${executor_dir}/coll_custom_small_all_reduce_mesh_executor.cc" + "${executor_dir}/coll_custom_medium_all_reduce_mesh_executor.h" + "${executor_dir}/coll_custom_medium_all_reduce_mesh_executor.cc" + "${executor_dir}/coll_custom_huge_all_reduce_mesh_executor.h" + "${executor_dir}/coll_custom_huge_all_reduce_mesh_executor.cc" +) + +for file in "${files[@]}"; do + file_path="${src_dir}/${file}" + if [ -f "${file_path}" ]; then + cp -i "${file_path}" "${dst_dir}" + info "Copied: 
${file_path} to ${dst_dir}" + else + error "No such file: ${file_path}" + fi +done + +info "All files copied successfully to ${dst_dir}" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bbd859a0a1b8c3c909b22ca164e61fe8bac762de..8475b050c67bc7d31151d64bc486044d4e8e236b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -418,6 +418,7 @@ set(src_list_alg ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/registry/coll_alg_op_registry.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/coll_alg_operator.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/all_reduce_operator.cc + ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/custom_all_reduce_operator.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/all_gather_operator.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/all_gather_v_operator.cc ${HCCL_OPEN_CODE_ALGORITHM}/impl/operator/reduce_scatter_operator.cc @@ -477,8 +478,7 @@ target_compile_options(hccl_alg_test PRIVATE -fno-strict-aliasing -pipe -std=c++14 - -Os - -O2 + -O0 -g -fstack-protector-all $<$:-fsanitize=address -fsanitize-recover=address,all -fno-omit-frame-pointer -g> ) @@ -521,9 +521,10 @@ add_custom_target(hccl_alg_test_lib COMMAND cd ${CMAKE_INSTALL_PREFIX}/hccl_lib ) -add_custom_command(TARGET hccl_alg_test POST_BUILD - COMMAND ${CMAKE_STRIP} $ -) +# 禁用 strip +# add_custom_command(TARGET hccl_alg_test POST_BUILD +# COMMAND ${CMAKE_STRIP} $ +# ) install(TARGETS hccl_alg_test LIBRARY DESTINATION lib OPTIONAL diff --git a/test/algorithm/testcase/main.cc b/test/algorithm/testcase/main.cc index db0348934bb91d55ae548c96ff7623af97950ea4..74d7e921f7c4e28ecb4233882d2772a884b8dabd 100644 --- a/test/algorithm/testcase/main.cc +++ b/test/algorithm/testcase/main.cc @@ -2,7 +2,7 @@ GTEST_API_ int main(int argc, char **argv) { // testcase调试代码,只跑特定的用例 - //testing::GTEST_FLAG(filter) = "AllReduceTest.allreduce_cyw_test"; + testing::GTEST_FLAG(filter) = "AllReduceTest.allreduce_contest_test*"; testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git 
a/test/algorithm/testcase/testcase_all_reduce.cc b/test/algorithm/testcase/testcase_all_reduce.cc index 7dc31ad1ebe5cc70593e73665371ef611ef8a618..e3f10e34a4135e08d828173674cf7257e5f82dc0 100644 --- a/test/algorithm/testcase/testcase_all_reduce.cc +++ b/test/algorithm/testcase/testcase_all_reduce.cc @@ -1751,4 +1751,394 @@ TEST_F(AllReduceTest, allreduce_aiv_determinstic_test) ret = checker.Check(checkerOpParam, topoMeta); // EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); -} \ No newline at end of file +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_512k_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 512k + int8 + u64 size = 512 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_2m_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 2m + int8 + u64 size = 2 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret 
= checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_64m_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 64m + int8 + u64 size = 64 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_1g_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 1g + int8 + u64 size = 1 * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_4g_int8) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 4g + int8 + u64 size = 4LLU * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_INT8; + CheckerOpParam checkerOpParam; + 
checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_512k_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 512k + fp16 + u64 size = 512 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_2m_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 2m + fp16 + u64 size = 2 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = 
checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_64m_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 64m + fp16 + u64 size = 64 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_1g_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 1g + fp16 + u64 size = 1 * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_4g_fp16) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 4g + fp16 + u64 size = 4LLU * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP16; + CheckerOpParam checkerOpParam; + 
checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_512k_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 512k + fp32 + u64 size = 512 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_2m_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 2m + fp32 + u64 size = 2 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = 
checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_64m_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 64m + fp32 + u64 size = 64 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_1g_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 1g + fp32 + u64 size = 1 * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +} + +TEST_F(AllReduceTest, allreduce_contest_test_910b_4g_fp32) +{ + // 拓扑:单机 8 卡 + RankTable_For_LLT gen; + TopoMeta topoMeta; + gen.GenTopoMeta(topoMeta, 1, 1, 8); + + // 4g + fp32 + u64 size = 4LLU * 1024 * 1024 * 1024; + auto dataType = CheckerDataType::DATA_TYPE_FP32; + CheckerOpParam checkerOpParam; + 
checkerOpParam.opType = CheckerOpType::ALLREDUCE; + checkerOpParam.tag = "AllReduce"; + checkerOpParam.opMode = CheckerOpMode::OPBASE; + checkerOpParam.DataDes.count = size / SIZE_TABLE[dataType]; + checkerOpParam.DataDes.dataType = dataType; + checkerOpParam.devtype = CheckerDevType::DEV_TYPE_910B; + + Checker checker; + HcclResult ret; + checker.CloseRankMemCheck(); + checker.EnableTaskPrint(); + ret = checker.Check(checkerOpParam, topoMeta); + EXPECT_EQ(ret, HcclResult::HCCL_SUCCESS); +}