From f9e80dcb4b10f2839768d50da022fa88b37263ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=BA=9A=E5=A4=A9?= <1581528722@qq.com>
Date: Wed, 15 Oct 2025 03:46:50 +0000
Subject: [PATCH 1/6] =?UTF-8?q?!683=20Extended=20Template:=20reduceMax=20&?=
 =?UTF-8?q?=20reduceMin=20Merge=20pull=20request=20!683=20from=20=E9=99=88?=
 =?UTF-8?q?=E5=BA=9A=E5=A4=A9/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 atvc/examples/reduce_max/README.md | 46 +++
 atvc/examples/reduce_max/reduce_max.cpp | 177 ++++++++++
 atvc/examples/reduce_min/README.md | 46 +++
 atvc/examples/reduce_min/reduce_min.cpp | 177 ++++++++++
 atvc/include/atvc.h | 2 +
 atvc/include/reduce/common/reduce_common.h | 12 +
 atvc/include/reduce/reduce_device.h | 2 +
 atvc/include/reduce/reduce_max.h | 349 +++++++++++++++++++
 atvc/include/reduce/reduce_min.h | 349 +++++++++++++++++++
 atvc/include/reduce/reduce_op_template.h | 4 +-
 atvc/include/reduce/reduce_sum.h | 15 +-
 atvc/include/reduce/utils/reduce_block_aux.h | 21 +-
 atvc/include/reduce/utils/reduce_buf_pool.h | 22 +-
 13 files changed, 1199 insertions(+), 23 deletions(-)
 create mode 100644 atvc/examples/reduce_max/README.md
 create mode 100644 atvc/examples/reduce_max/reduce_max.cpp
 create mode 100644 atvc/examples/reduce_min/README.md
 create mode 100644 atvc/examples/reduce_min/reduce_min.cpp
 create mode 100644 atvc/include/reduce/reduce_max.h
 create mode 100644 atvc/include/reduce/reduce_min.h

diff --git a/atvc/examples/reduce_max/README.md b/atvc/examples/reduce_max/README.md
new file mode 100644
index 00000000..358fa460
--- /dev/null
+++ b/atvc/examples/reduce_max/README.md
@@ -0,0 +1,46 @@

## Overview

This sample shows how to implement a standalone ReduceMax operator with ATVC and verify its results.

## Supported products
- Atlas A2 training series products

## Directory layout

| File | Description |
| ------------------------------------------------------------ | ------------------------------------------------------------ |
| [reduce_max.cpp](./reduce_max.cpp) | ReduceMax operator implementation and invocation sample |

## Operator description

ReduceMax is a Reduce-class operator that computes the maximum along the specified axes of the input tensor and outputs the result.

ReduceMax operator specification:
Operator type (OpType): ReduceMax

|                 | name | shape    | data type | format |
| --------------- | ---- | -------- | --------- | ------ |
| Operator input  | x    | 8 * 1024 | float     | ND     |
| Operator output | y    | 1 * 1024 | float     | ND     |

Kernel function name: ReduceCustom
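
For the shape above, the kernel reduces axis 0, i.e. `y[a] = max(x[0][a], ..., x[7][a])` for each column `a`. The sketch below illustrates the host-side reference (golden) computation; `GoldenReduceMax` is an illustrative helper and not part of the sample, which uses an all-ones input so the expected output is all ones:

```cpp
#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

// Reference reduce-max over axis 0 of a row-major [dimR, dimA] buffer:
// out[a] = max over r of in[r * dimA + a].
std::vector<float> GoldenReduceMax(const std::vector<float>& in, int64_t dimR, int64_t dimA)
{
    // Seed with the identity element of max so every lane is safely overwritten.
    std::vector<float> out(dimA, -std::numeric_limits<float>::infinity());
    for (int64_t r = 0; r < dimR; ++r) {
        for (int64_t a = 0; a < dimA; ++a) {
            out[a] = std::max(out[a], in[r * dimA + a]);
        }
    }
    return out;
}
```

Seeding with -infinity (the identity of max) keeps the loop uniform regardless of the input values.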

## Running the example
Run the following in the ascendc-api-adv repository:
```bash
$ cd ./atvc/examples
$ bash run_examples.sh reduce_max
...
Generate golden data successfully.
...
Accuracy verification passed.
```
\ No newline at end of file
diff --git a/atvc/examples/reduce_max/reduce_max.cpp b/atvc/examples/reduce_max/reduce_max.cpp
new file mode 100644
index 00000000..c5ce2bce
--- /dev/null
+++ b/atvc/examples/reduce_max/reduce_max.cpp
@@ -0,0 +1,177 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

#include <cstdint>
#include <cstdio>
#include <vector>
#include "acl/acl.h"
#include "reduce/reduce_host.h"
#include "reduce/reduce_device.h"
#include "example_common.h"

namespace {
// Description of the ReduceMax operator: one input and one output, both float
using ReduceOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;

void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost)
{
    CHECK_ACL(aclrtFree(xDevice));
    CHECK_ACL(aclrtFree(yDevice));
    CHECK_ACL(aclrtFreeHost(yHost));
}
}

/*
 * Kernel entry of the ReduceCustom operator.
 * x            GM address on the device, pointing to the operator's first input
 * y            GM address on the device, pointing to the operator's first output
 * reduceParam  runtime ATVC::ReduceParam data
*/
template <class OpTraits, const ATVC::ReducePolicy &Policy>
__global__ __aicore__ void ReduceCustom(
    GM_ADDR x,
    GM_ADDR y,
    ATVC::ReduceParam reduceParam
)
{
    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // Multi-core control: launch vector cores only
    // Pass the compute template as a template parameter; Policy is supplied by the host-side dispatch API
    auto op = ATVC::Kernel::ReduceOpTemplate<ATVC::ReduceMaxCompute<OpTraits>, Policy>();
    op.Run(x, y, &reduceParam);
}

namespace {
// Dispatches Reduce-class operators: selects the matching policy and launches the kernel
template <class OpTraits>
void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam &param, ATVC::ReducePolicy &policy, aclrtStream& stream)
{
    // Allocate temporary workspace
    uint8_t *workspaceDevice;
    CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
    param.workspaceAddr = reinterpret_cast<GM_ADDR>(workspaceDevice);
    // Map the ReducePolicy computed by the tiling API to compile-time parameters and instantiate the matching kernel
    if (policy == ATVC::REDUCE_POLICY0) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY0><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY1) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY1><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY2) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY2><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY3) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY3><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY4) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY4><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY5) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY5><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY6) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY6><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY7) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY7><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY8) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY8><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY9) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY9><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY10) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY10><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY11) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY11><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY12) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY12><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY13) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY13><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY14) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY14><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY15) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY15><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY16) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY16><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY17) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY17><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY18) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY18><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY19) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY19><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY20) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY20><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY21) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY21><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY22) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY22><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else {
        (void)printf("[ERROR]: Cannot find any matched policy.\n");
    }
    // Synchronize the stream, then release the workspace
    CHECK_ACL(aclrtSynchronizeStream(stream));
    CHECK_ACL(aclrtFree(workspaceDevice));
}
}

int32_t main(int32_t argc, char* argv[])
{
    if (!ATVC::Host::DebugCheck<ReduceOpTraits, ATVC::TemplateType::REDUCE>()) {
        (void)printf("[ERROR]: Reduce OpTraits check failed.\n");
        return -1;
    }
    int32_t eleNum = 8 * 1024;
    int32_t outEleNum = 1 * 1024;
    size_t inputByteSize = static_cast<size_t>(eleNum) * sizeof(float);
    size_t outputByteSize = static_cast<size_t>(outEleNum) * sizeof(float);
    std::vector<int64_t> dim{0};         // Reduce along axis 0
    std::vector<int64_t> shape{8, 1024}; // Test input shape
    std::vector<float> inputX(eleNum, 1.0f);
    std::vector<float> golden(outEleNum, 1.0f);
    (void)printf("Generate golden data successfully.\n");
    // Initialize ACL resources
    CHECK_ACL(aclInit(nullptr));
    aclrtContext context;
    int32_t deviceId = 0;
    CHECK_ACL(aclrtSetDevice(deviceId));
    CHECK_ACL(aclrtCreateContext(&context, deviceId));
    aclrtStream stream = nullptr;
    CHECK_ACL(aclrtCreateStream(&stream));
    uint8_t *yHost;
    uint8_t *xDevice;
    uint8_t *yDevice;

    CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize));
    CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
    CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));

    CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));

    ATVC::ReduceParam param; // Runtime Reduce parameters: tiling data and workspace information
    ATVC::ReducePolicy policy = {-1, -1, -1}; // Runtime policy that maps to the best-fitting Reduce template
    ATVC::Host::ReduceTilingHyperParam hyperParam;
    hyperParam.maxInnerA = 256; // Set maxInnerA to 256
    // Call the host-side tiling API to compute the runtime parameters
    if (!ATVC::Host::CalcReduceTiling<ReduceOpTraits>(shape, dim, &policy, &param, hyperParam)) {
        (void)printf("Reduce tiling error.\n");
        return -1;
    }

    // Dispatch the templated kernel through the adapter
    ReduceOpAdapter<ReduceOpTraits>(xDevice, yDevice, param, policy, stream);

    CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
    std::vector<float> outputY(reinterpret_cast<float*>(yHost), reinterpret_cast<float*>(yHost) + outEleNum);

    // Release ACL resources
    CleanUp(xDevice, yDevice, yHost);
    CleanACL(stream, context, deviceId);

    if (!VerifyResults(golden, outputY)) {
        return -1;
    }

    (void)printf("Accuracy verification passed.\n");
    return 0;
}
diff --git a/atvc/examples/reduce_min/README.md b/atvc/examples/reduce_min/README.md
new file mode 100644
index 00000000..af85b2bc
--- /dev/null
+++ b/atvc/examples/reduce_min/README.md
@@ -0,0 +1,46 @@

## Overview

This sample shows how to implement a standalone ReduceMin operator with ATVC and verify its results.

## Supported products
- Atlas A2 training series products

## Directory layout

| File | Description |
| ------------------------------------------------------------ | ------------------------------------------------------------ |
| [reduce_min.cpp](./reduce_min.cpp) | ReduceMin operator implementation and invocation sample |

## Operator description

ReduceMin is a Reduce-class operator that computes the minimum along the specified axes of the input tensor and outputs the result.

ReduceMin operator specification:
Operator type (OpType): ReduceMin

|                 | name | shape    | data type | format |
| --------------- | ---- | -------- | --------- | ------ |
| Operator input  | x    | 8 * 1024 | float     | ND     |
| Operator output | y    | 1 * 1024 | float     | ND     |

Kernel function name: ReduceCustom
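
When the reduced axis is padded to a 32-byte boundary in the Unified Buffer, the padded lanes must not distort a min-reduction; the compute class therefore supplies the identity element of min, +∞ (INT32_MAX for int32 inputs), as the padding value through `GetPaddingValue()`. A minimal host-side sketch of the same idea, using a hypothetical `PadForMin` helper:

```cpp
#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

// Pad a row to paddedLen lanes with +inf, the identity of min, so that
// reducing the padded row yields the same minimum as the original row.
std::vector<float> PadForMin(const std::vector<float>& row, std::size_t paddedLen)
{
    std::vector<float> padded(paddedLen, std::numeric_limits<float>::infinity());
    std::copy(row.begin(), row.end(), padded.begin());
    return padded;
}
```

ReduceMax uses the mirrored scheme, padding with -∞ instead.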

## Running the example
Run the following in the ascendc-api-adv repository:
```bash
$ cd ./atvc/examples
$ bash run_examples.sh reduce_min
...
Generate golden data successfully.
...
Accuracy verification passed.
```
\ No newline at end of file
diff --git a/atvc/examples/reduce_min/reduce_min.cpp b/atvc/examples/reduce_min/reduce_min.cpp
new file mode 100644
index 00000000..525f57e9
--- /dev/null
+++ b/atvc/examples/reduce_min/reduce_min.cpp
@@ -0,0 +1,177 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

#include <cstdint>
#include <cstdio>
#include <vector>
#include "acl/acl.h"
#include "reduce/reduce_host.h"
#include "reduce/reduce_device.h"
#include "example_common.h"

namespace {
// Description of the ReduceMin operator: one input and one output, both float
using ReduceOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;

void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost)
{
    CHECK_ACL(aclrtFree(xDevice));
    CHECK_ACL(aclrtFree(yDevice));
    CHECK_ACL(aclrtFreeHost(yHost));
}
}

/*
 * Kernel entry of the ReduceCustom operator.
 * x            GM address on the device, pointing to the operator's first input
 * y            GM address on the device, pointing to the operator's first output
 * reduceParam  runtime ATVC::ReduceParam data
*/
template <class OpTraits, const ATVC::ReducePolicy &Policy>
__global__ __aicore__ void ReduceCustom(
    GM_ADDR x,
    GM_ADDR y,
    ATVC::ReduceParam reduceParam
)
{
    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // Multi-core control: launch vector cores only
    // Pass the compute template as a template parameter; Policy is supplied by the host-side dispatch API
    auto op = ATVC::Kernel::ReduceOpTemplate<ATVC::ReduceMinCompute<OpTraits>, Policy>();
    op.Run(x, y, &reduceParam);
}

namespace {
// Dispatches Reduce-class operators: selects the matching policy and launches the kernel
template <class OpTraits>
void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam &param, ATVC::ReducePolicy &policy, aclrtStream& stream)
{
    // Allocate temporary workspace
    uint8_t *workspaceDevice;
    CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
    param.workspaceAddr = reinterpret_cast<GM_ADDR>(workspaceDevice);
    // Map the ReducePolicy computed by the tiling API to compile-time parameters and instantiate the matching kernel
    if (policy == ATVC::REDUCE_POLICY0) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY0><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY1) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY1><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY2) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY2><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY3) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY3><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY4) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY4><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY5) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY5><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY6) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY6><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY7) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY7><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY8) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY8><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY9) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY9><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY10) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY10><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY11) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY11><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY12) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY12><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY13) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY13><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY14) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY14><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY15) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY15><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY16) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY16><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY17) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY17><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY18) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY18><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY19) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY19><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY20) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY20><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY21) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY21><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else if (policy == ATVC::REDUCE_POLICY22) {
        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY22><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
    } else {
        (void)printf("[ERROR]: Cannot find any matched policy.\n");
    }
    // Synchronize the stream, then release the workspace
    CHECK_ACL(aclrtSynchronizeStream(stream));
    CHECK_ACL(aclrtFree(workspaceDevice));
}
}

int32_t main(int32_t argc, char* argv[])
{
    if (!ATVC::Host::DebugCheck<ReduceOpTraits, ATVC::TemplateType::REDUCE>()) {
        (void)printf("[ERROR]: Reduce OpTraits check failed.\n");
        return -1;
    }
    int32_t eleNum = 8 * 1024;
    int32_t outEleNum = 1 * 1024;
    size_t inputByteSize = static_cast<size_t>(eleNum) * sizeof(float);
    size_t outputByteSize = static_cast<size_t>(outEleNum) * sizeof(float);
    std::vector<int64_t> dim{0};         // Reduce along axis 0
    std::vector<int64_t> shape{8, 1024}; // Test input shape
    std::vector<float> inputX(eleNum, 1.0f);
    std::vector<float> golden(outEleNum, 1.0f);
    (void)printf("Generate golden data successfully.\n");
    // Initialize ACL resources
    CHECK_ACL(aclInit(nullptr));
    aclrtContext context;
    int32_t deviceId = 0;
    CHECK_ACL(aclrtSetDevice(deviceId));
    CHECK_ACL(aclrtCreateContext(&context, deviceId));
    aclrtStream stream = nullptr;
    CHECK_ACL(aclrtCreateStream(&stream));
    uint8_t *yHost;
    uint8_t *xDevice;
    uint8_t *yDevice;

    CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize));
    CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
    CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));

    CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));

    ATVC::ReduceParam param; // Runtime Reduce parameters: tiling data and workspace information
    ATVC::ReducePolicy policy = {-1, -1, -1}; // Runtime policy that maps to the best-fitting Reduce template
    ATVC::Host::ReduceTilingHyperParam hyperParam;
    hyperParam.maxInnerA = 256; // Set maxInnerA to 256
    // Call the host-side tiling API to compute the runtime parameters
    if (!ATVC::Host::CalcReduceTiling<ReduceOpTraits>(shape, dim, &policy, &param, hyperParam)) {
        (void)printf("Reduce tiling error.\n");
        return -1;
    }

    // Dispatch the templated kernel through the adapter
    ReduceOpAdapter<ReduceOpTraits>(xDevice, yDevice, param, policy, stream);

    CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
    std::vector<float> outputY(reinterpret_cast<float*>(yHost), reinterpret_cast<float*>(yHost) + outEleNum);

    // Release ACL resources
    CleanUp(xDevice, yDevice, yHost);
    CleanACL(stream, context, deviceId);

    if (!VerifyResults(golden, outputY)) {
        return -1;
    }

    (void)printf("Accuracy verification passed.\n");
    return 0;
}
diff --git a/atvc/include/atvc.h b/atvc/include/atvc.h
index 066054c2..e9b09c90 100644
--- a/atvc/include/atvc.h
+++ b/atvc/include/atvc.h
@@ -28,6 +28,8 @@
 #include "common/kernel_utils.h"
 #include "elewise/elewise_op_template.h"
 #include "reduce/reduce_sum.h"
+#include "reduce/reduce_max.h"
+#include "reduce/reduce_min.h"
 #include "reduce/reduce_op_template.h"
 #include "broadcast/broadcast_compute.h"
 #include
"broadcast/broadcast_op_template.h" diff --git a/atvc/include/reduce/common/reduce_common.h b/atvc/include/reduce/common/reduce_common.h index d85353e8..69d21aa4 100644 --- a/atvc/include/reduce/common/reduce_common.h +++ b/atvc/include/reduce/common/reduce_common.h @@ -28,6 +28,18 @@ enum ShapeDim { DIM_BROADCAST // Broadcast axis }; +struct ReduceARParam { + uint32_t repStride = 0; + uint16_t dimA = 0; + uint16_t dimMax = 0; + uint16_t mainR = 0; + uint16_t tailR = 0; + uint64_t maskAddRNum = 0; + uint16_t loopRNum = 0; + uint16_t dtypeSize = 0; + uint16_t dimR = 0; +}; + namespace AR_PATTERN { static constexpr uint32_t A = 100; static constexpr uint32_t AR = 11; diff --git a/atvc/include/reduce/reduce_device.h b/atvc/include/reduce/reduce_device.h index ac454343..c30a69ab 100644 --- a/atvc/include/reduce/reduce_device.h +++ b/atvc/include/reduce/reduce_device.h @@ -24,6 +24,8 @@ #include "common/kernel_utils.h" #include "reduce/reduce_sum.h" +#include "reduce/reduce_max.h" +#include "reduce/reduce_min.h" #include "reduce/reduce_op_template.h" #endif // ATVC_REDUCE_DEVICE_H \ No newline at end of file diff --git a/atvc/include/reduce/reduce_max.h b/atvc/include/reduce/reduce_max.h new file mode 100644 index 00000000..abb54046 --- /dev/null +++ b/atvc/include/reduce/reduce_max.h @@ -0,0 +1,349 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATVC_REDUCE_MAX_H +#define ATVC_REDUCE_MAX_H + +#include "common/kernel_utils.h" +#include "reduce/common/patterns.h" +#include "reduce/utils/reduce_block_aux_util.h" +#include "reduce/common/reduce_common.h" + +namespace ATVC { +/*! + * ReduceMaxCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ +template +class ReduceMaxCompute { +public: + // Extract operator input description information from OpTraits + using inputDTypeList = typename OpTraits::In::types; + using DataType = typename ATVC::TypeListGet::Type; + using PrompteDtype = typename KernelUtils::GetPromoteType::T; + __aicore__ inline ReduceMaxCompute() {} + + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, true when UB alignment introduced invalid lanes. + * \tparam Pattern, one of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. 
+ * \param[out] dst, destination tensor (length == dimA) + * \param[in] src, source tensor (length == dimA * dimR) + */ + template + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) + { + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. Alignment with zero padding (padding value is determined by the GetAddingValue() interface + // implemented by the user) + if (std::is_same::value) { + if constexpr (needMask) { // 1. High performance alignment mode + // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary + // length (number of elements), where dimR is the original number of elements + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); + } else { + // MainR: The framework provides the calculation of the R-axis binary length (number of elements), + // where dimR is the number of elements after completion + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); + } + } + if (std::is_same::value) { + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false); + ReduceRA(dst, src, shape.value[1], shape.value[0], mainR); + } + } + + /*! + * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis. + * \param[out] dst, output tensor (length == dimA) + * \param[in] src, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ dimR (computed by the caller) + */ + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) + { + uint32_t totalNum = dimR * dimA; + uint32_t mainNum = dimA * mainR; + constexpr uint32_t dtypeSize = sizeof(PrompteDtype); + uint32_t tailNum = totalNum - mainNum; + // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes + constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; + uint16_t repeatTimes = tailNum / maskAddNum; + uint16_t repeatNum = repeatTimes * maskAddNum; + uint16_t repTailNum = tailNum - repeatNum; + // Same data block step size between different iterations + uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); + if (repeatTimes > 0) { + AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + uint16_t loopRNum = mainR; + while (loopRNum > 1) { + loopRNum = loopRNum >> 1; + mainNum = 
loopRNum * dimA; // The first half of LoopR's data volume + repeatTimes = mainNum / maskAddNum; + repeatNum = repeatTimes * maskAddNum; + repTailNum = mainNum - repeatNum; + if (repeatTimes > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + } + AscendC::DataCopy(dst, src, dimA); + } + + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. + * \param[out] dstTensor, output tensor (length == dimA) + * \param[in] srcTensor, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, padded length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ original R length + * \param[in] oriBurstLen, original (un-padded) R length used to compute tail + */ + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) + { + uint16_t tailR = oriBurstLen - mainR; + constexpr uint16_t dtypeSize = sizeof(PrompteDtype); + uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32; + uint16_t dimMax = dimA * dimR; + constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + + if (mainR > 0 && tailR > 0) { + PerformInitialMax(srcTensor, param); + } + + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2U; + PerformBinaryReduction(srcTensor, param); + } + if (param.loopRNum == 0) { // small shape, directly reduce + param.loopRNum = tailR; + } + PerformFinalReduction(dstTensor, srcTensor, param); + } + + /*! + * \brief Merge the calculation results of different data base blocks within a single UB + * \tparam Pattern Compile-time pattern tag that decides A vs. B orientation. + * \tparam V Shape descriptor (encodes dimA and dimB at runtime). + * \param[in] index, logical index identifying the data-base block. + * \param[in] shape, runtime tensor shape (dimA, dimB). + * \param[in] tempBuf, UB tensor serving as the reduction cache. + * \param[in] computeRes, UB tensor holding the newest partial result. + */ + template + __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, + const AscendC::LocalTensor& computeRes) + { + int64_t cacheID = KernelUtils::Reduce::GetCacheID(index); + int64_t dimA = Pattern::TailA ? 
shape.value[1] : shape.value[0]; + int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); + int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; + uint16_t outerLoopTimes = OpsUtils::CeilDiv( + static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); + uint16_t innerLoopTimes = cacheID; + uint32_t outerLoopStride = element_one_repeat; + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis + AscendC::LocalTensor dstTensor = tempBuf; + AscendC::LocalTensor srcTensor = computeRes; + uint32_t cah = cacheID * stride; + + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA + uint32_t srcIdx = i * outerLoopStride; + for (uint16_t j = 0; j < innerLoopTimes; ++j) { + AscendC::Max(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); + AscendC::PipeBarrier(); + } + DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); + } + } + + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. + * \param[in] ubTensorLeft, left operand (in-place result). + * \param[in] ubTensorRight, right operand (read-only). + * \param[in] calCount, number of elements to reduce. + */ + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) + { + AscendC::Max(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); + } + + /*! + * \brief Return the value used for padding when UB alignment is required. + * For MAX-reduction the neutral element is -∞ or 0. + * \tparam U, scalar type identical to DataType or PromoteDataType. + * \return The padding value (-∞ or 0). 
+ */ + template + __aicore__ inline U GetPaddingValue() + { + // Due to the fact that ReduceMax accumulates R-axis data, the values of the supplemented elements + // are set to -∞ or 0 to ensure that the accumulated result is not affected + if(AscendC::IsSameType::value){ + return INT32_MIN; + }else if(AscendC::IsSameType::value){ + return 0; + }else{ + return -1.0f / 0.0f; + } + } + +private: + __aicore__ inline void PerformInitialMax(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Max(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Max(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Max(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Max(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMax repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? 
reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMax( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // Cast to float for higher-precision accumulation + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMax repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMax( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } +}; +} // namespace ATVC + +#endif // ATVC_REDUCE_MAX_H diff --git a/atvc/include/reduce/reduce_min.h b/atvc/include/reduce/reduce_min.h new file mode 100644 index 00000000..ad88e5fb --- /dev/null +++ b/atvc/include/reduce/reduce_min.h @@ -0,0 +1,349 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATVC_REDUCE_MIN_H +#define ATVC_REDUCE_MIN_H + +#include "common/kernel_utils.h" +#include "reduce/common/patterns.h" +#include "reduce/utils/reduce_block_aux_util.h" +#include "reduce/common/reduce_common.h" + +namespace ATVC { +/*! + * ReduceMinCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ +template +class ReduceMinCompute { +public: + // Extract operator input description information from OpTraits + using inputDTypeList = typename OpTraits::In::types; + using DataType = typename ATVC::TypeListGet::Type; + using PrompteDtype = typename KernelUtils::GetPromoteType::T; + __aicore__ inline ReduceMinCompute() {} + + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, true when UB alignment introduced invalid lanes. + * \tparam Pattern, one of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. 
+ * \param[out] dst, destination tensor (length == dimA) + * \param[in] src, source tensor (length == dimA * dimR) + */ + template + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) + { + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. Alignment with zero padding (padding value is determined by the GetAddingValue() interface + // implemented by the user) + if (std::is_same::value) { + if constexpr (needMask) { // 1. High performance alignment mode + // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary + // length (number of elements), where dimR is the original number of elements + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); + } else { + // MainR: The framework provides the calculation of the R-axis binary length (number of elements), + // where dimR is the number of elements after completion + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); + } + } + if (std::is_same::value) { + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false); + ReduceRA(dst, src, shape.value[1], shape.value[0], mainR); + } + } + + /*! + * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis. + * \param[out] dst, output tensor (length == dimA) + * \param[in] src, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ dimR (computed by the caller) + */ + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) + { + uint32_t totalNum = dimR * dimA; + uint32_t mainNum = dimA * mainR; + constexpr uint32_t dtypeSize = sizeof(PrompteDtype); + uint32_t tailNum = totalNum - mainNum; + // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes + constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; + uint16_t repeatTimes = tailNum / maskAddNum; + uint16_t repeatNum = repeatTimes * maskAddNum; + uint16_t repTailNum = tailNum - repeatNum; + // Same data block step size between different iterations + uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); + if (repeatTimes > 0) { + AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + uint16_t loopRNum = mainR; + while (loopRNum > 1) { + loopRNum = loopRNum >> 1; + mainNum = 
loopRNum * dimA; // The first half of LoopR's data volume + repeatTimes = mainNum / maskAddNum; + repeatNum = repeatTimes * maskAddNum; + repTailNum = mainNum - repeatNum; + if (repeatTimes > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + } + AscendC::DataCopy(dst, src, dimA); + } + + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. + * \param[out] dstTensor, output tensor (length == dimA) + * \param[in] srcTensor, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, padded length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ original R length + * \param[in] oriBurstLen, original (un-padded) R length used to compute tail + */ + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) + { + uint16_t tailR = oriBurstLen - mainR; + constexpr uint16_t dtypeSize = sizeof(PrompteDtype); + uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32; + uint16_t dimMax = dimA * dimR; + constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + + if (mainR > 0 && tailR > 0) { + PerformInitialMin(srcTensor, param); + } + + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2U; + PerformBinaryReduction(srcTensor, param); + } + if (param.loopRNum == 0) { // small shape, directly reduce + param.loopRNum = tailR; + } + PerformFinalReduction(dstTensor, srcTensor, param); + } + + /*! + * \brief Merge the calculation results of different data base blocks within a single UB + * \tparam Pattern Compile-time pattern tag that decides A vs. B orientation. + * \tparam V Shape descriptor (encodes dimA and dimB at runtime). + * \param[in] index, logical index identifying the data-base block. + * \param[in] shape, runtime tensor shape (dimA, dimB). + * \param[in] tempBuf, UB tensor serving as the reduction cache. + * \param[in] computeRes, UB tensor holding the newest partial result. + */ + template + __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, + const AscendC::LocalTensor& computeRes) + { + int64_t cacheID = KernelUtils::Reduce::GetCacheID(index); + int64_t dimA = Pattern::TailA ? 
shape.value[1] : shape.value[0]; + int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); + int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; + uint16_t outerLoopTimes = OpsUtils::CeilDiv( + static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); + uint16_t innerLoopTimes = cacheID; + uint32_t outerLoopStride = element_one_repeat; + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis + AscendC::LocalTensor dstTensor = tempBuf; + AscendC::LocalTensor srcTensor = computeRes; + uint32_t cah = cacheID * stride; + + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA + uint32_t srcIdx = i * outerLoopStride; + for (uint16_t j = 0; j < innerLoopTimes; ++j) { + AscendC::Min(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); + AscendC::PipeBarrier(); + } + DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); + } + } + + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. + * \param[in] ubTensorLeft, left operand (in-place result). + * \param[in] ubTensorRight, right operand (read-only). + * \param[in] calCount, number of elements to reduce. + */ + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) + { + AscendC::Min(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); + } + + /*! + * \brief Return the value used for padding when UB alignment is required. + * For MIN-reduction the neutral element is +∞ or 0. + * \tparam U, scalar type identical to DataType or PromoteDataType. + * \return The padding value (+∞ or 0). 
+ */ + template + __aicore__ inline U GetPaddingValue() + { + // Due to the fact that ReduceMin accumulates R-axis data, the values of the supplemented elements + // are set to +∞ or 0 to ensure that the accumulated result is not affected + if(AscendC::IsSameType::value){ + return INT32_MAX; + }else if(AscendC::IsSameType::value){ + return INT32_MAX - INT32_MIN; + }else{ + return 1.0f / 0.0f; + } + } + +private: + __aicore__ inline void PerformInitialMin(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Min(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Min(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Min(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Min(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMin repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? 
reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMin( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // Cast to float for higher-precision accumulation + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMin repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMin( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } +}; +} // namespace ATVC + +#endif // ATVC_REDUCE_MIN_H diff --git a/atvc/include/reduce/reduce_op_template.h b/atvc/include/reduce/reduce_op_template.h index 37a19be9..83bb2604 100644 --- a/atvc/include/reduce/reduce_op_template.h +++ b/atvc/include/reduce/reduce_op_template.h @@ -148,7 +148,9 @@ public: template __aicore__ inline void AllocTensorAux(AscendC::LocalTensor& tensor) { - bufPool_.AllocTensor(tensor); + T DupValue = needDup ? compute_.template GetPaddingValue() : 0; + bufPool_.AllocTensor(tensor, DupValue); + // bufPool_.AllocTensor(tensor); } /*! diff --git a/atvc/include/reduce/reduce_sum.h b/atvc/include/reduce/reduce_sum.h index b94ebad5..563ed5ec 100644 --- a/atvc/include/reduce/reduce_sum.h +++ b/atvc/include/reduce/reduce_sum.h @@ -14,20 +14,7 @@ #include "common/kernel_utils.h" #include "reduce/common/patterns.h" #include "reduce/utils/reduce_block_aux_util.h" - -namespace { -struct ReduceARParam { - uint32_t repStride = 0; - uint16_t dimA = 0; - uint16_t dimMax = 0; - uint16_t mainR = 0; - uint16_t tailR = 0; - uint64_t maskAddRNum = 0; - uint16_t loopRNum = 0; - uint16_t dtypeSize = 0; - uint16_t dimR = 0; -}; -} +#include "reduce/common/reduce_common.h" namespace ATVC { /*! diff --git a/atvc/include/reduce/utils/reduce_block_aux.h b/atvc/include/reduce/utils/reduce_block_aux.h index 2fa2359c..1c209adc 100644 --- a/atvc/include/reduce/utils/reduce_block_aux.h +++ b/atvc/include/reduce/utils/reduce_block_aux.h @@ -235,7 +235,7 @@ public: computeTensor = ubTensor; } else { // The index of AlloccomputeTensorAux does not require external perception - op_->ReduceOp::template AllocTensorAux(computeTensor); + op_->ReduceOp::template AllocTensorAux(computeTensor); CopyIn(view, shape, ubTensor); SetEvent(AscendC::HardEvent::MTE2_V); AscendC::Cast(computeTensor, ubTensor, AscendC::RoundMode::CAST_NONE, shape.value[0] * shape.value[1]); @@ -247,14 +247,21 @@ public: __aicore__ inline void LinearComputeR(int64_t& tmpBufOffest, V& shape, Args... 
args) { SliceView view; + bool needDup = false; for (int64_t i = 0; i < bisectionTail; i++) { AscendC::LocalTensor tensorLeft; - op_->ReduceOp::template AllocTensorAux(tensorLeft); + op_->ReduceOp::template AllocTensorAux(tensorLeft); AscendC::LocalTensor computeLeft; PrePareReduce<(!InnerPattern::TailA), false>(i, view, shape, tensorLeft, computeLeft); AscendC::LocalTensor tensorRight; - op_->ReduceOp::template AllocTensorAux(tensorRight); + needDup = i == bisectionTail - 1; + if(needDup){ + op_->ReduceOp::template AllocTensorAux(tensorRight); + }else{ + op_->ReduceOp::template AllocTensorAux(tensorRight); + } + // op_->ReduceOp::template AllocTensorAux(tensorRight); AscendC::LocalTensor computeRight; PrePareReduce<(!InnerPattern::TailA), true>(i, view, shape, tensorRight, computeRight); ComputeMerge(shape, computeLeft, computeRight, args...); @@ -267,7 +274,13 @@ public: for (int64_t i = bisectionTail; i < bisectionPos; i++) { AscendC::LocalTensor tensor; - op_->ReduceOp::template AllocTensorAux(tensor); + needDup = i == bisectionPos -1; + if(needDup){ + op_->ReduceOp::template AllocTensorAux(tensor); + }else{ + op_->ReduceOp::template AllocTensorAux(tensor); + } + // op_->ReduceOp::template AllocTensorAux(tensor); AscendC::LocalTensor computeLeft; PrePareReduce<(!InnerPattern::TailA && Pattern::Dim > 2), false>(i, view, shape, tensor, computeLeft); Compute(shape, computeLeft, args...); diff --git a/atvc/include/reduce/utils/reduce_buf_pool.h b/atvc/include/reduce/utils/reduce_buf_pool.h index 26d2ef4a..b980b1f7 100644 --- a/atvc/include/reduce/utils/reduce_buf_pool.h +++ b/atvc/include/reduce/utils/reduce_buf_pool.h @@ -62,7 +62,10 @@ public: // Init buffer GetTPipePtr()->InitBuffer(qQue_, poolSize); AscendC::LocalTensor inputUb = qQue_.GetWithOffset(basicNum_ * inputNum, 0); - AscendC::Duplicate(inputUb, 0, basicNum_ * inputNum); + // AscendC::Duplicate(inputUb, 0, basicNum_ * inputNum); + for(int16_t i =0;i - __aicore__ inline const void AllocTensor(AscendC::LocalTensor& tensor) { + __aicore__ inline const void AllocTensor(AscendC::LocalTensor& tensor, T DupValue) { if constexpr (temp){ if constexpr (IsInput) { int32_t idx = GetPreTensorId(); @@ -96,11 +99,16 @@ public: int32_t idx = GetInputTensorId(); tensor = qQue_.GetWithOffset(basicNum_, inputUnit_.offset + idx * basicNum_ * sizeof(T)); if constexpr (needDup) { - AscendC::Duplicate(tensor, 0, basicNum_); + AscendC::PipeBarrier(); + AscendC::Duplicate(tensor, DupValue, basicNum_); + } + if(usedTBuf_[idx] || needDup){ event_t allocEventId = static_cast(GetTPipePtr()->FetchEventID()); eventIdV2Mte2_[idx] = allocEventId; + needWaitFlag_[idx] = true; AscendC::SetFlag(allocEventId); } + usedTBuf_[idx] = true; } else { int32_t idx = GetComputeTensorId(); tensor = qQue_.GetWithOffset(basicNum_, computeUnit_.offset + idx * basicNum_ * sizeof(T)); @@ -119,7 +127,11 @@ public: uint64_t offset = (uint64_t)(tensor.GetPhyAddr()); if (offset - start < computeUnit_.offset) { int32_t idx = (offset - start) / sizeof(T) / basicNum_; - AscendC::WaitFlag(eventIdV2Mte2_[idx]); + // AscendC::WaitFlag(eventIdV2Mte2_[idx]); + if(needWaitFlag_[idx]){ + AscendC::WaitFlag(eventIdV2Mte2_[idx]); + needWaitFlag_[idx] = false; + } } } @@ -153,6 +165,8 @@ private: PoolManagerUnit computeUnit_; PoolManagerUnit postUnit_; event_t eventIdV2Mte2_[MAX_INPUT_SIZE]; + bool needWaitFlag_[MAX_INPUT_SIZE]; + bool usedTBuf_[MAX_INPUT_SIZE]; AscendC::TBuf<> qQue_; int32_t basicNum_; }; // class ReduceBufPool -- Gitee From 7de9e60438ea8c22c98ae2633461bff38cbad777 Mon Sep 17 
00:00:00 2001 From: SeaElm Date: Wed, 15 Oct 2025 07:46:50 +0000 Subject: [PATCH 2/6] !692 fix simpleSoftmax bug : error calculate splitK Merge pull request !692 from SeaElm/master --- .../common/simple_softmax_common_impl.h | 30 ++++--------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/impl/activation/softmax/membase/common/simple_softmax_common_impl.h b/impl/activation/softmax/membase/common/simple_softmax_common_impl.h index 3c7507a6..2d9b00fc 100644 --- a/impl/activation/softmax/membase/common/simple_softmax_common_impl.h +++ b/impl/activation/softmax/membase/common/simple_softmax_common_impl.h @@ -308,17 +308,11 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, PipeBarrier(); Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } else { - uint32_t splitK = 0; - if constexpr (config.oriSrcK % HALF_NUM_PER_BLK == 0) { - splitK = config.oriSrcK; - } else { - splitK = AlignUp(config.oriSrcK, HALF_NUM_PER_BLK); - } Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize); PipeBarrier(); - GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, splitK, + GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.splitK, DEFAULT_REPEAT_STRIDE * HALF_FACTOR); PipeBarrier(); @@ -326,7 +320,7 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize); PipeBarrier(); - GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, splitK, + GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.splitK, DEFAULT_REPEAT_STRIDE * HALF_FACTOR); PipeBarrier(); @@ -348,18 +342,12 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, PipeBarrier(); GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); } else { - uint32_t splitK = 0; - if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { - splitK = config.oriSrcK; - } else { - splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); - } - GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, splitK, + GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, tiling.splitK, DEFAULT_REPEAT_STRIDE); PipeBarrier(); Exp(dst[offset1], dst[offset1], splitSize); PipeBarrier(); - GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, splitK, + GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.splitK, DEFAULT_REPEAT_STRIDE); } } @@ -461,20 +449,14 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, PipeBarrier(); Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } else { - uint32_t splitK = 0; - if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { - splitK = config.oriSrcK; - } else { - splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); - } Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); PipeBarrier(); - GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, splitK, + GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, tiling.splitK, DEFAULT_REPEAT_STRIDE); PipeBarrier(); Exp(tmpBuffer0, tmpBuffer0, tiling.splitSize); PipeBarrier(); - GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, splitK, + GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, tiling.splitK, 
From f2cd0201c582bb193e63e07eadc8c31d9f22cc4c Mon Sep 17 00:00:00 2001
From: Chen Ning
Date: Wed, 15 Oct 2025 08:57:16 +0000
Subject: [PATCH 3/6] =?UTF-8?q?!690=20add=20new=20namespace=20to=20"Ascend?=
 =?UTF-8?q?C::tiling"=20to=20adv=20tiling=20api=20and=20ensure=20th?=
 =?UTF-8?q?=E2=80=A6=20Merge=20pull=20request=20!690=20from=20Chen=20Ning/?=
 =?UTF-8?q?master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 cmake/scripts/gen_kernel_tiling_data_def.py | 56 +++++++++++++++++++--
 1 file changed, 53 insertions(+), 3 deletions(-)

diff --git a/cmake/scripts/gen_kernel_tiling_data_def.py b/cmake/scripts/gen_kernel_tiling_data_def.py
index 22394286..c8e08618 100644
--- a/cmake/scripts/gen_kernel_tiling_data_def.py
+++ b/cmake/scripts/gen_kernel_tiling_data_def.py
@@ -15,13 +15,52 @@
 import os
 import re
 
+_NAMESPACE = "AscendC::tiling"
+_LEGACY_TILING_STRUCTS = [
+    "LogSoftMaxTiling",
+    "SoftMaxTiling",
+    "TConv3DApiTiling",
+    "TConv3DBpFilterTiling",
+    "Conv3DBpFilterParams",
+    "TConv3DBpFilterBasicBlockTiling",
+    "Conv3DBackpropFilterTilingData",
+    "TConv3DBackpropInputTiling",
+    "Conv3DBackpropInputTilingData",
+    "Mc2ServerCfg",
+    "Mc2HcommCfg",
+    "Mc2InitTiling",
+    "Mc2CcTiling",
+    "TCubeTiling",
+    "BatchNormTiling",
+    "DeepNormTiling",
+    "GroupNormTiling",
+    "LayerNormGradBetaTiling",
+    "LayerNormGradTiling",
+    "LayerNormTiling",
+    "LayerNormSeparateTiling",
+    "RmsNormTiling",
+    "UnPadTiling",
+    "PadTiling",
+    "TopkTiling",
+    "ConfusionTransposeTiling"
+]
+
+
 def gen_tiling(tiling_header_file):
     single_tiling_source = ""
+    single_legacy_tiling_export = ""
     if not os.path.exists(tiling_header_file):
         print("warning: no userdef tiling header file: ", tiling_header_file)
         return single_tiling_source
     print("generate tiling def header file: ", tiling_header_file)
     pattern = re.compile(r'[(](.*)[)]', re.S)
+
+    def parse_legacy_tiling(struct_def):
+        # export legacy tiling structs with using declarations to ensure compatibility
+        nonlocal single_legacy_tiling_export
+        if struct_def in _LEGACY_TILING_STRUCTS:
+            single_legacy_tiling_export += f"using {_NAMESPACE}::{struct_def};\n"
+
     with open(tiling_header_file, 'r') as fd:
         lines = fd.readlines()
         for line in lines:
@@ -29,8 +68,9 @@ def gen_tiling(tiling_header_file):
             if (line.startswith('BEGIN_TILING_DATA_DEF')):
                 single_tiling_source += '#pragma pack(push, 8)\n'
                 single_tiling_source += 'struct '
-                struct_def = re.findall(pattern, line)[0]
+                struct_def = re.findall(pattern, line)[0]
                 single_tiling_source += struct_def + ' {\n'
+                parse_legacy_tiling(struct_def)
             elif (line.startswith('TILING_DATA_FIELD_DEF_ARR')):
                 field_params = re.findall(pattern, line)[0]
                 fds = field_params.split(',')
@@ -46,7 +86,7 @@ def gen_tiling(tiling_header_file):
             elif (line.startswith('END_TILING_DATA_DEF')):
                 single_tiling_source += '};\n'
                 single_tiling_source += '#pragma pack(pop)\n'
-    return single_tiling_source
+    return single_tiling_source, single_legacy_tiling_export
 
 
@@ -62,6 +102,7 @@
 #endif
 """
+    res += "namespace AscendC {\nnamespace tiling {\n"
     print("[LOG]: ", sys.argv[1], sys.argv[2], sys.argv[3])
     src_tiling_data_path = sys.argv[1]
     file_list = []
@@ -79,8 +120,17 @@
         if file.endswith("tilingdata.h") and file not in file_set:
             file_list.append(os.path.join(root, file))
     file_list.sort()
+
+    tiling_source = ""
+    legacy_tiling_export = ""
     for file in file_list:
-        res += gen_tiling(file)
+        src, exp = gen_tiling(file)
+        tiling_source += src
+        legacy_tiling_export += exp
+
+    res += tiling_source + "} // namespace tiling\n} // namespace AscendC\n\n"
+    res += legacy_tiling_export
+
     res += '#endif\n'
 
     generate_file = sys.argv[3]
-- 
Gitee
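This script change wraps every generated tiling struct in namespace AscendC::tiling and then re-exports a fixed list of legacy structs with using-declarations, so old unqualified code keeps compiling. One caveat visible in the diff: the early `return single_tiling_source` on the missing-file path still returns a bare string while the normal path now returns a (source, exports) pair, so the `src, exp = gen_tiling(file)` unpacking would fail if that path were ever taken. A hypothetical slice of the generated header is sketched below; the struct name comes from _LEGACY_TILING_STRUCTS, but the fields are placeholders, not the real SoftMaxTiling layout.

```cpp
#include <cstdint>

// Hypothetical gen_kernel_tiling_data_def.py output for one legacy struct.
namespace AscendC {
namespace tiling {
#pragma pack(push, 8)
struct SoftMaxTiling {
    uint32_t placeholderField0;  // illustrative fields only
    uint32_t placeholderField1;
};
#pragma pack(pop)
} // namespace tiling
} // namespace AscendC

// Emitted after the namespace is closed, per the script's legacy export list:
using AscendC::tiling::SoftMaxTiling;

int main() {
    SoftMaxTiling legacySpelling{};                              // old unqualified name
    AscendC::tiling::SoftMaxTiling &qualified = legacySpelling;  // new qualified name
    return static_cast<int>(qualified.placeholderField0);
}
```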
From 0d7757ea5b5b3baa43ac45a894b3cc1ab3663107 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B1=9F=E4=BF=8A=E6=88=90?=
Date: Wed, 15 Oct 2025 09:35:05 +0000
Subject: [PATCH 4/6] =?UTF-8?q?!691=20update=20license=20Merge=20pull=20re?=
 =?UTF-8?q?quest=20!691=20from=20=E6=B1=9F=E4=BF=8A=E6=88=90/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 LICENSE | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/LICENSE b/LICENSE
index 72a0affe..b41cefbb 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
-CANN Open Software License Agreement Version 1.0
+CANN Open Software License Agreement Version 2.0
 
-This CANN Open Software License Agreement Version 1.0 (hereinafter referred to as this "Agreement") is a legal agreement between you and Huawei, and it governs your use, modification, or distribution of CANN Open Software (hereinafter referred to as "Software"). Please read this Agreement carefully.
+This CANN Open Software License Agreement Version 2.0 (hereinafter referred to as this "Agreement") is a legal agreement between you and Huawei, and it governs your use, modification, or distribution of CANN Open Software (hereinafter referred to as "Software"). Please read this Agreement carefully.
 
 If you are entering into this Agreement on behalf of a company or other legal entity, you represent that you have the legal authority to bind that entity to this Agreement, in which case "you" will mean the entity you represent.
@@ -10,28 +10,27 @@ BY DOWNLOADING, INSTALLING, OR USING THE SOFTWARE, YOU AGREE YOU HAVE FULLY UNDE
 
 1.1 Software means the APIs, source code files, binaries, and related documents of Compute Architecture for Neural Networks("CANN") that are licensable by Huawei, and provided and licensed under this Agreement.
 
-1.2 Ascend processors means the chipsets branded with "Ascend" that are manufactured and supplied by Huawei.
+1.2 Huawei AI Processors mean AI chipsets (i) branded with "Ascend", "Kirin", "Yueying" or other brands owned or controlled by Huawei; or (ii) manufactured (including have manufactured), supplied (including have supplied) or designed (including have designed) by Huawei.
 
 2. Grant of Intellectual Property Rights
 
-Subject to the terms and conditions of this Agreement, including your full compliance thereof, Huawei hereby grants you a limited, worldwide, royalty-free, non-transferable, non-sublicensable, and revocable license for you to (i) download, use, modify, integrate, and distribute the Software or its derivative works for the purpose of developing software solely for use with Ascend processors, and (ii) distribute the software developed under (i) solely for use with Ascend processors.
+2.1 Subject to the terms and conditions of this Agreement, including your full compliance thereof, Huawei hereby grants you a limited, worldwide, royalty-free, non-transferable, non-sublicensable, and revocable license for you to (i) download, use, modify, integrate, and distribute the Software or its derivative works for the purpose of developing software solely for use in systems with Huawei AI Processors and/or Software, and (ii) distribute any software developed based upon Software and/or its derivative works solely for use in systems with Huawei AI Processors and/or Software.
 
 3. Restrictions
 
-3.1 You are not authorized to, and shall not use, modify, or distribute this Software or its derivative works for any purpose except those expressly permitted by this Agreement. You shall not make any use of the Software or its derivative works to develop or distribute software for use in systems with processors other than Ascend processors.
+3.1 You are not authorized to, and shall not use, modify, or distribute this Software or its derivative works for any other purposes than those expressly permitted by this Agreement. You shall not make any use of the Software or its derivative works to develop or distribute any software for use in systems with processors other than Huawei AI processors. All rights not expressly granted herein are expressly reserved by Huawei.
 
 3.2 You are not authorized to, and shall not remove, obscure, or alter any copyright or other notices in this Software or any part of it.
 
-3.3 Distribution Restrictions.
-You may distribute the Software or its derivative works in any medium, whether in source or executable forms, provided that you comply with the purpose restriction stipulated in Section 2, provide recipients with a copy of this Agreement, and retain all notices in the Software.
+3.3 Distribution Restrictions
+You may distribute the Software or its derivative works in any medium, whether in source or executable forms, for the purpose stipulated in Section 2; provided that you provide recipients with a copy of this Agreement, and retain all notices in the Software.
 
 4. Disclaimer of Warranty and Limitation of Liability
 
-THE SOFTWARE IS PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL HUAWEI OR ANY OTHER COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE, IN WHOLE OR IN PART, NO MATTER HOW IT’S CAUSED OR THE LEGAL THEORY IT IS BASED ON, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+THE SOFTWARE IS PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL HUAWEI OR ANY OTHER COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE, IN WHOLE OR IN PART, NO MATTER HOW IT IS CAUSED OR THE LEGAL THEORY IT IS BASED ON, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
 
 5. Termination
 
 5.1 This Agreement will continue to apply until terminated by either you or Huawei as described below:
 a.You may terminate this Agreement by ceasing your use of the Software;
 b. Huawei may at any time, terminate this Agreement if: (i) you fail to comply with any term of this Agreement; or (ii) you directly or indirectly initiate any legal proceeding against any individual or entity by alleging that the Software or any part of it infringes your intellectual property rights.
-
-5.2 By termination, all the rights granted to you under this Agreement are terminated, and you shall cease to use and delete this Software or its derivative works immediately.
+5.2 By termination, all the rights granted to you under this Agreement are terminated, and you shall cease to use and delete this Software or any derivative works immediately. All rights granted to you under this Agreement shall hereby be void ab initio in the event of termination in accordance with Section 5.1. b above. Huawei reserves the right to pursue any and all legal remedies available to enforce the terms and conditions of this Agreement or to protect Huawei’s intellectual property rights for such breach or violation. All provisions shall survive the termination of this Agreement except for Section 2 and Section 3.3.
 
 6. MISCELLANEOUS
 
 If the application of any provision of this Agreement to any particular facts or circumstances is held to be invalid or unenforceable by a court of competent jurisdiction, then (a) the validity and enforceability of such provision as applied to any other particular facts or circumstances and the validity of other provisions of this Agreement shall not in any way be affected or impaired thereby and (b) such provision shall be enforced to the maximum extent possible so as to affect the intent of the you and Huawei and reformed without further action by you and Huawei to the extent necessary to make such provision valid and enforceable.
-- 
Gitee
From 8da2db33b59521f788fe4937bbe51a17385fdd5c Mon Sep 17 00:00:00 2001
From: chen_zhoujie
Date: Wed, 15 Oct 2025 09:44:40 +0000
Subject: [PATCH 5/6] =?UTF-8?q?!696=20=E5=88=A0=E9=99=A4TopK=E5=AF=B9?=
 =?UTF-8?q?=E4=BA=8Einner=E7=9A=84host=E6=A0=A1=E9=AA=8C=20Merge=20pull=20?=
 =?UTF-8?q?request=20!696=20from=20chen=5Fzhoujie/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 impl/sort/topk/topk_tiling_impl.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/impl/sort/topk/topk_tiling_impl.cpp b/impl/sort/topk/topk_tiling_impl.cpp
index 3f807cf2..9f936a6d 100644
--- a/impl/sort/topk/topk_tiling_impl.cpp
+++ b/impl/sort/topk/topk_tiling_impl.cpp
@@ -368,9 +368,6 @@ void CheckTopKHostCommon(const char *apiName, const char *hostFuncName,
     if (mode == TopKMode::TOPK_NSMALL) {
         ASCENDC_HOST_ASSERT(inner == 32, return,
             "[%s][%s] In Small mode, the length of the inner axis must be 32!", apiName, hostFuncName);
-    } else if (mode == TopKMode::TOPK_NORMAL) {
-        ASCENDC_HOST_ASSERT(inner <= 4096, return,
-            "[%s][%s] In Normal mode, the maximum length of the inner axis is 4096!", apiName, hostFuncName);
     }
     if (socVersion == platform_ascendc::SocVersion::ASCEND310P && isInitIndex == false) {
         ASCENDC_HOST_ASSERT(inner <= 2048, return,
-- 
Gitee
From 39676f2a40213fb342f675e4cb2641be1a4c4882 Mon Sep 17 00:00:00 2001
From: Chen Ning
Date: Wed, 15 Oct 2025 08:57:16 +0000
Subject: [PATCH 6/6] =?UTF-8?q?!690=20add=20new=20namespace=20to=20"Ascend?=
 =?UTF-8?q?C::tiling"=20to=20adv=20tiling=20api=20and=20ensure=20th?=
 =?UTF-8?q?=E2=80=A6=20Merge=20pull=20request=20!690=20from=20Chen=20Ning/?=
 =?UTF-8?q?master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 cmake/scripts/gen_kernel_tiling_data_def.py   | 56 ++++++++++++++++++-
 .../gelu/test_operator_fast_gelu.cpp          |  2 +-
 .../gelu/test_operator_fast_gelu_v2.cpp       |  2 +-
 tests/activation/gelu/test_operator_gelu.cpp  |  2 +-
 tests/activation/silu/test_operator_silu.cpp  |  2 +-
 .../softmax/test_operator_softmax_v220.cpp    |  2 +-
 .../softmax/test_operator_softmax_v300.cpp    |  2 +-
 .../test_operator_softmaxflashv3_v220.cpp     |  2 +-
 .../activation/swish/test_operator_swish.cpp  |  2 +-
 .../filter/dropout/test_operator_dropout.cpp  |  2 +-
 .../test_operator_arithprogression.cpp        |  2 +-
 .../exp/test_operator_exphighprecision.cpp    |  2 +-
 .../batchnorm/test_operator_batchnorm.cpp     |  2 +-
 .../deepnorm/test_operator_deepnorm.cpp       |  2 +-
 .../groupnorm/test_operator_groupnorm.cpp     |  2 +-
 .../layernorm/test_operator_layernorm.cpp     |  2 +-
 .../layernorm/test_operator_layernormgrad.cpp |  2 +-
 .../test_operator_layernormgradbeta.cpp       |  2 +-
 .../layernormV2/test_operator_layernormV2.cpp |  2 +-
 .../normalize/test_operator_normalize.cpp     |  2 +-
 .../rmsnorm/test_operator_rmsnorm.cpp         |  2 +-
 .../test_operator_welfordfinalize.cpp         |  2 +-
 .../test_operator_welfordupdate.cpp           |  2 +-
 tests/pad/pad/test_operator_pad.cpp           |  2 +-
 .../antiquant/test_ascend_antiquant.cpp       |  2 +-
 .../test_ascend_antiquant_scalar.cpp          |  2 +-
 .../test_ascend_antiquant_weight.cpp          |  2 +-
 .../test_ascend_antiquant_weight_scalar.cpp   |  2 +-
 tests/reduce/mean/test_operator_mean.cpp      |  2 +-
 tests/reduce/sum/test_operator_sum.cpp        |  2 +-
 tests/sort/topk/test_operator_topk.cpp        |  2 +-
 .../test_operator_confusion_transpose.cpp     |  2 +-
 32 files changed, 84 insertions(+), 34 deletions(-)

diff --git a/cmake/scripts/gen_kernel_tiling_data_def.py b/cmake/scripts/gen_kernel_tiling_data_def.py
index 22394286..c8e08618 100644
--- a/cmake/scripts/gen_kernel_tiling_data_def.py
+++ b/cmake/scripts/gen_kernel_tiling_data_def.py
@@ -15,13 +15,52 @@
 import os
 import re
 
+_NAMESPACE = "AscendC::tiling"
+_LEGACY_TILING_STRUCTS = [
+    "LogSoftMaxTiling",
+    "SoftMaxTiling",
+    "TConv3DApiTiling",
+    "TConv3DBpFilterTiling",
+    "Conv3DBpFilterParams",
+    "TConv3DBpFilterBasicBlockTiling",
+    "Conv3DBackpropFilterTilingData",
+    "TConv3DBackpropInputTiling",
+    "Conv3DBackpropInputTilingData",
+    "Mc2ServerCfg",
+    "Mc2HcommCfg",
+    "Mc2InitTiling",
+    "Mc2CcTiling",
+    "TCubeTiling",
+    "BatchNormTiling",
+    "DeepNormTiling",
+    "GroupNormTiling",
+    "LayerNormGradBetaTiling",
+    "LayerNormGradTiling",
+    "LayerNormTiling",
+    "LayerNormSeparateTiling",
+    "RmsNormTiling",
+    "UnPadTiling",
+    "PadTiling",
+    "TopkTiling",
+    "ConfusionTransposeTiling"
+]
+
+
 def gen_tiling(tiling_header_file):
     single_tiling_source = ""
+    single_legacy_tiling_export = ""
     if not os.path.exists(tiling_header_file):
         print("warning: no userdef tiling header file: ", tiling_header_file)
         return single_tiling_source
     print("generate tiling def header file: ", tiling_header_file)
     pattern = re.compile(r'[(](.*)[)]', re.S)
+
+    def parse_legacy_tiling(struct_def):
+        # export legacy tiling structs with using declarations to ensure compatibility
+        nonlocal single_legacy_tiling_export
+        if struct_def in _LEGACY_TILING_STRUCTS:
+            single_legacy_tiling_export += f"using {_NAMESPACE}::{struct_def};\n"
+
     with open(tiling_header_file, 'r') as fd:
         lines = fd.readlines()
         for line in lines:
@@ -29,8 +68,9 @@ def gen_tiling(tiling_header_file):
             if (line.startswith('BEGIN_TILING_DATA_DEF')):
                 single_tiling_source += '#pragma pack(push, 8)\n'
                 single_tiling_source += 'struct '
-                struct_def = re.findall(pattern, line)[0]
+                struct_def = re.findall(pattern, line)[0]
                 single_tiling_source += struct_def + ' {\n'
+                parse_legacy_tiling(struct_def)
             elif (line.startswith('TILING_DATA_FIELD_DEF_ARR')):
                 field_params = re.findall(pattern, line)[0]
                 fds = field_params.split(',')
@@ -46,7 +86,7 @@ def gen_tiling(tiling_header_file):
             elif (line.startswith('END_TILING_DATA_DEF')):
                 single_tiling_source += '};\n'
                 single_tiling_source += '#pragma pack(pop)\n'
-    return single_tiling_source
+    return single_tiling_source, single_legacy_tiling_export
 
 
@@ -62,6 +102,7 @@
 #endif
 """
+    res += "namespace AscendC {\nnamespace tiling {\n"
     print("[LOG]: ", sys.argv[1], sys.argv[2], sys.argv[3])
     src_tiling_data_path = sys.argv[1]
     file_list = []
@@ -79,8 +120,17 @@
         if file.endswith("tilingdata.h") and file not in file_set:
             file_list.append(os.path.join(root, file))
     file_list.sort()
+
+    tiling_source = ""
+    legacy_tiling_export = ""
     for file in file_list:
-        res += gen_tiling(file)
+        src, exp = gen_tiling(file)
+        tiling_source += src
+        legacy_tiling_export += exp
+
+    res += tiling_source + "} // namespace tiling\n} // namespace AscendC\n\n"
+    res += legacy_tiling_export
+
     res += '#endif\n'
 
     generate_file = sys.argv[3]
diff --git a/tests/activation/gelu/test_operator_fast_gelu.cpp b/tests/activation/gelu/test_operator_fast_gelu.cpp
index ec97dba5..d15dcfb2 100644
--- a/tests/activation/gelu/test_operator_fast_gelu.cpp
+++ b/tests/activation/gelu/test_operator_fast_gelu.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/activation/gelu/test_operator_fast_gelu_v2.cpp b/tests/activation/gelu/test_operator_fast_gelu_v2.cpp
index f4d98b55..6974cdd1 100644
--- a/tests/activation/gelu/test_operator_fast_gelu_v2.cpp
+++ b/tests/activation/gelu/test_operator_fast_gelu_v2.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/activation/gelu/test_operator_gelu.cpp b/tests/activation/gelu/test_operator_gelu.cpp
index 387727a0..c668d659 100644
--- a/tests/activation/gelu/test_operator_gelu.cpp
+++ b/tests/activation/gelu/test_operator_gelu.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/activation/silu/test_operator_silu.cpp b/tests/activation/silu/test_operator_silu.cpp
index a001b04e..ad7eeb15 100644
--- a/tests/activation/silu/test_operator_silu.cpp
+++ b/tests/activation/silu/test_operator_silu.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/activation/softmax/test_operator_softmax_v220.cpp b/tests/activation/softmax/test_operator_softmax_v220.cpp
index 1261bd04..c7eacc53 100644
--- a/tests/activation/softmax/test_operator_softmax_v220.cpp
+++ b/tests/activation/softmax/test_operator_softmax_v220.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 #include 
diff --git a/tests/activation/softmax/test_operator_softmax_v300.cpp b/tests/activation/softmax/test_operator_softmax_v300.cpp
index d154acdf..c8e2e781 100644
--- a/tests/activation/softmax/test_operator_softmax_v300.cpp
+++ b/tests/activation/softmax/test_operator_softmax_v300.cpp
@@ -3,7 +3,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 #include 
diff --git a/tests/activation/softmax/test_operator_softmaxflashv3_v220.cpp b/tests/activation/softmax/test_operator_softmaxflashv3_v220.cpp
index c93705e4..84f0888f 100644
--- a/tests/activation/softmax/test_operator_softmaxflashv3_v220.cpp
+++ b/tests/activation/softmax/test_operator_softmaxflashv3_v220.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 #include 
diff --git a/tests/activation/swish/test_operator_swish.cpp b/tests/activation/swish/test_operator_swish.cpp
index 9e9103f3..e37a5b66 100644
--- a/tests/activation/swish/test_operator_swish.cpp
+++ b/tests/activation/swish/test_operator_swish.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/filter/dropout/test_operator_dropout.cpp b/tests/filter/dropout/test_operator_dropout.cpp
index 7e7b9f41..32dd61b4 100644
--- a/tests/filter/dropout/test_operator_dropout.cpp
+++ b/tests/filter/dropout/test_operator_dropout.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/index/arithprogression/test_operator_arithprogression.cpp b/tests/index/arithprogression/test_operator_arithprogression.cpp
index ca52d8a2..9ac8d53f 100644
--- a/tests/index/arithprogression/test_operator_arithprogression.cpp
+++ b/tests/index/arithprogression/test_operator_arithprogression.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/math/exp/test_operator_exphighprecision.cpp b/tests/math/exp/test_operator_exphighprecision.cpp
index 882199b2..e0bbe7a5 100644
--- a/tests/math/exp/test_operator_exphighprecision.cpp
+++ b/tests/math/exp/test_operator_exphighprecision.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
diff --git a/tests/normalization/batchnorm/test_operator_batchnorm.cpp b/tests/normalization/batchnorm/test_operator_batchnorm.cpp
index e255c926..facee7f5 100644
--- a/tests/normalization/batchnorm/test_operator_batchnorm.cpp
+++ b/tests/normalization/batchnorm/test_operator_batchnorm.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/deepnorm/test_operator_deepnorm.cpp b/tests/normalization/deepnorm/test_operator_deepnorm.cpp
index bbc72b7f..c6c1e943 100644
--- a/tests/normalization/deepnorm/test_operator_deepnorm.cpp
+++ b/tests/normalization/deepnorm/test_operator_deepnorm.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
diff --git a/tests/normalization/groupnorm/test_operator_groupnorm.cpp b/tests/normalization/groupnorm/test_operator_groupnorm.cpp
index 7dd522b9..71f98669 100644
--- a/tests/normalization/groupnorm/test_operator_groupnorm.cpp
+++ b/tests/normalization/groupnorm/test_operator_groupnorm.cpp
@@ -15,7 +15,7 @@
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/layernorm/test_operator_layernorm.cpp b/tests/normalization/layernorm/test_operator_layernorm.cpp
index 5875945b..a3bc7836 100644
--- a/tests/normalization/layernorm/test_operator_layernorm.cpp
+++ b/tests/normalization/layernorm/test_operator_layernorm.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/layernorm/test_operator_layernormgrad.cpp b/tests/normalization/layernorm/test_operator_layernormgrad.cpp
index fab91ded..81765f38 100644
--- a/tests/normalization/layernorm/test_operator_layernormgrad.cpp
+++ b/tests/normalization/layernorm/test_operator_layernormgrad.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/layernorm/test_operator_layernormgradbeta.cpp b/tests/normalization/layernorm/test_operator_layernormgradbeta.cpp
index 21dd3578..30d827cb 100644
--- a/tests/normalization/layernorm/test_operator_layernormgradbeta.cpp
+++ b/tests/normalization/layernorm/test_operator_layernormgradbeta.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/layernormV2/test_operator_layernormV2.cpp b/tests/normalization/layernormV2/test_operator_layernormV2.cpp
index 0af89bd9..fe25a875 100644
--- a/tests/normalization/layernormV2/test_operator_layernormV2.cpp
+++ b/tests/normalization/layernormV2/test_operator_layernormV2.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/normalize/test_operator_normalize.cpp b/tests/normalization/normalize/test_operator_normalize.cpp
index fd975d90..17751b1d 100644
--- a/tests/normalization/normalize/test_operator_normalize.cpp
+++ b/tests/normalization/normalize/test_operator_normalize.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/rmsnorm/test_operator_rmsnorm.cpp b/tests/normalization/rmsnorm/test_operator_rmsnorm.cpp
index 42d7072f..6cf9fb73 100644
--- a/tests/normalization/rmsnorm/test_operator_rmsnorm.cpp
+++ b/tests/normalization/rmsnorm/test_operator_rmsnorm.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp b/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp
index c5b08ec1..1411599d 100644
--- a/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp
+++ b/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include 
 using namespace std;
diff --git a/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp b/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp
index 654a40a6..8833226c 100644
--- a/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp
+++ b/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include 
 using namespace std;
diff --git a/tests/pad/pad/test_operator_pad.cpp b/tests/pad/pad/test_operator_pad.cpp
index 8bed6873..f7e4e51c 100644
--- a/tests/pad/pad/test_operator_pad.cpp
+++ b/tests/pad/pad/test_operator_pad.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 #include 
diff --git a/tests/quantization/antiquant/test_ascend_antiquant.cpp b/tests/quantization/antiquant/test_ascend_antiquant.cpp
index 105d9151..af9b3a6c 100644
--- a/tests/quantization/antiquant/test_ascend_antiquant.cpp
+++ b/tests/quantization/antiquant/test_ascend_antiquant.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 
diff --git a/tests/quantization/antiquant/test_ascend_antiquant_scalar.cpp b/tests/quantization/antiquant/test_ascend_antiquant_scalar.cpp
index f631cf0e..1128ed3d 100644
--- a/tests/quantization/antiquant/test_ascend_antiquant_scalar.cpp
+++ b/tests/quantization/antiquant/test_ascend_antiquant_scalar.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 
diff --git a/tests/quantization/antiquant/test_ascend_antiquant_weight.cpp b/tests/quantization/antiquant/test_ascend_antiquant_weight.cpp
index d66a2373..1ebd4471 100644
--- a/tests/quantization/antiquant/test_ascend_antiquant_weight.cpp
+++ b/tests/quantization/antiquant/test_ascend_antiquant_weight.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_operator_intf.h"
 #include "kernel_utils.h"
diff --git a/tests/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp b/tests/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp
index 571c86ed..d4641a89 100644
--- a/tests/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp
+++ b/tests/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 
diff --git a/tests/reduce/mean/test_operator_mean.cpp b/tests/reduce/mean/test_operator_mean.cpp
index eb96f184..cf457c2f 100644
--- a/tests/reduce/mean/test_operator_mean.cpp
+++ b/tests/reduce/mean/test_operator_mean.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include 
diff --git a/tests/reduce/sum/test_operator_sum.cpp b/tests/reduce/sum/test_operator_sum.cpp
index 015bcf70..6ca21916 100644
--- a/tests/reduce/sum/test_operator_sum.cpp
+++ b/tests/reduce/sum/test_operator_sum.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 #include 
diff --git a/tests/sort/topk/test_operator_topk.cpp b/tests/sort/topk/test_operator_topk.cpp
index a1f80967..ab8a14f6 100644
--- a/tests/sort/topk/test_operator_topk.cpp
+++ b/tests/sort/topk/test_operator_topk.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include 
 using namespace std;
diff --git a/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp b/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp
index c0624cc1..f3aa42cd 100644
--- a/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp
+++ b/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp
@@ -9,7 +9,7 @@
  */
 #include 
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 #include 
-- 
Gitee
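All of the test-file edits in this last patch correct the same one-character bug: `#define protect public` defines an unused macro named `protect` and silently leaves `protected` members hidden, whereas the tests rely on the macro to open up class internals. A minimal, self-contained illustration (not repo code; redefining keywords like this is acceptable only in white-box test builds):

```cpp
// The defines must precede the class definition (in the real tests, they
// precede #include "kernel_operator.h") so that `protected` is rewritten at
// preprocessing time, before the class is parsed.
#define private public
#define protected public  // the fix; `#define protect public` changed nothing

class Widget {  // stands in for a class the tests poke into
protected:
    int hidden_ = 42;
};

int main() {
    Widget w;
    return w.hidden_ == 42 ? 0 : 1;  // accessible only thanks to the macro above
}
```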