From 9a56f2033d4891c938f444a88f6c686fa517078a Mon Sep 17 00:00:00 2001 From: chen_g_tian <1581528722@qq.com> Date: Fri, 29 Aug 2025 17:01:07 +0800 Subject: [PATCH] =?UTF-8?q?Extend=20atvc=20reduce=20template=EF=BC=9Areduc?= =?UTF-8?q?e=5Fmax=20&=20reduce=5Fmin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- atvc/examples/reduce_max/README.md | 46 +++ atvc/examples/reduce_max/reduce_max.cpp | 173 ++++++++++ atvc/examples/reduce_min/README.md | 46 +++ atvc/examples/reduce_min/reduce_min.cpp | 173 ++++++++++ atvc/include/atvc.h | 2 + atvc/include/reduce/common/reduce_common.h | 12 + atvc/include/reduce/reduce_device.h | 2 + atvc/include/reduce/reduce_max.h | 341 +++++++++++++++++++ atvc/include/reduce/reduce_min.h | 341 +++++++++++++++++++ atvc/include/reduce/reduce_op_template.h | 4 +- atvc/include/reduce/reduce_sum.h | 15 +- atvc/include/reduce/utils/reduce_block_aux.h | 21 +- atvc/include/reduce/utils/reduce_buf_pool.h | 22 +- 13 files changed, 1175 insertions(+), 23 deletions(-) create mode 100644 atvc/examples/reduce_max/README.md create mode 100644 atvc/examples/reduce_max/reduce_max.cpp create mode 100644 atvc/examples/reduce_min/README.md create mode 100644 atvc/examples/reduce_min/reduce_min.cpp create mode 100644 atvc/include/reduce/reduce_max.h create mode 100644 atvc/include/reduce/reduce_min.h diff --git a/atvc/examples/reduce_max/README.md b/atvc/examples/reduce_max/README.md new file mode 100644 index 00000000..ebfbdd8f --- /dev/null +++ b/atvc/examples/reduce_max/README.md @@ -0,0 +1,46 @@ + + +## 概述 + +本样例介绍了利用ATVC实现ReduceMax单算子并完成功能验证 + + +## 样例支持产品型号: +- Atlas A2训练系列产品 + +## 目录结构 + +| 文件名 | 描述 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [reduce_max.cpp](./reduce_max.cpp) | ReduceSum算子代码实现以及调用样例 | + +## 算子描述 + +ReduceSum是对输入tensor的指定轴进行规约累加的计算并输出结果的Reduce类算子。 + +ReduceSum算子规格: + + + + + + + + + + + + + +
算子类型(OpType):ReduceMax
算子输入:

| name | shape | data type | format |
| ---- | -------- | --------- | ------ |
| x | 8 * 1024 | float | ND |

算子输出:

| name | shape | data type | format |
| ---- | -------- | --------- | ------ |
| y | 1 * 1024 | float | ND |

核函数名:ReduceCustom
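
为便于理解ReduceMax的规约语义,下面给出一段Host侧golden数据的参考实现示意(示意性代码,函数名`ReduceMaxAxis0`为本文假设,并非ATVC提供的接口):对shape为[dimR, dimA]的float输入在第0轴上求最大值。本样例的输入全部为1.0,因此golden结果也全部为1.0。

```cpp
#include <vector>
#include <algorithm>
#include <limits>

// 参考实现示意:对[dimR, dimA]的输入沿第0轴(R轴)做max规约,输出长度为dimA
std::vector<float> ReduceMaxAxis0(const std::vector<float> &x, int dimR, int dimA)
{
    std::vector<float> y(dimA, -std::numeric_limits<float>::infinity());
    for (int r = 0; r < dimR; ++r) {
        for (int a = 0; a < dimA; ++a) {
            y[a] = std::max(y[a], x[r * dimA + a]);
        }
    }
    return y;
}
```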
+ +## 算子运行 +在ascendc-api-adv代码仓目录下执行: +```bash +$ cd ./atvc/examples +$ bash run_examples.sh reduce_max +... +Generate golden data successfully. +... +Accuracy verification passed. +``` \ No newline at end of file diff --git a/atvc/examples/reduce_max/reduce_max.cpp b/atvc/examples/reduce_max/reduce_max.cpp new file mode 100644 index 00000000..0b64bfdd --- /dev/null +++ b/atvc/examples/reduce_max/reduce_max.cpp @@ -0,0 +1,173 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" +#include "example_common.h" + +namespace { +// ReduceSum算子的描述:一个输入,一个输出,类型均为float +using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; + +void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); +} +} + +/* + * 该函数为ReduceCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam 指向运行态ATVC::ReduceParam数据 +*/ +template +__global__ __aicore__ void ReduceCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam reduceParam) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + auto op = ATVC::Kernel::ReduceOpTemplate, Policy>(); + op.Run(x, y, &reduceParam); +} +namespace { +// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 +template +void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::ReducePolicy &policy, aclrtStream& stream) +{ + // 申请临时空间workspace + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + param.workspaceAddr = reinterpret_cast(workspaceDevice); + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY1) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY2) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY3) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY4) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY5) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY6) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY7) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY8) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY9) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY10) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY11) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY12) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY13) { + 
ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY14) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY15) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY16) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY17) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY18) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY19) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY20) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY21) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY22) { + ReduceCustom<<>>(x, y, param); + } else { + printf("[ERROR]: Cannot find any matched policy.\n"); + } + // 流同步后释放申请的param内存 + CHECK_ACL(aclrtSynchronizeStream(stream)); + CHECK_ACL(aclrtFree(workspaceDevice)); +} +} + +int32_t main(int32_t argc, char* argv[]) +{ + if (!ATVC::Host::DebugCheck()) { + printf("[ERROR]: Reduce OpTraits check failed.\n"); + return -1; + } + int32_t eleNum = 8 * 1024; + int32_t outEleNum = 1 * 1024; + size_t inputByteSize = static_cast(eleNum) * sizeof(float); + size_t outputByteSize = static_cast(outEleNum) * sizeof(float); + std::vector dim{0}; // 对第0轴执行reduce操作 + std::vector shape{8, 1024}; // 测试输入shape + std::vector inputX(eleNum, 1.0f); + std::vector golden(outEleNum, 1.0f); + printf("Generate golden data successfully.\n"); + // 初始化Acl资源 + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + uint8_t *yHost; + uint8_t *xDevice; + uint8_t *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::Host::ReduceTilingHyperParam hyperParam; + hyperParam.maxInnerA = 256;// 设置maxInnerA为256 + // Host侧调用Tiling API完成相关运行态参数的运算 + if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m, hyperParam=hyperParam)) { + printf("Reduce tiling error.\n"); + return -1; + }; + + // 调用Adapter调度接口,完成核函数的模板调用 + ReduceOpAdapter(xDevice, yDevice, param, policy, stream); + + CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); + + // 释放Acl资源 + CleanUp(xDevice, yDevice, yHost); + CleanACL(stream, context, deviceId); + + if (!VerifyResults(golden, outputY)) { + return -1; + } + + printf("Accuracy verification passed.\n"); + return 0; +} diff --git a/atvc/examples/reduce_min/README.md b/atvc/examples/reduce_min/README.md new file mode 100644 index 00000000..d4af048d --- /dev/null +++ b/atvc/examples/reduce_min/README.md @@ -0,0 +1,46 @@ + + +## 概述 + +本样例介绍了利用ATVC实现ReduceMin单算子并完成功能验证 + + +## 样例支持产品型号: +- Atlas A2训练系列产品 + +## 目录结构 + +| 文件名 | 描述 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [reduce_min.cpp](./reduce_min.cpp) 
| ReduceSum算子代码实现以及调用样例 | + +## 算子描述 + +ReduceSum是对输入tensor的指定轴进行规约累加的计算并输出结果的Reduce类算子。 + +ReduceSum算子规格: + + + + + + + + + + + + + +
算子类型(OpType):ReduceMin
算子输入:

| name | shape | data type | format |
| ---- | -------- | --------- | ------ |
| x | 8 * 1024 | float | ND |

算子输出:

| name | shape | data type | format |
| ---- | -------- | --------- | ------ |
| y | 1 * 1024 | float | ND |

核函数名:ReduceCustom
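
为便于理解ReduceMin的规约语义,下面给出一段Host侧golden数据的参考实现示意(示意性代码,函数名`ReduceMinAxis0`为本文假设,并非ATVC提供的接口):对shape为[dimR, dimA]的float输入在第0轴上求最小值。本样例的输入全部为1.0,因此golden结果也全部为1.0。

```cpp
#include <vector>
#include <algorithm>
#include <limits>

// 参考实现示意:对[dimR, dimA]的输入沿第0轴(R轴)做min规约,输出长度为dimA
std::vector<float> ReduceMinAxis0(const std::vector<float> &x, int dimR, int dimA)
{
    std::vector<float> y(dimA, std::numeric_limits<float>::infinity());
    for (int r = 0; r < dimR; ++r) {
        for (int a = 0; a < dimA; ++a) {
            y[a] = std::min(y[a], x[r * dimA + a]);
        }
    }
    return y;
}
```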
+ +## 算子运行 +在ascendc-api-adv代码仓目录下执行: +```bash +$ cd ./atvc/examples +$ bash run_examples.sh reduce_min +... +Generate golden data successfully. +... +Accuracy verification passed. +``` \ No newline at end of file diff --git a/atvc/examples/reduce_min/reduce_min.cpp b/atvc/examples/reduce_min/reduce_min.cpp new file mode 100644 index 00000000..6fd71ca4 --- /dev/null +++ b/atvc/examples/reduce_min/reduce_min.cpp @@ -0,0 +1,173 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" +#include "example_common.h" + +namespace { +// ReduceSum算子的描述:一个输入,一个输出,类型均为float +using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; + +void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); +} +} + +/* + * 该函数为ReduceCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam 指向运行态ATVC::ReduceParam数据 +*/ +template +__global__ __aicore__ void ReduceCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam reduceParam) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + auto op = ATVC::Kernel::ReduceOpTemplate, Policy>(); + op.Run(x, y, &reduceParam); +} +namespace { +// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 +template +void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::ReducePolicy &policy, aclrtStream& stream) +{ + // 申请临时空间workspace + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + param.workspaceAddr = reinterpret_cast(workspaceDevice); + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY1) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY2) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY3) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY4) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY5) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY6) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY7) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY8) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY9) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY10) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY11) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY12) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY13) { + 
ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY14) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY15) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY16) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY17) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY18) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY19) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY20) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY21) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY22) { + ReduceCustom<<>>(x, y, param); + } else { + printf("[ERROR]: Cannot find any matched policy.\n"); + } + // 流同步后释放申请的param内存 + CHECK_ACL(aclrtSynchronizeStream(stream)); + CHECK_ACL(aclrtFree(workspaceDevice)); +} +} + +int32_t main(int32_t argc, char* argv[]) +{ + if (!ATVC::Host::DebugCheck()) { + printf("[ERROR]: Reduce OpTraits check failed.\n"); + return -1; + } + int32_t eleNum = 8 * 1024; + int32_t outEleNum = 1 * 1024; + size_t inputByteSize = static_cast(eleNum) * sizeof(float); + size_t outputByteSize = static_cast(outEleNum) * sizeof(float); + std::vector dim{0}; // 对第0轴执行reduce操作 + std::vector shape{8, 1024}; // 测试输入shape + std::vector inputX(eleNum, 1.0f); + std::vector golden(outEleNum, 1.0f); + printf("Generate golden data successfully.\n"); + // 初始化Acl资源 + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + uint8_t *yHost; + uint8_t *xDevice; + uint8_t *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::Host::ReduceTilingHyperParam hyperParam; + hyperParam.maxInnerA = 256;// 设置maxInnerA为256 + // Host侧调用Tiling API完成相关运行态参数的运算 + if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m, hyperParam=hyperParam)) { + printf("Reduce tiling error.\n"); + return -1; + }; + + // 调用Adapter调度接口,完成核函数的模板调用 + ReduceOpAdapter(xDevice, yDevice, param, policy, stream); + + CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); + + // 释放Acl资源 + CleanUp(xDevice, yDevice, yHost); + CleanACL(stream, context, deviceId); + + if (!VerifyResults(golden, outputY)) { + return -1; + } + + printf("Accuracy verification passed.\n"); + return 0; +} diff --git a/atvc/include/atvc.h b/atvc/include/atvc.h index 37b9c555..6073408d 100644 --- a/atvc/include/atvc.h +++ b/atvc/include/atvc.h @@ -28,6 +28,8 @@ #include "common/kernel_utils.h" #include "elewise/elewise_op_template.h" #include "reduce/reduce_sum.h" +#include "reduce/reduce_max.h" +#include "reduce/reduce_min.h" #include "reduce/reduce_op_template.h" #include "broadcast/broadcast_compute.h" #include "broadcast/broadcast_op_template.h" diff --git 
a/atvc/include/reduce/common/reduce_common.h b/atvc/include/reduce/common/reduce_common.h index 480c04d2..d5722402 100644 --- a/atvc/include/reduce/common/reduce_common.h +++ b/atvc/include/reduce/common/reduce_common.h @@ -29,6 +29,18 @@ enum ShapeDim { DIM_BROADCAST // Broadcast axis }; +struct ReduceARParam { + uint32_t repStride = 0; + uint16_t dimA = 0; + uint16_t dimMax = 0; + uint16_t mainR = 0; + uint16_t tailR = 0; + uint64_t maskAddRNum = 0; + uint16_t loopRNum = 0; + uint16_t dtypeSize = 0; + uint16_t dimR = 0; +}; + namespace AR_PATTERN { static constexpr uint32_t A = 100; static constexpr uint32_t AR = 11; diff --git a/atvc/include/reduce/reduce_device.h b/atvc/include/reduce/reduce_device.h index a271e8f1..c1611078 100644 --- a/atvc/include/reduce/reduce_device.h +++ b/atvc/include/reduce/reduce_device.h @@ -25,6 +25,8 @@ #include "common/kernel_utils.h" #include "reduce/reduce_sum.h" +#include "reduce/reduce_max.h" +#include "reduce/reduce_min.h" #include "reduce/reduce_op_template.h" #endif // ATVC_REDUCE_DEVICE_H \ No newline at end of file diff --git a/atvc/include/reduce/reduce_max.h b/atvc/include/reduce/reduce_max.h new file mode 100644 index 00000000..906f9c08 --- /dev/null +++ b/atvc/include/reduce/reduce_max.h @@ -0,0 +1,341 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATVC_REDUCE_MAX_H +#define ATVC_REDUCE_MAX_H + +#include "common/kernel_utils.h" +#include "reduce/common/patterns.h" +#include "reduce/utils/reduce_block_aux_util.h" +#include "reduce/common/reduce_common.h" + +namespace ATVC { +/*! + * ReduceSumCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ +template +class ReduceMaxCompute { +public: + // 从OpTraits中萃取算子输入描述信息 + using inputDTypeList = typename OpTraits::In::types; + using DataType = typename ATVC::TypeListGet::Type; + using PrompteDtype = typename KernelUtils::GetPromoteType::T; + __aicore__ inline ReduceMaxCompute() {} + + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, True when UB alignment introduced invalid lanes. + * \tparam Pattern, One of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. + * \param[out] dst, Destination tensor (length == dimA) + * \param[in] src, Source tensor (length == dimA * dimR) + */ + template + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) + { + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. 
High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. Alignment with zero padding (padding value is determined by the GetAddingValue() interface + // implemented by the user) + if (std::is_same::value) { + if constexpr (needMask) { // 1. High performance alignment mode + // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary + // length (number of elements), where dimR is the original number of elements + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); + } else { + // MainR: The framework provides the calculation of the R-axis binary length (number of elements), + // where dimR is the number of elements after completion + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); + } + } + if (std::is_same::value) { + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false); + ReduceRA(dst, src, shape.value[1], shape.value[0], mainR); + } + } + + /*! + * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis. + * \param[out] dst, Output tensor (length == dimA) + * \param[in] src, Input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, Length of the non-reduced axis (A) + * \param[in] dimR, Length of the reduced axis (R) + * \param[in] mainR, Largest power-of-two ≤ dimR (computed by the caller) + */ + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) + { + uint32_t totalNum = dimR * dimA; + uint32_t mainNum = dimA * mainR; + constexpr uint32_t dtypeSize = sizeof(PrompteDtype); + uint32_t tailNum = totalNum - mainNum; + // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes + constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; + uint16_t repeatTimes = tailNum / maskAddNum; + uint16_t repeatNum = repeatTimes * maskAddNum; + uint16_t repTailNum = tailNum - repeatNum; + // Same data block step size between different iterations + uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); + if (repeatTimes > 0) { + AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + uint16_t loopRNum = mainR; + while (loopRNum > 1) { + loopRNum = loopRNum >> 1; + mainNum = loopRNum * dimA; // The first half of LoopR's data volume + repeatTimes = mainNum / maskAddNum; + repeatNum = repeatTimes * maskAddNum; + repTailNum = mainNum - repeatNum; + if (repeatTimes > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; 
+ repeatParams.src1RepStride = repStride; + AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + } + AscendC::DataCopy(dst, src, dimA); + } + + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. + * \param[out] dstTensor, Output tensor (length == dimA) + * \param[in] srcTensor, Input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, Length of the non-reduced axis (A) + * \param[in] dimR, Padded length of the reduced axis (R) + * \param[in] mainR, Largest power-of-two ≤ original R length + * \param[in] oriBurstLen, Original (un-padded) R length used to compute tail + */ + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) + { + uint16_t tailR = oriBurstLen - mainR; + constexpr uint16_t dtypeSize = sizeof(PrompteDtype); + uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32; + uint16_t dimMax = dimA * dimR; + constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + + if (mainR > 0 && tailR > 0) { + PerformInitialMax(srcTensor, param); + } + + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2U; + PerformBinaryReduction(srcTensor, param); + } + if (param.loopRNum == 0) { // small shape, directly reduce + param.loopRNum = tailR; + } + PerformFinalReduction(dstTensor, srcTensor, param); + } + + template + __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, + const AscendC::LocalTensor& computeRes) + { + int64_t cacheID = KernelUtils::Reduce::GetCacheID(index); + int64_t dimA = Pattern::TailA ? shape.value[1] : shape.value[0]; + int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); + int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; + uint16_t outerLoopTimes = OpsUtils::CeilDiv( + static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); + uint16_t innerLoopTimes = cacheID; + uint32_t outerLoopStride = element_one_repeat; + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis + AscendC::LocalTensor dstTensor = tempBuf; + AscendC::LocalTensor srcTensor = computeRes; + uint32_t cah = cacheID * stride; + + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA + uint32_t srcIdx = i * outerLoopStride; + for (uint16_t j = 0; j < innerLoopTimes; ++j) { + AscendC::Max(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); + AscendC::PipeBarrier(); + } + DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); + } + } + + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. 
+ * \param[in] ubTensorLeft, Left operand (in-place result). + * \param[in] ubTensorRight, Right operand (read-only). + * \param[in] calCount, Number of elements to reduce. + */ + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) + { + AscendC::Max(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); + } + + /*! + * \brief Return the value used for padding when UB alignment is required. + * For SUM-reduction the neutral element is 0. + * \tparam U, Scalar type identical to DataType or PromoteDataType. + * \return The padding value (always 0). + */ + template + __aicore__ inline U GetPaddingValue() + { + // Due to the fact that ReduceSum accumulates R-axis data, the values of the supplemented elements + // are set to 0 to ensure that the accumulated result is not affected + if(AscendC::IsSameType::value){ + return INT32_MIN; + }else if(AscendC::IsSameType::value){ + return 0; + }else{ + return -1.0f / 0.0f; + } + } + +private: + __aicore__ inline void PerformInitialMax(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Max(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Max(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Max(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Max(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceSum repeat-time limit is 255; split 
dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMax( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // Cast to float for higher-precision accumulation + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceSum repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMax( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } +}; +} // namespace ATVC + +#endif // ATVC_REDUCE_SUM_H diff --git a/atvc/include/reduce/reduce_min.h b/atvc/include/reduce/reduce_min.h new file mode 100644 index 00000000..c1647e7a --- /dev/null +++ b/atvc/include/reduce/reduce_min.h @@ -0,0 +1,341 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATVC_REDUCE_MIN_H +#define ATVC_REDUCE_MIN_H + +#include "common/kernel_utils.h" +#include "reduce/common/patterns.h" +#include "reduce/utils/reduce_block_aux_util.h" +#include "reduce/common/reduce_common.h" + +namespace ATVC { +/*! + * ReduceSumCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ +template +class ReduceMinCompute { +public: + // 从OpTraits中萃取算子输入描述信息 + using inputDTypeList = typename OpTraits::In::types; + using DataType = typename ATVC::TypeListGet::Type; + using PrompteDtype = typename KernelUtils::GetPromoteType::T; + __aicore__ inline ReduceMinCompute() {} + + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, True when UB alignment introduced invalid lanes. + * \tparam Pattern, One of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. 
+ * \param[out] dst, Destination tensor (length == dimA) + * \param[in] src, Source tensor (length == dimA * dimR) + */ + template + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) + { + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. Alignment with zero padding (padding value is determined by the GetAddingValue() interface + // implemented by the user) + if (std::is_same::value) { + if constexpr (needMask) { // 1. High performance alignment mode + // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary + // length (number of elements), where dimR is the original number of elements + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); + } else { + // MainR: The framework provides the calculation of the R-axis binary length (number of elements), + // where dimR is the number of elements after completion + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); + } + } + if (std::is_same::value) { + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false); + ReduceRA(dst, src, shape.value[1], shape.value[0], mainR); + } + } + + /*! + * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis. + * \param[out] dst, Output tensor (length == dimA) + * \param[in] src, Input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, Length of the non-reduced axis (A) + * \param[in] dimR, Length of the reduced axis (R) + * \param[in] mainR, Largest power-of-two ≤ dimR (computed by the caller) + */ + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) + { + uint32_t totalNum = dimR * dimA; + uint32_t mainNum = dimA * mainR; + constexpr uint32_t dtypeSize = sizeof(PrompteDtype); + uint32_t tailNum = totalNum - mainNum; + // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes + constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; + uint16_t repeatTimes = tailNum / maskAddNum; + uint16_t repeatNum = repeatTimes * maskAddNum; + uint16_t repTailNum = tailNum - repeatNum; + // Same data block step size between different iterations + uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); + if (repeatTimes > 0) { + AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + uint16_t loopRNum = mainR; + while (loopRNum > 1) { + loopRNum = loopRNum >> 1; + mainNum = 
loopRNum * dimA; // The first half of LoopR's data volume + repeatTimes = mainNum / maskAddNum; + repeatNum = repeatTimes * maskAddNum; + repTailNum = mainNum - repeatNum; + if (repeatTimes > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + } + AscendC::DataCopy(dst, src, dimA); + } + + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. + * \param[out] dstTensor, Output tensor (length == dimA) + * \param[in] srcTensor, Input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, Length of the non-reduced axis (A) + * \param[in] dimR, Padded length of the reduced axis (R) + * \param[in] mainR, Largest power-of-two ≤ original R length + * \param[in] oriBurstLen, Original (un-padded) R length used to compute tail + */ + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) + { + uint16_t tailR = oriBurstLen - mainR; + constexpr uint16_t dtypeSize = sizeof(PrompteDtype); + uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32; + uint16_t dimMax = dimA * dimR; + constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + + if (mainR > 0 && tailR > 0) { + PerformInitialMin(srcTensor, param); + } + + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2U; + PerformBinaryReduction(srcTensor, param); + } + if (param.loopRNum == 0) { // small shape, directly reduce + param.loopRNum = tailR; + } + PerformFinalReduction(dstTensor, srcTensor, param); + } + + template + __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, + const AscendC::LocalTensor& computeRes) + { + int64_t cacheID = KernelUtils::Reduce::GetCacheID(index); + int64_t dimA = Pattern::TailA ? 
shape.value[1] : shape.value[0]; + int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); + int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; + uint16_t outerLoopTimes = OpsUtils::CeilDiv( + static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); + uint16_t innerLoopTimes = cacheID; + uint32_t outerLoopStride = element_one_repeat; + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis + AscendC::LocalTensor dstTensor = tempBuf; + AscendC::LocalTensor srcTensor = computeRes; + uint32_t cah = cacheID * stride; + + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA + uint32_t srcIdx = i * outerLoopStride; + for (uint16_t j = 0; j < innerLoopTimes; ++j) { + AscendC::Min(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); + AscendC::PipeBarrier(); + } + DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); + } + } + + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. + * \param[in] ubTensorLeft, Left operand (in-place result). + * \param[in] ubTensorRight, Right operand (read-only). + * \param[in] calCount, Number of elements to reduce. + */ + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) + { + AscendC::Min(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); + } + + /*! + * \brief Return the value used for padding when UB alignment is required. + * For SUM-reduction the neutral element is 0. + * \tparam U, Scalar type identical to DataType or PromoteDataType. + * \return The padding value (always 0). 
+ */ + template + __aicore__ inline U GetPaddingValue() + { + // Due to the fact that ReduceSum accumulates R-axis data, the values of the supplemented elements + // are set to 0 to ensure that the accumulated result is not affected + if(AscendC::IsSameType::value){ + return INT32_MAX; + }else if(AscendC::IsSameType::value){ + return INT32_MAX - INT32_MIN; + }else{ + return 1.0f / 0.0f; + } + } + +private: + __aicore__ inline void PerformInitialMin(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Min(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Min(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Min(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Min(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceSum repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? 
reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMin( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // Cast to float for higher-precision accumulation + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceSum repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMin( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } +}; +} // namespace ATVC + +#endif // ATVC_REDUCE_SUM_H diff --git a/atvc/include/reduce/reduce_op_template.h b/atvc/include/reduce/reduce_op_template.h index fe466a09..d7d34862 100644 --- a/atvc/include/reduce/reduce_op_template.h +++ b/atvc/include/reduce/reduce_op_template.h @@ -117,7 +117,9 @@ public: template __aicore__ inline void AllocTensorAux(AscendC::LocalTensor& tensor) { - bufPool_.AllocTensor(tensor); + T DupValue = needDup ? compute_.template GetPaddingValue() : 0; + bufPool_.AllocTensor(tensor, DupValue); + // bufPool_.AllocTensor(tensor); } /*! diff --git a/atvc/include/reduce/reduce_sum.h b/atvc/include/reduce/reduce_sum.h index 08a24bf0..7870e3eb 100644 --- a/atvc/include/reduce/reduce_sum.h +++ b/atvc/include/reduce/reduce_sum.h @@ -15,20 +15,7 @@ #include "common/kernel_utils.h" #include "reduce/common/patterns.h" #include "reduce/utils/reduce_block_aux_util.h" - -namespace { -struct ReduceARParam { - uint32_t repStride = 0; - uint16_t dimA = 0; - uint16_t dimMax = 0; - uint16_t mainR = 0; - uint16_t tailR = 0; - uint64_t maskAddRNum = 0; - uint16_t loopRNum = 0; - uint16_t dtypeSize = 0; - uint16_t dimR = 0; -}; -} +#include "reduce/common/reduce_common.h" namespace ATVC { /*! diff --git a/atvc/include/reduce/utils/reduce_block_aux.h b/atvc/include/reduce/utils/reduce_block_aux.h index a68a169f..3f7639cb 100644 --- a/atvc/include/reduce/utils/reduce_block_aux.h +++ b/atvc/include/reduce/utils/reduce_block_aux.h @@ -234,7 +234,7 @@ public: computeTensor = ubTensor; } else { // The index of AlloccomputeTensorAux does not require external perception - op_->ReduceOp::template AllocTensorAux(computeTensor); + op_->ReduceOp::template AllocTensorAux(computeTensor); CopyIn(view, shape, ubTensor); SetEvent(AscendC::HardEvent::MTE2_V); AscendC::Cast(computeTensor, ubTensor, AscendC::RoundMode::CAST_NONE, shape.value[0] * shape.value[1]); @@ -246,14 +246,21 @@ public: __aicore__ inline void LinearComputeR(int64_t& tmpBufOffest, V& shape, Args... 
args) { SliceView view; + bool needDup = false; for (int64_t i = 0; i < bisectionTail_; i++) { AscendC::LocalTensor tensorLeft; - op_->ReduceOp::template AllocTensorAux(tensorLeft); + op_->ReduceOp::template AllocTensorAux(tensorLeft); AscendC::LocalTensor computeLeft; PrePareReduce<(!InnerPattern::TailA), false>(i, view, shape, tensorLeft, computeLeft); AscendC::LocalTensor tensorRight; - op_->ReduceOp::template AllocTensorAux(tensorRight); + needDup = i == bisectionTail_ - 1; + if(needDup){ + op_->ReduceOp::template AllocTensorAux(tensorRight); + }else{ + op_->ReduceOp::template AllocTensorAux(tensorRight); + } + // op_->ReduceOp::template AllocTensorAux(tensorRight); AscendC::LocalTensor computeRight; PrePareReduce<(!InnerPattern::TailA), true>(i, view, shape, tensorRight, computeRight); ComputeMerge(shape, computeLeft, computeRight, args...); @@ -266,7 +273,13 @@ public: for (int64_t i = bisectionTail_; i < bisectionPos_; i++) { AscendC::LocalTensor tensor; - op_->ReduceOp::template AllocTensorAux(tensor); + needDup = i == bisectionPos_ -1; + if(needDup){ + op_->ReduceOp::template AllocTensorAux(tensor); + }else{ + op_->ReduceOp::template AllocTensorAux(tensor); + } + // op_->ReduceOp::template AllocTensorAux(tensor); AscendC::LocalTensor computeLeft; PrePareReduce<(!InnerPattern::TailA && Pattern::Dim > 2), false>(i, view, shape, tensor, computeLeft); Compute(shape, computeLeft, args...); diff --git a/atvc/include/reduce/utils/reduce_buf_pool.h b/atvc/include/reduce/utils/reduce_buf_pool.h index e6779444..18d7328c 100644 --- a/atvc/include/reduce/utils/reduce_buf_pool.h +++ b/atvc/include/reduce/utils/reduce_buf_pool.h @@ -53,7 +53,10 @@ public: // Init buffer pipe_->InitBuffer(qQue_, poolSize); AscendC::LocalTensor inputUb = qQue_.GetWithOffset(basicNum_ * inputNum, 0); - AscendC::Duplicate(inputUb, 0, basicNum_ * inputNum); + // AscendC::Duplicate(inputUb, 0, basicNum_ * inputNum); + for(int16_t i =0;i - __aicore__ inline const void AllocTensor(AscendC::LocalTensor& tensor) { + __aicore__ inline const void AllocTensor(AscendC::LocalTensor& tensor, T DupValue) { if constexpr (IsInput) { int32_t idx = GetInputTensorId(); tensor = qQue_.GetWithOffset(basicNum_, inputUnit_.offset + idx * basicNum_ * sizeof(T)); if constexpr (needDup) { - AscendC::Duplicate(tensor, 0, basicNum_); + AscendC::PipeBarrier(); + AscendC::Duplicate(tensor, DupValue, basicNum_); + } + if(usedTBuf_[idx] || needDup){ event_t allocEventId = static_cast(GetTPipePtr()->FetchEventID()); eventIdV2Mte2_[idx] = allocEventId; + needWaitFlag_[idx] = true; AscendC::SetFlag(allocEventId); } + usedTBuf_[idx] = true; } else { int32_t idx = GetComputeTensorId(); tensor = qQue_.GetWithOffset(basicNum_, computeUnit_.offset + idx * basicNum_ * sizeof(T)); @@ -102,7 +110,11 @@ public: uint64_t offset = (uint64_t)(tensor.GetPhyAddr()); if (offset - start < computeUnit_.offset) { int32_t idx = (offset - start) / sizeof(T) / basicNum_; - AscendC::WaitFlag(eventIdV2Mte2_[idx]); + // AscendC::WaitFlag(eventIdV2Mte2_[idx]); + if(needWaitFlag_[idx]){ + AscendC::WaitFlag(eventIdV2Mte2_[idx]); + needWaitFlag_[idx] = false; + } } } @@ -123,6 +135,8 @@ private: PoolManagerUnit inputUnit_; PoolManagerUnit computeUnit_; event_t eventIdV2Mte2_[MAX_INPUT_SIZE]; + bool needWaitFlag_[MAX_INPUT_SIZE]; + bool usedTBuf_[MAX_INPUT_SIZE]; AscendC::TBuf<> qQue_; AscendC::TPipe* pipe_; int32_t basicNum_; -- Gitee