From 9a56f2033d4891c938f444a88f6c686fa517078a Mon Sep 17 00:00:00 2001 From: chen_g_tian <1581528722@qq.com> Date: Fri, 29 Aug 2025 17:01:07 +0800 Subject: [PATCH] =?UTF-8?q?Extend=20atvc=20reduce=20template=EF=BC=9Areduc?= =?UTF-8?q?e=5Fmax=20&=20reduce=5Fmin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- atvc/examples/reduce_max/README.md | 46 +++ atvc/examples/reduce_max/reduce_max.cpp | 173 ++++++++++ atvc/examples/reduce_min/README.md | 46 +++ atvc/examples/reduce_min/reduce_min.cpp | 173 ++++++++++ atvc/include/atvc.h | 2 + atvc/include/reduce/common/reduce_common.h | 12 + atvc/include/reduce/reduce_device.h | 2 + atvc/include/reduce/reduce_max.h | 341 +++++++++++++++++++ atvc/include/reduce/reduce_min.h | 341 +++++++++++++++++++ atvc/include/reduce/reduce_op_template.h | 4 +- atvc/include/reduce/reduce_sum.h | 15 +- atvc/include/reduce/utils/reduce_block_aux.h | 21 +- atvc/include/reduce/utils/reduce_buf_pool.h | 22 +- 13 files changed, 1175 insertions(+), 23 deletions(-) create mode 100644 atvc/examples/reduce_max/README.md create mode 100644 atvc/examples/reduce_max/reduce_max.cpp create mode 100644 atvc/examples/reduce_min/README.md create mode 100644 atvc/examples/reduce_min/reduce_min.cpp create mode 100644 atvc/include/reduce/reduce_max.h create mode 100644 atvc/include/reduce/reduce_min.h diff --git a/atvc/examples/reduce_max/README.md b/atvc/examples/reduce_max/README.md new file mode 100644 index 00000000..ebfbdd8f --- /dev/null +++ b/atvc/examples/reduce_max/README.md @@ -0,0 +1,46 @@ + + +## 概述 + +本样例介绍了利用ATVC实现ReduceMax单算子并完成功能验证 + + +## 样例支持产品型号: +- Atlas A2训练系列产品 + +## 目录结构 + +| 文件名 | 描述 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [reduce_max.cpp](./reduce_max.cpp) | ReduceSum算子代码实现以及调用样例 | + +## 算子描述 + +ReduceSum是对输入tensor的指定轴进行规约累加的计算并输出结果的Reduce类算子。 + +ReduceSum算子规格: + + + + + + + + + + + + + +
算子类型(OpType):ReduceMax
算子输入:

| name | shape | data type | format |
| ---- | -------- | --------- | ------ |
| x | 8 * 1024 | float | ND |

算子输出:

| name | shape | data type | format |
| ---- | -------- | --------- | ------ |
| y | 1 * 1024 | float | ND |

核函数名:ReduceCustom
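
为便于理解ReduceMax的规约语义,下面给出一段Host侧golden数据的参考实现示意(示意性代码,函数名`ReduceMaxAxis0`为本文假设,并非ATVC提供的接口):对shape为[dimR, dimA]的float输入在第0轴上求最大值。本样例的输入全部为1.0,因此golden结果也全部为1.0。

```cpp
#include <vector>
#include <algorithm>
#include <limits>

// 参考实现示意:对[dimR, dimA]的输入沿第0轴(R轴)做max规约,输出长度为dimA
std::vector<float> ReduceMaxAxis0(const std::vector<float> &x, int dimR, int dimA)
{
    std::vector<float> y(dimA, -std::numeric_limits<float>::infinity());
    for (int r = 0; r < dimR; ++r) {
        for (int a = 0; a < dimA; ++a) {
            y[a] = std::max(y[a], x[r * dimA + a]);
        }
    }
    return y;
}
```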
+ +## 算子运行 +在ascendc-api-adv代码仓目录下执行: +```bash +$ cd ./atvc/examples +$ bash run_examples.sh reduce_max +... +Generate golden data successfully. +... +Accuracy verification passed. +``` \ No newline at end of file diff --git a/atvc/examples/reduce_max/reduce_max.cpp b/atvc/examples/reduce_max/reduce_max.cpp new file mode 100644 index 00000000..0b64bfdd --- /dev/null +++ b/atvc/examples/reduce_max/reduce_max.cpp @@ -0,0 +1,173 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" +#include "example_common.h" + +namespace { +// ReduceSum算子的描述:一个输入,一个输出,类型均为float +using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; + +void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); +} +} + +/* + * 该函数为ReduceCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam 指向运行态ATVC::ReduceParam数据 +*/ +template +__global__ __aicore__ void ReduceCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam reduceParam) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + auto op = ATVC::Kernel::ReduceOpTemplate, Policy>(); + op.Run(x, y, &reduceParam); +} +namespace { +// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 +template +void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::ReducePolicy &policy, aclrtStream& stream) +{ + // 申请临时空间workspace + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + param.workspaceAddr = reinterpret_cast(workspaceDevice); + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY1) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY2) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY3) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY4) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY5) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY6) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY7) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY8) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY9) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY10) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY11) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY12) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY13) { + 
ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY14) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY15) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY16) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY17) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY18) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY19) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY20) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY21) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY22) { + ReduceCustom<<>>(x, y, param); + } else { + printf("[ERROR]: Cannot find any matched policy.\n"); + } + // 流同步后释放申请的param内存 + CHECK_ACL(aclrtSynchronizeStream(stream)); + CHECK_ACL(aclrtFree(workspaceDevice)); +} +} + +int32_t main(int32_t argc, char* argv[]) +{ + if (!ATVC::Host::DebugCheck()) { + printf("[ERROR]: Reduce OpTraits check failed.\n"); + return -1; + } + int32_t eleNum = 8 * 1024; + int32_t outEleNum = 1 * 1024; + size_t inputByteSize = static_cast(eleNum) * sizeof(float); + size_t outputByteSize = static_cast(outEleNum) * sizeof(float); + std::vector dim{0}; // 对第0轴执行reduce操作 + std::vector shape{8, 1024}; // 测试输入shape + std::vector inputX(eleNum, 1.0f); + std::vector golden(outEleNum, 1.0f); + printf("Generate golden data successfully.\n"); + // 初始化Acl资源 + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + uint8_t *yHost; + uint8_t *xDevice; + uint8_t *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::Host::ReduceTilingHyperParam hyperParam; + hyperParam.maxInnerA = 256;// 设置maxInnerA为256 + // Host侧调用Tiling API完成相关运行态参数的运算 + if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m, hyperParam=hyperParam)) { + printf("Reduce tiling error.\n"); + return -1; + }; + + // 调用Adapter调度接口,完成核函数的模板调用 + ReduceOpAdapter(xDevice, yDevice, param, policy, stream); + + CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); + + // 释放Acl资源 + CleanUp(xDevice, yDevice, yHost); + CleanACL(stream, context, deviceId); + + if (!VerifyResults(golden, outputY)) { + return -1; + } + + printf("Accuracy verification passed.\n"); + return 0; +} diff --git a/atvc/examples/reduce_min/README.md b/atvc/examples/reduce_min/README.md new file mode 100644 index 00000000..d4af048d --- /dev/null +++ b/atvc/examples/reduce_min/README.md @@ -0,0 +1,46 @@ + + +## 概述 + +本样例介绍了利用ATVC实现ReduceMin单算子并完成功能验证 + + +## 样例支持产品型号: +- Atlas A2训练系列产品 + +## 目录结构 + +| 文件名 | 描述 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [reduce_min.cpp](./reduce_min.cpp) 
| ReduceSum算子代码实现以及调用样例 | + +## 算子描述 + +ReduceSum是对输入tensor的指定轴进行规约累加的计算并输出结果的Reduce类算子。 + +ReduceSum算子规格: + + + + + + + + + + + + + +
算子类型(OpType):ReduceMin
算子输入:

| name | shape | data type | format |
| ---- | -------- | --------- | ------ |
| x | 8 * 1024 | float | ND |

算子输出:

| name | shape | data type | format |
| ---- | -------- | --------- | ------ |
| y | 1 * 1024 | float | ND |

核函数名:ReduceCustom
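
为便于理解ReduceMin的规约语义,下面给出一段Host侧golden数据的参考实现示意(示意性代码,函数名`ReduceMinAxis0`为本文假设,并非ATVC提供的接口):对shape为[dimR, dimA]的float输入在第0轴上求最小值。本样例的输入全部为1.0,因此golden结果也全部为1.0。

```cpp
#include <vector>
#include <algorithm>
#include <limits>

// 参考实现示意:对[dimR, dimA]的输入沿第0轴(R轴)做min规约,输出长度为dimA
std::vector<float> ReduceMinAxis0(const std::vector<float> &x, int dimR, int dimA)
{
    std::vector<float> y(dimA, std::numeric_limits<float>::infinity());
    for (int r = 0; r < dimR; ++r) {
        for (int a = 0; a < dimA; ++a) {
            y[a] = std::min(y[a], x[r * dimA + a]);
        }
    }
    return y;
}
```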
+ +## 算子运行 +在ascendc-api-adv代码仓目录下执行: +```bash +$ cd ./atvc/examples +$ bash run_examples.sh reduce_min +... +Generate golden data successfully. +... +Accuracy verification passed. +``` \ No newline at end of file diff --git a/atvc/examples/reduce_min/reduce_min.cpp b/atvc/examples/reduce_min/reduce_min.cpp new file mode 100644 index 00000000..6fd71ca4 --- /dev/null +++ b/atvc/examples/reduce_min/reduce_min.cpp @@ -0,0 +1,173 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" +#include "example_common.h" + +namespace { +// ReduceSum算子的描述:一个输入,一个输出,类型均为float +using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; + +void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); +} +} + +/* + * 该函数为ReduceCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam 指向运行态ATVC::ReduceParam数据 +*/ +template +__global__ __aicore__ void ReduceCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam reduceParam) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + auto op = ATVC::Kernel::ReduceOpTemplate, Policy>(); + op.Run(x, y, &reduceParam); +} +namespace { +// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 +template +void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::ReducePolicy &policy, aclrtStream& stream) +{ + // 申请临时空间workspace + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + param.workspaceAddr = reinterpret_cast(workspaceDevice); + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY1) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY2) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY3) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY4) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY5) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY6) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY7) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY8) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY9) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY10) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY11) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY12) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY13) { + 
ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY14) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY15) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY16) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY17) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY18) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY19) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY20) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY21) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY22) { + ReduceCustom<<>>(x, y, param); + } else { + printf("[ERROR]: Cannot find any matched policy.\n"); + } + // 流同步后释放申请的param内存 + CHECK_ACL(aclrtSynchronizeStream(stream)); + CHECK_ACL(aclrtFree(workspaceDevice)); +} +} + +int32_t main(int32_t argc, char* argv[]) +{ + if (!ATVC::Host::DebugCheck()) { + printf("[ERROR]: Reduce OpTraits check failed.\n"); + return -1; + } + int32_t eleNum = 8 * 1024; + int32_t outEleNum = 1 * 1024; + size_t inputByteSize = static_cast(eleNum) * sizeof(float); + size_t outputByteSize = static_cast(outEleNum) * sizeof(float); + std::vector dim{0}; // 对第0轴执行reduce操作 + std::vector shape{8, 1024}; // 测试输入shape + std::vector inputX(eleNum, 1.0f); + std::vector golden(outEleNum, 1.0f); + printf("Generate golden data successfully.\n"); + // 初始化Acl资源 + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + uint8_t *yHost; + uint8_t *xDevice; + uint8_t *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::Host::ReduceTilingHyperParam hyperParam; + hyperParam.maxInnerA = 256;// 设置maxInnerA为256 + // Host侧调用Tiling API完成相关运行态参数的运算 + if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m, hyperParam=hyperParam)) { + printf("Reduce tiling error.\n"); + return -1; + }; + + // 调用Adapter调度接口,完成核函数的模板调用 + ReduceOpAdapter(xDevice, yDevice, param, policy, stream); + + CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); + + // 释放Acl资源 + CleanUp(xDevice, yDevice, yHost); + CleanACL(stream, context, deviceId); + + if (!VerifyResults(golden, outputY)) { + return -1; + } + + printf("Accuracy verification passed.\n"); + return 0; +} diff --git a/atvc/include/atvc.h b/atvc/include/atvc.h index 37b9c555..6073408d 100644 --- a/atvc/include/atvc.h +++ b/atvc/include/atvc.h @@ -28,6 +28,8 @@ #include "common/kernel_utils.h" #include "elewise/elewise_op_template.h" #include "reduce/reduce_sum.h" +#include "reduce/reduce_max.h" +#include "reduce/reduce_min.h" #include "reduce/reduce_op_template.h" #include "broadcast/broadcast_compute.h" #include "broadcast/broadcast_op_template.h" diff --git 
a/atvc/include/reduce/common/reduce_common.h b/atvc/include/reduce/common/reduce_common.h index 480c04d2..d5722402 100644 --- a/atvc/include/reduce/common/reduce_common.h +++ b/atvc/include/reduce/common/reduce_common.h @@ -29,6 +29,18 @@ enum ShapeDim { DIM_BROADCAST // Broadcast axis }; +struct ReduceARParam { + uint32_t repStride = 0; + uint16_t dimA = 0; + uint16_t dimMax = 0; + uint16_t mainR = 0; + uint16_t tailR = 0; + uint64_t maskAddRNum = 0; + uint16_t loopRNum = 0; + uint16_t dtypeSize = 0; + uint16_t dimR = 0; +}; + namespace AR_PATTERN { static constexpr uint32_t A = 100; static constexpr uint32_t AR = 11; diff --git a/atvc/include/reduce/reduce_device.h b/atvc/include/reduce/reduce_device.h index a271e8f1..c1611078 100644 --- a/atvc/include/reduce/reduce_device.h +++ b/atvc/include/reduce/reduce_device.h @@ -25,6 +25,8 @@ #include "common/kernel_utils.h" #include "reduce/reduce_sum.h" +#include "reduce/reduce_max.h" +#include "reduce/reduce_min.h" #include "reduce/reduce_op_template.h" #endif // ATVC_REDUCE_DEVICE_H \ No newline at end of file diff --git a/atvc/include/reduce/reduce_max.h b/atvc/include/reduce/reduce_max.h new file mode 100644 index 00000000..906f9c08 --- /dev/null +++ b/atvc/include/reduce/reduce_max.h @@ -0,0 +1,341 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATVC_REDUCE_MAX_H +#define ATVC_REDUCE_MAX_H + +#include "common/kernel_utils.h" +#include "reduce/common/patterns.h" +#include "reduce/utils/reduce_block_aux_util.h" +#include "reduce/common/reduce_common.h" + +namespace ATVC { +/*! + * ReduceSumCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ +template +class ReduceMaxCompute { +public: + // 从OpTraits中萃取算子输入描述信息 + using inputDTypeList = typename OpTraits::In::types; + using DataType = typename ATVC::TypeListGet::Type; + using PrompteDtype = typename KernelUtils::GetPromoteType::T; + __aicore__ inline ReduceMaxCompute() {} + + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, True when UB alignment introduced invalid lanes. + * \tparam Pattern, One of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. + * \param[out] dst, Destination tensor (length == dimA) + * \param[in] src, Source tensor (length == dimA * dimR) + */ + template + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) + { + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. 
High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. Alignment with zero padding (padding value is determined by the GetAddingValue() interface + // implemented by the user) + if (std::is_same::value) { + if constexpr (needMask) { // 1. High performance alignment mode + // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary + // length (number of elements), where dimR is the original number of elements + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); + } else { + // MainR: The framework provides the calculation of the R-axis binary length (number of elements), + // where dimR is the number of elements after completion + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); + } + } + if (std::is_same::value) { + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false); + ReduceRA(dst, src, shape.value[1], shape.value[0], mainR); + } + } + + /*! + * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis. + * \param[out] dst, Output tensor (length == dimA) + * \param[in] src, Input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, Length of the non-reduced axis (A) + * \param[in] dimR, Length of the reduced axis (R) + * \param[in] mainR, Largest power-of-two ≤ dimR (computed by the caller) + */ + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) + { + uint32_t totalNum = dimR * dimA; + uint32_t mainNum = dimA * mainR; + constexpr uint32_t dtypeSize = sizeof(PrompteDtype); + uint32_t tailNum = totalNum - mainNum; + // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes + constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; + uint16_t repeatTimes = tailNum / maskAddNum; + uint16_t repeatNum = repeatTimes * maskAddNum; + uint16_t repTailNum = tailNum - repeatNum; + // Same data block step size between different iterations + uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); + if (repeatTimes > 0) { + AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + uint16_t loopRNum = mainR; + while (loopRNum > 1) { + loopRNum = loopRNum >> 1; + mainNum = loopRNum * dimA; // The first half of LoopR's data volume + repeatTimes = mainNum / maskAddNum; + repeatNum = repeatTimes * maskAddNum; + repTailNum = mainNum - repeatNum; + if (repeatTimes > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; 
+ repeatParams.src1RepStride = repStride; + AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + } + AscendC::DataCopy(dst, src, dimA); + } + + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. + * \param[out] dstTensor, Output tensor (length == dimA) + * \param[in] srcTensor, Input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, Length of the non-reduced axis (A) + * \param[in] dimR, Padded length of the reduced axis (R) + * \param[in] mainR, Largest power-of-two ≤ original R length + * \param[in] oriBurstLen, Original (un-padded) R length used to compute tail + */ + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) + { + uint16_t tailR = oriBurstLen - mainR; + constexpr uint16_t dtypeSize = sizeof(PrompteDtype); + uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32; + uint16_t dimMax = dimA * dimR; + constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + + if (mainR > 0 && tailR > 0) { + PerformInitialMax(srcTensor, param); + } + + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2U; + PerformBinaryReduction(srcTensor, param); + } + if (param.loopRNum == 0) { // small shape, directly reduce + param.loopRNum = tailR; + } + PerformFinalReduction(dstTensor, srcTensor, param); + } + + template + __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, + const AscendC::LocalTensor& computeRes) + { + int64_t cacheID = KernelUtils::Reduce::GetCacheID(index); + int64_t dimA = Pattern::TailA ? shape.value[1] : shape.value[0]; + int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); + int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; + uint16_t outerLoopTimes = OpsUtils::CeilDiv( + static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); + uint16_t innerLoopTimes = cacheID; + uint32_t outerLoopStride = element_one_repeat; + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis + AscendC::LocalTensor dstTensor = tempBuf; + AscendC::LocalTensor srcTensor = computeRes; + uint32_t cah = cacheID * stride; + + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA + uint32_t srcIdx = i * outerLoopStride; + for (uint16_t j = 0; j < innerLoopTimes; ++j) { + AscendC::Max(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); + AscendC::PipeBarrier(); + } + DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); + } + } + + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. 
+ * \param[in] ubTensorLeft, Left operand (in-place result). + * \param[in] ubTensorRight, Right operand (read-only). + * \param[in] calCount, Number of elements to reduce. + */ + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) + { + AscendC::Max(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); + } + + /*! + * \brief Return the value used for padding when UB alignment is required. + * For SUM-reduction the neutral element is 0. + * \tparam U, Scalar type identical to DataType or PromoteDataType. + * \return The padding value (always 0). + */ + template + __aicore__ inline U GetPaddingValue() + { + // Due to the fact that ReduceSum accumulates R-axis data, the values of the supplemented elements + // are set to 0 to ensure that the accumulated result is not affected + if(AscendC::IsSameType::value){ + return INT32_MIN; + }else if(AscendC::IsSameType::value){ + return 0; + }else{ + return -1.0f / 0.0f; + } + } + +private: + __aicore__ inline void PerformInitialMax(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Max(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Max(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Max(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Max(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceSum repeat-time limit is 255; split 
dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMax( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // Cast to float for higher-precision accumulation + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceSum repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMax( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } +}; +} // namespace ATVC + +#endif // ATVC_REDUCE_SUM_H diff --git a/atvc/include/reduce/reduce_min.h b/atvc/include/reduce/reduce_min.h new file mode 100644 index 00000000..c1647e7a --- /dev/null +++ b/atvc/include/reduce/reduce_min.h @@ -0,0 +1,341 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATVC_REDUCE_MIN_H +#define ATVC_REDUCE_MIN_H + +#include "common/kernel_utils.h" +#include "reduce/common/patterns.h" +#include "reduce/utils/reduce_block_aux_util.h" +#include "reduce/common/reduce_common.h" + +namespace ATVC { +/*! + * ReduceSumCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ +template +class ReduceMinCompute { +public: + // 从OpTraits中萃取算子输入描述信息 + using inputDTypeList = typename OpTraits::In::types; + using DataType = typename ATVC::TypeListGet::Type; + using PrompteDtype = typename KernelUtils::GetPromoteType::T; + __aicore__ inline ReduceMinCompute() {} + + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, True when UB alignment introduced invalid lanes. + * \tparam Pattern, One of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. 
+ * \param[out] dst, Destination tensor (length == dimA) + * \param[in] src, Source tensor (length == dimA * dimR) + */ + template + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) + { + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. Alignment with zero padding (padding value is determined by the GetAddingValue() interface + // implemented by the user) + if (std::is_same::value) { + if constexpr (needMask) { // 1. High performance alignment mode + // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary + // length (number of elements), where dimR is the original number of elements + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); + } else { + // MainR: The framework provides the calculation of the R-axis binary length (number of elements), + // where dimR is the number of elements after completion + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); + } + } + if (std::is_same::value) { + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false); + ReduceRA(dst, src, shape.value[1], shape.value[0], mainR); + } + } + + /*! + * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis. + * \param[out] dst, Output tensor (length == dimA) + * \param[in] src, Input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, Length of the non-reduced axis (A) + * \param[in] dimR, Length of the reduced axis (R) + * \param[in] mainR, Largest power-of-two ≤ dimR (computed by the caller) + */ + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) + { + uint32_t totalNum = dimR * dimA; + uint32_t mainNum = dimA * mainR; + constexpr uint32_t dtypeSize = sizeof(PrompteDtype); + uint32_t tailNum = totalNum - mainNum; + // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes + constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; + uint16_t repeatTimes = tailNum / maskAddNum; + uint16_t repeatNum = repeatTimes * maskAddNum; + uint16_t repTailNum = tailNum - repeatNum; + // Same data block step size between different iterations + uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); + if (repeatTimes > 0) { + AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + uint16_t loopRNum = mainR; + while (loopRNum > 1) { + loopRNum = loopRNum >> 1; + mainNum = 
loopRNum * dimA; // The first half of LoopR's data volume + repeatTimes = mainNum / maskAddNum; + repeatNum = repeatTimes * maskAddNum; + repTailNum = mainNum - repeatNum; + if (repeatTimes > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + } + AscendC::DataCopy(dst, src, dimA); + } + + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. + * \param[out] dstTensor, Output tensor (length == dimA) + * \param[in] srcTensor, Input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, Length of the non-reduced axis (A) + * \param[in] dimR, Padded length of the reduced axis (R) + * \param[in] mainR, Largest power-of-two ≤ original R length + * \param[in] oriBurstLen, Original (un-padded) R length used to compute tail + */ + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) + { + uint16_t tailR = oriBurstLen - mainR; + constexpr uint16_t dtypeSize = sizeof(PrompteDtype); + uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32; + uint16_t dimMax = dimA * dimR; + constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + + if (mainR > 0 && tailR > 0) { + PerformInitialMin(srcTensor, param); + } + + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2U; + PerformBinaryReduction(srcTensor, param); + } + if (param.loopRNum == 0) { // small shape, directly reduce + param.loopRNum = tailR; + } + PerformFinalReduction(dstTensor, srcTensor, param); + } + + template + __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, + const AscendC::LocalTensor& computeRes) + { + int64_t cacheID = KernelUtils::Reduce::GetCacheID(index); + int64_t dimA = Pattern::TailA ? 
shape.value[1] : shape.value[0]; + int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); + int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; + uint16_t outerLoopTimes = OpsUtils::CeilDiv( + static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); + uint16_t innerLoopTimes = cacheID; + uint32_t outerLoopStride = element_one_repeat; + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis + AscendC::LocalTensor dstTensor = tempBuf; + AscendC::LocalTensor srcTensor = computeRes; + uint32_t cah = cacheID * stride; + + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA + uint32_t srcIdx = i * outerLoopStride; + for (uint16_t j = 0; j < innerLoopTimes; ++j) { + AscendC::Min(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); + AscendC::PipeBarrier(); + } + DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); + } + } + + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. + * \param[in] ubTensorLeft, Left operand (in-place result). + * \param[in] ubTensorRight, Right operand (read-only). + * \param[in] calCount, Number of elements to reduce. + */ + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) + { + AscendC::Min(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); + } + + /*! + * \brief Return the value used for padding when UB alignment is required. + * For SUM-reduction the neutral element is 0. + * \tparam U, Scalar type identical to DataType or PromoteDataType. + * \return The padding value (always 0). 
+ */ + template + __aicore__ inline U GetPaddingValue() + { + // Due to the fact that ReduceSum accumulates R-axis data, the values of the supplemented elements + // are set to 0 to ensure that the accumulated result is not affected + if(AscendC::IsSameType::value){ + return INT32_MAX; + }else if(AscendC::IsSameType::value){ + return INT32_MAX - INT32_MIN; + }else{ + return 1.0f / 0.0f; + } + } + +private: + __aicore__ inline void PerformInitialMin(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Min(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Min(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Min(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Min(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceSum repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? 
reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMin( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // Cast to float for higher-precision accumulation + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceSum repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMin( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } +}; +} // namespace ATVC + +#endif // ATVC_REDUCE_SUM_H diff --git a/atvc/include/reduce/reduce_op_template.h b/atvc/include/reduce/reduce_op_template.h index fe466a09..d7d34862 100644 --- a/atvc/include/reduce/reduce_op_template.h +++ b/atvc/include/reduce/reduce_op_template.h @@ -117,7 +117,9 @@ public: template __aicore__ inline void AllocTensorAux(AscendC::LocalTensor& tensor) { - bufPool_.AllocTensor(tensor); + T DupValue = needDup ? compute_.template GetPaddingValue() : 0; + bufPool_.AllocTensor(tensor, DupValue); + // bufPool_.AllocTensor(tensor); } /*! diff --git a/atvc/include/reduce/reduce_sum.h b/atvc/include/reduce/reduce_sum.h index 08a24bf0..7870e3eb 100644 --- a/atvc/include/reduce/reduce_sum.h +++ b/atvc/include/reduce/reduce_sum.h @@ -15,20 +15,7 @@ #include "common/kernel_utils.h" #include "reduce/common/patterns.h" #include "reduce/utils/reduce_block_aux_util.h" - -namespace { -struct ReduceARParam { - uint32_t repStride = 0; - uint16_t dimA = 0; - uint16_t dimMax = 0; - uint16_t mainR = 0; - uint16_t tailR = 0; - uint64_t maskAddRNum = 0; - uint16_t loopRNum = 0; - uint16_t dtypeSize = 0; - uint16_t dimR = 0; -}; -} +#include "reduce/common/reduce_common.h" namespace ATVC { /*! diff --git a/atvc/include/reduce/utils/reduce_block_aux.h b/atvc/include/reduce/utils/reduce_block_aux.h index a68a169f..3f7639cb 100644 --- a/atvc/include/reduce/utils/reduce_block_aux.h +++ b/atvc/include/reduce/utils/reduce_block_aux.h @@ -234,7 +234,7 @@ public: computeTensor = ubTensor; } else { // The index of AlloccomputeTensorAux does not require external perception - op_->ReduceOp::template AllocTensorAux(computeTensor); + op_->ReduceOp::template AllocTensorAux(computeTensor); CopyIn(view, shape, ubTensor); SetEvent(AscendC::HardEvent::MTE2_V); AscendC::Cast(computeTensor, ubTensor, AscendC::RoundMode::CAST_NONE, shape.value[0] * shape.value[1]); @@ -246,14 +246,21 @@ public: __aicore__ inline void LinearComputeR(int64_t& tmpBufOffest, V& shape, Args... 
args) { SliceView view; + bool needDup = false; for (int64_t i = 0; i < bisectionTail_; i++) { AscendC::LocalTensor tensorLeft; - op_->ReduceOp::template AllocTensorAux(tensorLeft); + op_->ReduceOp::template AllocTensorAux(tensorLeft); AscendC::LocalTensor computeLeft; PrePareReduce<(!InnerPattern::TailA), false>(i, view, shape, tensorLeft, computeLeft); AscendC::LocalTensor tensorRight; - op_->ReduceOp::template AllocTensorAux(tensorRight); + needDup = i == bisectionTail_ - 1; + if(needDup){ + op_->ReduceOp::template AllocTensorAux(tensorRight); + }else{ + op_->ReduceOp::template AllocTensorAux(tensorRight); + } + // op_->ReduceOp::template AllocTensorAux(tensorRight); AscendC::LocalTensor computeRight; PrePareReduce<(!InnerPattern::TailA), true>(i, view, shape, tensorRight, computeRight); ComputeMerge(shape, computeLeft, computeRight, args...); @@ -266,7 +273,13 @@ public: for (int64_t i = bisectionTail_; i < bisectionPos_; i++) { AscendC::LocalTensor tensor; - op_->ReduceOp::template AllocTensorAux(tensor); + needDup = i == bisectionPos_ -1; + if(needDup){ + op_->ReduceOp::template AllocTensorAux(tensor); + }else{ + op_->ReduceOp::template AllocTensorAux(tensor); + } + // op_->ReduceOp::template AllocTensorAux(tensor); AscendC::LocalTensor computeLeft; PrePareReduce<(!InnerPattern::TailA && Pattern::Dim > 2), false>(i, view, shape, tensor, computeLeft); Compute(shape, computeLeft, args...); diff --git a/atvc/include/reduce/utils/reduce_buf_pool.h b/atvc/include/reduce/utils/reduce_buf_pool.h index e6779444..18d7328c 100644 --- a/atvc/include/reduce/utils/reduce_buf_pool.h +++ b/atvc/include/reduce/utils/reduce_buf_pool.h @@ -53,7 +53,10 @@ public: // Init buffer pipe_->InitBuffer(qQue_, poolSize); AscendC::LocalTensor inputUb = qQue_.GetWithOffset(basicNum_ * inputNum, 0); - AscendC::Duplicate(inputUb, 0, basicNum_ * inputNum); + // AscendC::Duplicate(inputUb, 0, basicNum_ * inputNum); + for(int16_t i =0;i - __aicore__ inline const void AllocTensor(AscendC::LocalTensor& tensor) { + __aicore__ inline const void AllocTensor(AscendC::LocalTensor& tensor, T DupValue) { if constexpr (IsInput) { int32_t idx = GetInputTensorId(); tensor = qQue_.GetWithOffset(basicNum_, inputUnit_.offset + idx * basicNum_ * sizeof(T)); if constexpr (needDup) { - AscendC::Duplicate(tensor, 0, basicNum_); + AscendC::PipeBarrier(); + AscendC::Duplicate(tensor, DupValue, basicNum_); + } + if(usedTBuf_[idx] || needDup){ event_t allocEventId = static_cast(GetTPipePtr()->FetchEventID()); eventIdV2Mte2_[idx] = allocEventId; + needWaitFlag_[idx] = true; AscendC::SetFlag(allocEventId); } + usedTBuf_[idx] = true; } else { int32_t idx = GetComputeTensorId(); tensor = qQue_.GetWithOffset(basicNum_, computeUnit_.offset + idx * basicNum_ * sizeof(T)); @@ -102,7 +110,11 @@ public: uint64_t offset = (uint64_t)(tensor.GetPhyAddr()); if (offset - start < computeUnit_.offset) { int32_t idx = (offset - start) / sizeof(T) / basicNum_; - AscendC::WaitFlag(eventIdV2Mte2_[idx]); + // AscendC::WaitFlag(eventIdV2Mte2_[idx]); + if(needWaitFlag_[idx]){ + AscendC::WaitFlag(eventIdV2Mte2_[idx]); + needWaitFlag_[idx] = false; + } } } @@ -123,6 +135,8 @@ private: PoolManagerUnit inputUnit_; PoolManagerUnit computeUnit_; event_t eventIdV2Mte2_[MAX_INPUT_SIZE]; + bool needWaitFlag_[MAX_INPUT_SIZE]; + bool usedTBuf_[MAX_INPUT_SIZE]; AscendC::TBuf<> qQue_; AscendC::TPipe* pipe_; int32_t basicNum_; -- Gitee