From 3c0b1282a97a4b521ed1dd67217529da2cdff97c Mon Sep 17 00:00:00 2001 From: zhaoguoang Date: Mon, 1 Sep 2025 14:21:16 +0800 Subject: [PATCH] sync atvc --- atvc/README.md | 11 +- atvc/docs/01_quick_start.md | 4 +- atvc/docs/02_developer_guide.md | 374 +++++++++++------- atvc/docs/03_code_organization.md | 6 +- .../add_with_broadcast/add_with_broadcast.cpp | 60 +-- .../add_with_scalar/add_with_scalar.cpp | 2 +- atvc/examples/broadcast_to/broadcast_to.cpp | 2 +- atvc/examples/common/example_common.h | 132 +++---- atvc/examples/sinh_custom/sinh_custom.cpp | 2 +- atvc/include/broadcast/broadcast_compute.h | 15 +- atvc/include/broadcast/broadcast_host.h | 8 + .../include/broadcast/broadcast_op_template.h | 74 ++-- atvc/include/broadcast/common/patterns.h | 18 +- .../broadcast/tiling/broadcast_tiling.h | 51 +-- .../broadcast/utils/broadcast_buf_pool.h | 49 +-- atvc/include/common/ops_utils_device.h | 154 ++++---- atvc/include/elewise/common/elewise_common.h | 21 +- atvc/include/elewise/elewise_device.h | 3 +- atvc/include/elewise/elewise_host.h | 63 ++- atvc/include/elewise/elewise_op_template.h | 67 ++-- .../{elewise_utils => utils}/elewise_util.h | 22 +- atvc/include/reduce/common/reduce_common.h | 30 +- atvc/include/reduce/reduce_host.h | 4 +- atvc/include/reduce/reduce_op_template.h | 15 +- atvc/include/reduce/reduce_sum.h | 49 ++- atvc/include/reduce/tiling/tiling_common.h | 232 +++++------ 26 files changed, 804 insertions(+), 664 deletions(-) rename atvc/include/elewise/{elewise_utils => utils}/elewise_util.h (82%) diff --git a/atvc/README.md b/atvc/README.md index c7f47b65..ef05c565 100644 --- a/atvc/README.md +++ b/atvc/README.md @@ -1,5 +1,3 @@ - - # ATVC ATVC(Ascend C Template for Vector Compute)是一个用Ascend C API搭建的C++模板头文件集合,旨在帮助用户快速开发Ascend C典型Vector算子。它将Ascend C Vector算子开发流程中的计算实现解耦成可自定义的模块, 内部封装实现了kernel数据搬入搬出等底层通用操作及通用tiling计算,实现了高效的算子开发模式。 相比传统Ascend 
C算子开发方式,利用ATVC搭建的Vector算子可做到开发效率提升3-5倍。用户只需选择匹配的模板并完成核心计算逻辑就完成算子kernel侧开发,atvc还内置了每个模板库对应的通用tiling计算实现,可省去用户手写tiling的开发量就能达到不错的性能表现,极大提升算子开发效率。 @@ -24,7 +22,6 @@ ATVC工程结构可归纳成以下主要组件,更详细的文件结构介绍 ├── docs // 文档介绍 ├── examples // ATVC使用样例 ├── include // ATVC提供的头文件集合,用户使用前需将其置入其他工程的包含路径下 -├── tests // 测试模块相关代码 └── README.md // 综述 ``` # 快速上手 @@ -56,12 +53,8 @@ git clone https://gitee.com/ascend/ascendc-api-adv.git - 执行add用例 ```bash -$ cd ./atvc/examples -$ bash run_examples.sh add -... -Generate golden data successfully. -... -Accuracy verification passed. +cd ./atvc/examples +bash run_examples.sh add ``` diff --git a/atvc/docs/01_quick_start.md b/atvc/docs/01_quick_start.md index 066a14b4..28b12d4f 100644 --- a/atvc/docs/01_quick_start.md +++ b/atvc/docs/01_quick_start.md @@ -114,8 +114,8 @@ AddCustom<<>>(aDevice, bDevice, cDevice ## 算子编译&执行 完成算子代码编写后,调用以下命令编译代码并执行: ```bash -$ cd ./atvc/examples -$ bash run_examples.sh add +cd ./atvc/examples +bash run_examples.sh add ``` 其他样例执行命令如下: diff --git a/atvc/docs/02_developer_guide.md b/atvc/docs/02_developer_guide.md index dccadd4a..50e33291 100644 --- a/atvc/docs/02_developer_guide.md +++ b/atvc/docs/02_developer_guide.md @@ -158,16 +158,28 @@ struct SinhComputeFunc { 下方为`ATVC::Kernel::EleWiseOpTemplate`模板类的外部接口介绍,完整模板类定义请参考[`atvc/include/elewise/elewise_op_template.h`](../include/elewise/elewise_op_template.h)。 ```cpp -// 开发定义的计算模板类 +/*! + * \brief EleWiseOpTemplate provides templates for element level operations on tensors, + * including but not limited to addition, subtraction, multiplication, division, as well as + * mathematical functions such as exponentiation, logarithm, trigonometric functions, etc. + * The characteristic of this type of operator is that it performs calculation operations + * element by element without changing the shape of the input data. 
+ */ template class EleWiseOpTemplate { public: - __aicore__ inline EleWiseOpTemplate(){}; - - // 按照输入、输出、EleWiseParam、其他标量的顺序传入 - // 内部根据EleWiseParam进行数据调度并调用EleWiseOpTemplate完成计算后搬出到GM - template - __aicore__ inline void Run(Args&&... args) { + __aicore__ inline EleWiseOpTemplate() {}; + + /*! + * \brief The external running interface of EleWiseOpTemplate mainly completes resource initialization, + * data copy-in, calculation scheduling, and data copy-out operations + * \param args, variadic arguments; they must be passed in the following order: + * inputs, outputs, the EleWiseParam, and then any other scalars. Data is scheduled + * internally according to the EleWiseParam and the results are copied out to GM + */ + template + __aicore__ inline void Run(Args&&... args) + { // // 完成变长参数的解析和数据调度计算 // @@ -244,17 +256,17 @@ ATVC的Host层提供了Elementwise算子的通用Tiling算法API `ATVC::Host::Ca ```cpp namespace ATVC{ struct EleWiseTilingData { - uint32_t tailBlockCnt; // 需要额外执行一次循环的核的数量 - uint32_t tailElemCnt; // 尾块元素数量 - uint32_t numPerBlock; // 每个核需计算的总元素数量 - uint32_t tiledCnt; // 每次搬入搬出的元素个数 - uint32_t blockNum; // 执行核数 + uint32_t tailBlockCnt; // The number of cores that need to execute an additional loop + uint32_t tailElemCnt; // The number of tail block elements + uint32_t numPerBlock; // The number of basic blocks to be calculated for each core + uint32_t tiledCnt; // The number of basic block elements + uint32_t blockNum; // The number of cores used for execution }; struct EleWiseParam { - EleWiseTilingData tilingData; // 影响数据搬运的相关参数 - uint32_t totalCnt = 0; // 单Tensor的元素个数 - uint32_t nBufferNum = 2; // 每个Queue中的Tensor数量 + EleWiseTilingData tilingData; // Related parameters affecting data handling + uint32_t totalCnt = 0; // The number of elements in a single Tensor + uint32_t nBufferNum = 2; // The number of Tensors in each queue }; } ``` @@ -264,23 +276,17 @@ struct EleWiseParam { ```cpp // 传入编译态参数ATVC::OpTraits,函数内部将萃取该模板参数获取算子信息 template -/** - * @brief 计算EleWise的EleWiseParam运行态参数 - * @param totalCnt 单个输入的总元素个数 - * @param param 输出参数。 - * @return bool 
返回true表示计算成功,false表示失败。 +/*! + * \brief Calculate the operational parameters of EleWiseParam for EleWise + * \param[in] totalCnt, The total number of elements in a single input + * \param[out] param, Output parameters. + * \return Return true to indicate calculation success, false to indicate failure. */ -bool CalcEleWiseTiling(int32_t totalCnt, ATVC::EleWiseParam &param) +template +bool CalcEleWiseTiling( + int32_t totalCnt, ATVC::EleWiseParam &param, EleWiseTilingHyperParam hyperParam = EleWiseTilingHyperParam()) { - // 影响Tiling算法的超参数结构体,支持用户修改来获取更高性能收益 - struct EleWiseTilingHyperParam { - int32_t basicCntBase = 16 * 1024; // 基块单次搬运的初始元素个数,推荐在[1K, 32K]的范围内变动 - int nBufferNum = 2; // 每个Queue中的Tensor数量,推荐在[2, 3]的候选址内变动 - }; - // 萃取Traits得到输入输出等类型和个数的信息 - // 计算最终TilingData并填入param中 // ... - return true; } // Host侧调用示例 @@ -309,7 +315,7 @@ Reduce模板算子内部根据计算的数据大小、shape、Reduce axis轴完 ![reduce_components.png](images/reduce_components.png) 自定义Reduce算子需按照以下顺序完成模块之间的组装: 1. 自定义计算模板/使用框架内置计算模板 -2. 将计算模板传入`Kernel`层模板算子完成核函数功能实现; +2. 将计算模板传入Kernel层模板算子完成核函数功能实现; 3. 定义Kernel层算子入口API,内部实例化计算模板类; 下面将以ReduceSum(对输入tensor的特定轴上的数据做求和操作)的算子搭建为样例,按照组装顺序介绍Reduce算子类的开发流程。 @@ -318,7 +324,7 @@ Reduce模板算子内部根据计算的数据大小、shape、Reduce axis轴完 ### 2.2.2 计算模板 Reduce类的计算模板涉及多核之间的数据结果同步以及核内分块的对齐计算,用户自定义难度较高,因此ATVC框架提供了Reduce类的内置计算模板,并实现了Reduce在单核内与多核间的计算与同步等函数接口。 这类计算模板将作为模板参数传入`ATVC::ReduceOpTemplate`中,并在数据计算以及同步阶段被调用。 -下方为ATVC框架内置的`ATVC::ReduceSumCompute`计算模板的伪代码介绍,完成代码定义请参考`atvc/include/reduce/reduce_sum.h`。 +下方为ATVC框架内置的`ATVC::ReduceSumCompute`计算模板的伪代码介绍,完整代码定义请参考[reduce_sum.h](../include/reduce/reduce_sum.h)。 ```cpp #ifndef ATVC_REDUCE_SUM_COMPUTE_H #define ATVC_REDUCE_SUM_COMPUTE_H @@ -327,47 +333,91 @@ Reduce类的计算模板涉及多核之间的数据结果同步以及核内分 #include "reduce/common/patterns.h" #include "reduce/utils/reduce_block_aux_util.h" - namespace ATVC { -// OpTraits: 算子描述的ATVC::OpTraits结构体 +/*! 
+ * ReduceSumCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ template -// 计算模板,不感知数据从GM到UB上的搬运 class ReduceSumCompute { public: - // 从OpTraits中萃取算子输入描述信息 + // Extract operator input description information from OpTraits using inputDTypeList = typename OpTraits::In::types; using DataType = typename ATVC::TypeListGet::Type; using PrompteDtype = typename KernelUtils::GetPromoteType::T; - __aicore__ inline ReduceSumCompute() {} - - // 模板参数Pattern由上层传入 - template - __aicore__ inline void operator()(KernelUtils::Shape<2>& shape, const LocalTensor& dst, const LocalTensor& src) { - // 计算单基块数据ReduceSum的结果 + __aicore__ inline ReduceSumCompute() + {} + + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, True when UB alignment introduced invalid lanes. + * \tparam Pattern, One of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. + * \param[out] dst, Destination tensor (length == dimA) + * \param[in] src, Source tensor (length == dimA * dimR) + */ + template + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) + { + // ... } + /*! + * \brief Merge the calculation results of different data base blocks within a single UB + * \tparam Pattern Compile-time pattern tag that decides A vs. B orientation. + * \tparam V Shape descriptor (encodes dimA and dimB at runtime). + * \param[in] index, Logical index identifying the data-base block. + * \param[in] shape, Runtime tensor shape (dimA, dimB). + * \param[in] tempBuf, UB tensor serving as the reduction cache. 
+ * \param[in] computeRes, UB tensor holding the newest partial result. + */ template - __aicore__ inline void UpdateCache(int64_t index, V& shape, const LocalTensor& tempBuf, const LocalTensor& computeRes) { - // 合并单UB内不同数据基块的计算结果 + __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, + const AscendC::LocalTensor& computeRes) + { + // ... } - __aicore__ inline void ReduceBetweenUB(const LocalTensor& ubTensorLeft, - const LocalTensor& ubTensorRight, const int32_t& calCount) { - // 将多核之间&同一核内的多次UB搬运的结果做Reduce操作 + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. + * \param[in] ubTensorLeft, Left operand (in-place result). + * \param[in] ubTensorRight, Right operand (read-only). + * \param[in] calCount, Number of elements to reduce. + */ + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) + { + // ... } + /*! + * \brief Return the value used for padding when UB alignment is required. + * For SUM-reduction the neutral element is 0. + * \tparam U, Scalar type identical to DataType or PromoteDataType. + * \return The padding value (always 0). + */ template - __aicore__ inline U GetPaddingValue() { - // 涉及不同Reduce类别的Tile对齐操作,与数据大小无关 + __aicore__ inline U GetPaddingValue() + { + // ... } }; -} // namespace ATVC -#endif // ATVC_REDUCE_SUM_COMPUTE_H +} // namespace ATVC +#endif // ATVC_REDUCE_SUM_COMPUTE_H ``` Reduce计算模板类将在数据计算阶段被`ReduceOpTemplate`算子模板调用,因此Reduce计算模板类的实现必须遵从以下约束: -- 该模板类在实例化时固定传入ATVC::OpTraits类型的结构体作为模板参数,如`ATVC::OpTraits,ATVC::OpOutputs`。 +- 该模板类在实例化时固定传入`ATVC::OpTraits`类型的结构体作为模板参数,如`ATVC::OpTraits,ATVC::OpOutputs`。 - 开发必须完成以下公有API的内部实现: 1. 计算单数据基块的Reduce结果 `__aicore__ inline void Compute(...)` 2. 
计算单UB内不同数据基块的计算结果 `__aicore__ inline void UpdateCache(...)` @@ -387,20 +437,33 @@ struct ReducePolicy { }; ``` -下方为`ATVC::Kernel::ReduceOpTemplate`模板类的外部接口介绍,完整模板类定义请参考`atvc/include/reduce/reduce_op_template.h`。 +下方为`ATVC::Kernel::ReduceOpTemplate`模板类的外部接口介绍,完整模板类定义请参考[reduce模板](../include/reduce/reduce_op_template.h)。 ```cpp -template +/*! + * ReduceOpTemplate Generic Reduce operator template. + * Reduce operators usually refer to operators that perform reduction operations on elements in tensors, + * such as summation and averaging. They can specify several dimensions for reduction calculations, + * or reduce all elements to a scalar. + */ +template class ReduceOpTemplate { public: - __aicore__ inline ReduceOpTemplate(){}; - - // 按照输入、输出、EleWiseParam、其他标量的顺序传入 - // 内部根据EleWiseParam进行数据调度并调用ReduceOpTemplate完成计算后搬出到GM - template - __aicore__ inline void Run(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam* param) { - // - // Reduce类算子Run接口按输入、输出、运行态参数param顺序传入 - // + __aicore__ inline ReduceOpTemplate() {}; + + /*! + * \brief The input order is: input tensor, output tensor, ReduceParam, Other scalars. + * Internally schedule data based on ReduceParam and call ReduceOpTemplate to complete + * the calculation before moving it out to GM. + * \param[in] x, GM address of the input tensor. + * \param[in] y, GM address of the output tensor. + * \param[in] param, tiling data and policy. + * \return void. + */ + template + __aicore__ inline void Run(GM_ADDR x, GM_ADDR y, ReduceParam* param) + { + // ... } }; ``` @@ -487,27 +550,22 @@ struct ReduceParam { `ATVC::Host::CalcReduceTiling`函数内部提供了影响Tiling算法的超参数结构体`ReduceTilingHyperParam`,支持开发通过修改超参值来探索更好的算子性能。该API调用样例如下,详细代码请参考[Reduce Tiling 算法](../include/reduce/reduce_host.h): ```cpp -/** - * @brief 计算Reduce的TilingData和策略参数 - * @param inputShape 输入张量的形状。 - * @param reduceDim 需要进行Reduce操作的具体维度。 - * @param policy 输出参数。 - * @param param 输出参数。 - * @return bool 返回true表示计算成功,false表示失败。 +/*! 
+ * \brief Calculate the TilingData and policy parameters for Reduce. + * \param[in] inputShape, shape of the tensor. + * \param[in] reduceDim, The dim that requires a Reduce operation. + * \param[out] policy, static policy of Reduce Template + * \param[out] param, dynamic param of Reduce Template + * \return bool Return true to indicate calculation success, false to indicate failure. */ -// 传入编译态参数ATVC::OpTraits,函数内部将萃取该模板参数获取算子信息 -template -bool CalcReduceTiling(std::vector inputShape, std::vector reduceDim, ReducePolicy* policy, ReduceParam* param) +template +bool CalcReduceTiling(std::vector inputShape, + std::vector reduceDim, + ReducePolicy* policy, + ReduceParam* param, + ReduceTilingHyperParam hyperParam = ReduceTilingHyperParam()) { - // 影响Tiling算法的超参数结构体,支持用户修改来获取更高性能收益 - struct ReduceTilingHyperParam { - int32_t basicBlock = 16 * 1024; // 最大为UB的总大小的1/3 - int nBufferNum = 2; // 每个Queue中的Tensor数量,推荐在[2, 3]的候选址内变动 - }; - // 萃取Traits得到输入输出等类型和个数的信息 - // 计算最终TilingData并填入param中 // ... - return true; } // Host侧调用示例 @@ -599,13 +657,13 @@ Broadcast模板算子内部根据数据类型、输入/输出shape完成某个 ![broadcast_components.png](images/broadcast_components.png) 自定义Broadcast算子需按照以下顺序完成模块之间的组装: 1. 自定义计算模板/使用框架内置计算模板 -2. 将计算模板传入`Kernel`层模板算子完成核函数功能实现; +2. 将计算模板传入Kernel层模板算子完成核函数功能实现; 3. 
定义Kernel层算子入口API,内部实例化计算模板类; 下面将以BroadcastTo(对输入tensor在特定轴上做数据复制扩充操作)的算子搭建为样例,按照组装顺序介绍Broadcast类算子的开发流程。 ### 2.3.2 计算模板 Broadcast计算模板是指Broadcast类算子在UB上实现将A轴的数据复制扩充到B轴上。在Kernel层的组装阶段,计算模板将作为模板参数传入`ATVC::Kernel::BroadcastOpTemplate`,并在数据计算阶段被调用。下方为对齐的代码样例: -下方为ATVC框架内置的`ATVC::BroadcastCompute`计算模板的伪代码介绍,完整代码定义请参考`atvc/include/broadcast/broadcast_compute.h`。 +下方为ATVC框架内置的`ATVC::BroadcastCompute`计算模板的伪代码介绍,完整代码定义请参考[完整代码](../include/broadcast/broadcast_compute.h)。 ```cpp #ifndef ATVC_BROADCAST_COMPUTE_H #define ATVC_BROADCAST_COMPUTE_H @@ -643,7 +701,7 @@ Broadcast计算模板类将在数据计算阶段被`BroadcastOpTemplate`算子 * AB场景的计算:输入`src`是一个shape为(dimA, 1)的Tensor,需要将数据扩充到`dst`上,dst的shape是(dimA, dimB); * BA场景的计算:输入`src`是一个shape为(1, dimA)的Tensor,需要将src数据扩充到`dst`上,dst的shape是(dimB, dimA); -- 该模板类在实例化时固定传入`ATVC::OpTraits`类型的结构体作为模板参数,如` ATVC::OpTraits,ATVC::OpOutputs`。 +- 该模板类在实例化时固定传入`ATVC::OpTraits`类型的结构体作为模板参数,如`ATVC::OpTraits,ATVC::OpOutputs`。 ### 2.3.3 内置Broadcast算子模板 `ATVC::Kernel::BroadcastOpTemplate`是一套基本的Broadcast算子类,它实现了一套算子数据的搬运搬出、资源分配和释放的流程。Kernel层的算子模板需要计算模板类作为模板参数传入来完成实例化。在调用阶段,Broadcast算子模板将按照固定参数顺序调用计算模板类的`Compute`接口,完成数据的计算。 @@ -666,22 +724,28 @@ struct BroadcastPolicy { #include "broadcast/utils/broadcast_buf_pool.h" namespace ATVC { namespace Kernel { -template +/*! + * BroadcastCompute: Used to implement element wise operations between tensors when their shapes are inconsistent. + * By copying data in a dimension of length 1, the shapes of two tensors are aligned to support element wise + * addition operations. +*/ +template class BroadcastOpTemplate { public: using DataType = typename BroadcastCompute::DataType; - __aicore__ inline BroadcastOpTemplate(){} - - /* - BroadcastOpTemplate对外运行接口,主要完成资源初始化、数据搬入、计算调度、数据搬出操作 - @param src: 输入数据的gm指针 - @param dst: 输出数据的gm指针 - @broadcastParam: broadcast的动态参数,包含tiling data, workspace等 - */ - __aicore__ inline void Run(GM_ADDR src, GM_ADDR dst, ATVC::BroadcastParam* broadcastParam) + __aicore__ inline BroadcastOpTemplate() {} + + /*! 
+ * \brief The external running interface of BroadcastOpTemplate mainly completes resource initialization, + * data migration, calculation scheduling and data migration operations + * \param src, GM pointer for input data + * \param dst, GM pointer for output data + * \param broadcastParam, dynamic parameters of broadcast, including tiling data, workspace, etc + */ + template + __aicore__ inline void Run(Args&&... args) { - this->Init(src, dst, broadcastParam); - this->Process(); + // ... } AscendC::GlobalTensor srcGlobal_; @@ -781,8 +845,19 @@ struct BroadcastParam { ```cpp namespace ATVC { namespace Host { -template -bool CalcBroadcastTiling(std::vector shapeIn, std::vector shapeOut, BroadcastPolicy* policy, BroadcastParam* param) +/*! + * \brief Generates tiling parameters and policy for the Broadcast Template. + * \param[in] shapeIn , Source tensor shape (may be broadcast to match `shapeOut`). + * \param[in] shapeOut, Destination tensor shape after broadcasting. + * \param[out] policy, static policy of Broadcast Template. + * \param[out] param, dynamic param of Broadcast Template. + * \return true – successfully, false – error. + */ +template +bool CalcBroadcastTiling(std::vector shapeIn, + std::vector shapeOut, + BroadcastPolicy* policy, + BroadcastParam* param) { using inputDTypeList = typename OpTraits::In::types; using DataType = typename ATVC::TypeListGet::Type; @@ -889,7 +964,7 @@ ATVC框架支持Broadcast与Elementwise组合的算子通过扩展BroadcastOpTem 2.自定义Broadcast计算模板/使用框架内置Broadcast计算模板,并组合Elementwise计算模板。 -3.将计算模板传入`Kernel`层模板算子完成核函数功能实现; +3.将计算模板传入Kernel层模板算子完成核函数功能实现; 4.定义Kernel层算子入口API, 内部实例化计算模板类; @@ -965,9 +1040,13 @@ Broadcast计算模板在组合算子中与Broadcast单算子无任何区别, Broadcast与Elementwise组合的算子模板以BroadcastOpTemplate为基础进行扩展,`BroadcastOpTemplate`的介绍可以参考章节[2.3.3](#233-内置broadcast算子模板)。下面为组合算子场景`ATVC::Kernel::BroadcastOpTemplate`新引入的接口或定义,以及调用计算模板函数的示意代码,完整模板定义请参考[`atvc/broadcast/broadcast_op_template.h`](../include/broadcast/broadcast_op_template.h): ```cpp +/*! 
+ * BroadcastOpTemplate: Used to implement element wise operations between tensors when their shapes are inconsistent. + * By copying data in a dimension of length 1, the shapes of two tensors are aligned to support element wise + * addition operations. +*/ template class BroadcastOpTemplate { - // 萃取PreCompute和PostCompute中的信息 static constexpr bool HAS_PRE_COMPUTE = !AscendC::Std::is_same_v; static constexpr bool HAS_POST_COMPUTE = !AscendC::Std::is_same_v; using PreComputeTraits = AscendC::Std::conditional_t::ComputeTraits, VoidComputeTraits>; @@ -980,7 +1059,13 @@ class BroadcastOpTemplate { using PostTemp = typename PostComputeTraits::Temp::types; using DataType = typename BroadcastCompute::DataType; - + /*! + * \brief Late parameter injection helper. In some fusion scenarios the host side needs to pass additional + * runtime parameters (e.g. fused-activation coefficients) after the kernel has already been launched. + * SetParam allows such parameters to be forwarded to the pre- and/or post-compute functors. + * \param[in] param, Pointer to the BroadcastParam structure (tiling, workspace, etc.) + * \param[in] args, Optional extra parameters consumed by PreCompute / PostCompute + */ template __aicore__ inline void SetParam(BroadcastParam *param, Args... args) { @@ -991,8 +1076,15 @@ class BroadcastOpTemplate { postCompute_.SetParam(args...); } } - // Run接口参数改为可变参数,用来传递PreCompute和PostCompute的参数。 - template + + /*! + * \brief The external running interface of BroadcastOpTemplate mainly completes resource initialization, + * data copy-in, calculation scheduling, and data copy-out operations + * \param src, GM pointer for input data + * \param dst, GM pointer for output data + * \param broadcastParam, Dynamic parameters of broadcast, including tiling data, workspace, etc + */ + template __aicore__ inline void Run(Args&&... 
args) { // 分拣出tensor参数并按使用个数传递给计算函数 @@ -1003,23 +1095,6 @@ class BroadcastOpTemplate { // 分拣出scaler参数并传给PreCompute和PostCompute SetParam(scalerArgs); } - - __aicore__ inline void CopyIn(AscendC::LocalTensor &input, uint32_t copyInOffset, BroadcastDataView &view) - { - if constexpr(HAS_PRE_COMPUTE) { - ProcessPreCompute(input, copyInOffset, copyInParams); - return; - } - } - - __aicore__ inline void CopyOut(AscendC::LocalTensor &output, - uint32_t copyOutOffset, BroadcastDataView &view) - { - if constexpr(HAS_POST_COMPUTE) { - ProcessPostCompute(output, copyOutOffset, copyOutParams); - return; - } - } }; ``` @@ -1147,8 +1222,8 @@ __aicore__ inline void DebugPrintf(__gm__ const char* fmt, Args&&... args); } } // 调用示例 -ATVC::Kernel::DebugPrintf("[ERROR]:[ATVC][EleWise] Input Count can not be 0!\n"); -ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise][CopyIn] Offset is %u, copy count is %u.\n", curCoreStartCnt_ + offsetCnt_, calcCnt_); +ATVC::Kernel::DebugPrintf("[ERROR]: [ATVC][EleWise] Input Count can not be 0!\n"); +ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise][CopyIn] Offset is %u, copy count is %u.\n", curCoreStartCnt_ + offsetCnt_, calcCnt_); ``` ## 3.2.2 开启Profiling性能调优功能 @@ -1178,11 +1253,11 @@ ElementWise模板通用Tiling切分的数据结构为EleWiseTilingData,描述 ```cpp namespace ATVC { struct EleWiseTilingData { - uint32_t tailBlockCnt; // 需要额外执行一次循环的核的数量 - uint32_t tailElemCnt; // 尾块元素数量 - uint32_t numPerBlock; // 每个核需计算的基本块个数 - uint32_t tiledCnt; // 基本块元素个数 - uint32_t blockNum; // 执行核数 + uint32_t tailBlockCnt; // The number of cores that need to execute an additional loop + uint32_t tailElemCnt; // The number of tail block elements + uint32_t numPerBlock; // The number of basic blocks to be calculated for each core + uint32_t tiledCnt; // The number of basic block elements + uint32_t blockNum; // The number of cores used for execution }; } ``` @@ -1203,12 +1278,12 @@ struct EleWiseTilingData { namespace ATVC { namespace Host { struct EleWiseTilingHyperParam { - uint32_t singleCoreBaseLine = 512; // 
数据量基线:核内数据量超过基线就分多核直至满核, 设置范围: [256, 128 * 1024] - float ubSizeLimitThreshold = 0.95f; // UB内存使用上限,决定了basicBlock最大值 - uint32_t nBufferNum = 2; // multi buffer 设置值为: [1, 2] - uint32_t splitDataShape[MAX_SHAPE_NODE] = {1024, 32 * 1024, 64 * 1024}; // 数据分段节点 - uint32_t dataSplitFactor[MAX_SHAPE_NODE + 1] = {4, 4, 8, 6}; // 对应数据分段内的数据量的切分系数 - uint32_t rsvLiveCnt = 0; // 额外的存活节点数,表示内部需要申请空间个数,可设置的范围为[0, 1] + uint32_t singleCoreBaseLine = 512; // Per-core data volume baseline: once exceeded, work is split across more cores until all cores are used. Valid range: [256, 128 * 1024] + float ubSizeLimitThreshold = 0.95f; // Upper limit of UB memory usage; determines the maximum value of basicBlock + uint32_t nBufferNum = 2; // Number of buffers used for multi-buffering. Valid values: [1, 2] + uint32_t splitDataShape[MAX_SHAPE_NODE] = {1024, 32 * 1024, 64 * 1024}; // Data segmentation nodes + uint32_t dataSplitFactor[MAX_SHAPE_NODE + 1] = {4, 4, 8, 6}; // Split factor applied to the data volume within each segment + uint32_t rsvLiveCnt = 0; // Number of extra live nodes, i.e. buffers that must be reserved internally. Valid range: [0, 1] 
}; } } @@ -1248,20 +1323,22 @@ ATVC host 和kernel侧都会使用到的`ReduceTilingData`是Reduce的核间AR ```cpp namespace ATVC { struct ReduceTilingData { - uint64_t factorACntPerCore; // 在每个核上不参与计算的非Reduce轴实际维度 - uint64_t factorATotalCnt; // 不参与计算的非Reduce轴总维度 - uint64_t ubFactorA; // 单UB内非Reduce轴的数据量 - uint64_t factorRCntPerCore; // 在每个核上参与计算的Reduce轴实际维度 - uint64_t factorRTotalCnt; // 参与计算的Reduce轴总维度 - uint64_t ubFactorR; // 单UB内参与计算的Reduce轴维度 - uint64_t groupR; // 切轴为R轴,该轴上切点外的R的相对数据量 - uint64_t outSize; // 切轴外的AR数据总量 - uint64_t basicBlock; // 基础数据块大小 - int32_t coreNum; // 执行核数 - float meanVar; // 预留信息,暂不使用 - uint64_t shape[MAX_DIM]; // shape信息 - uint64_t stride[MAX_DIM]; // 输入数据搬运步长 - uint64_t dstStride[MAX_DIM]; // 输出数据搬运步长 + uint64_t factorACntPerCore; // The actual dimensions of non-Reduce axes that do not participate + // in computation on each core + uint64_t factorATotalCnt; // The total dimension of non-Reduce axes that do not participate in the calculation + uint64_t ubFactorA; // The amount of data on non-Reduce axes within a single UB + uint64_t factorRCntPerCore; // The actual dimension of the Reduce axis involved in computation on each core + uint64_t factorRTotalCnt; // The total dimension of the Reduce axis involved in the calculation + uint64_t ubFactorR; // Reduce axis dimension involved in calculation within a single UB + uint64_t groupR; // When the split axis is the R axis: the relative data amount of R + // outside the split point on this axis + uint64_t outSize; // The total amount of AR data outside the split axis + uint64_t basicBlock; // The basic data block size + int32_t coreNum; // The number of running cores + float meanVar; // Reserved + uint64_t shape[MAX_DIM]; // Shape info + uint64_t stride[MAX_DIM]; // Input data transfer step size + uint64_t dstStride[MAX_DIM]; // Output data transfer step size }; } ``` @@ -1269,7 +1346,7 @@ struct ReduceTilingData { 可调参数如下所示: | Tiling超参名 | 数据类型 | 参数说明 | 调节范围 | 默认值 | | ----------- | -------------- | 
----------- | ----------- |---| -| basicBlock | uint32_t | Reduce 基本块内存大小 | 不能超过UB内存的1/3, 192K内存 建议在48K-54K之间设置 | 54 * 1024| +| basicBlock | uint32_t | Reduce 基本块内存大小 | 不能超过UB内存的1/3, 192K内存 建议在48K-54K之间设置 | 48 * 1024| | maxInnerA | uint32_t |AR切轴内A轴的最大数据量 | [128, 256] | 128 | | balanceThreshHold | double| 多核均衡的阈值水平, 阈值越高,切分后每个核处理的数据量越均衡 | [0.8, 0.95]| 0.85 | @@ -1278,7 +1355,8 @@ struct ReduceTilingData { namespace ATVC { namespace Host { struct ReduceTilingHyperParam { - // Set the basic block memory size for Reduce, generally not exceeding 1/3 of the memory. + // Set the basic block memory size for Reduce, generally not exceeding 1/3 of the memory. It is recommended to set + // it between [48k-54k] uint32_t basicBlock = 48 * 1024; uint32_t maxInnerA = 128; // [128, 256] double balanceThreshHold = 0.85; // Threshold level for multi-core equilibrium [0.8-0.95] diff --git a/atvc/docs/03_code_organization.md b/atvc/docs/03_code_organization.md index f6f60e5f..753745a8 100644 --- a/atvc/docs/03_code_organization.md +++ b/atvc/docs/03_code_organization.md @@ -59,9 +59,9 @@ examples │ ├── add │ └── reduce_sum ├── ops_pytorch // PyTorch框架调用样例 -│ ├── README.md -│ ├── add -│ └── reduce_sum +| ├── README.md +| ├── add +| └── reduce_sum └── common // 算子样例公共接口 ``` diff --git a/atvc/examples/add_with_broadcast/add_with_broadcast.cpp b/atvc/examples/add_with_broadcast/add_with_broadcast.cpp index 5ae9b3a2..ec3d12e0 100644 --- a/atvc/examples/add_with_broadcast/add_with_broadcast.cpp +++ b/atvc/examples/add_with_broadcast/add_with_broadcast.cpp @@ -47,26 +47,18 @@ void BroadcastOpAdapter(uint8_t* x, uint8_t* y, uint8_t* z, ATVC::BroadcastParam CHECK_ACL(aclrtDestroyStream(stream)); CHECK_ACL(aclrtFree(workspaceDevice)); } -} - -int32_t main(int32_t argc, char* argv[]) +void InitializeData(int32_t eleNum, int32_t outEleNum, std::vector &inputX, std::vector &inputY, + std::vector &golden) { - int32_t eleNum = 1 * 2048; - int32_t outEleNum = 8 * 2048; - std::vector shapeIn{1, 
2048}; // 测试输入shape - std::vector shapeOut{8, 2048}; // 测试输入shape - - size_t inputByteSize = static_cast(eleNum) * sizeof(float); - size_t outputByteSize = static_cast(outEleNum) * sizeof(float); + if (eleNum == 0) { + return; + } std::random_device rd; std::mt19937 gen(static_cast(rd())); std::uniform_real_distribution disX(1.0f, 9.0f); std::uniform_real_distribution disY(1.0f, 9.0f); - std::vector inputX(eleNum); - std::vector inputY(outEleNum); - std::vector golden(outEleNum); for (int i = 0; i < eleNum; ++i) { inputX[i] = (disX(gen)); } @@ -76,13 +68,38 @@ int32_t main(int32_t argc, char* argv[]) for (int i = 0; i < outEleNum; ++i) { golden[i] = (inputX[i % eleNum]) + (inputY[i]); } +} + +void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&zDevice, uint8_t *&zHost) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(zDevice)); + CHECK_ACL(aclrtFreeHost(zHost)); +} +} + + +int32_t main(int32_t argc, char* argv[]) +{ + int32_t eleNum = 1 * 2048; + int32_t outEleNum = 8 * 2048; + std::vector shapeIn{1, 2048}; // 测试输入shape + std::vector shapeOut{8, 2048}; // 测试输入shape + + size_t inputByteSize = static_cast(eleNum) * sizeof(float); + size_t outputByteSize = static_cast(outEleNum) * sizeof(float); + + std::vector inputX(eleNum); + std::vector inputY(outEleNum); + std::vector golden(outEleNum); + printf("Generate golden data successfully.\n"); // 初始化Acl资源 - CHECK_ACL(aclInit(nullptr)); aclrtContext context; + aclrtStream stream = nullptr; int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); + InitializeACL(context, stream, deviceId); uint8_t *zHost; uint8_t *xDevice; uint8_t *yDevice; @@ -110,15 +127,8 @@ int32_t main(int32_t argc, char* argv[]) CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); std::vector outputZ(reinterpret_cast(zHost), reinterpret_cast(zHost) + outEleNum); - // 释放Acl资源 - 
CHECK_ACL(aclrtFree(xDevice)); - CHECK_ACL(aclrtFree(yDevice)); - CHECK_ACL(aclrtFree(zDevice)); - CHECK_ACL(aclrtFreeHost(zHost)); - - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); + CleanUp(xDevice, yDevice, zDevice, zHost); + CleanACL(stream, context, deviceId); if (!VerifyResults(golden, outputZ)) { return -1; diff --git a/atvc/examples/add_with_scalar/add_with_scalar.cpp b/atvc/examples/add_with_scalar/add_with_scalar.cpp index d56bc6bd..511ea817 100644 --- a/atvc/examples/add_with_scalar/add_with_scalar.cpp +++ b/atvc/examples/add_with_scalar/add_with_scalar.cpp @@ -128,7 +128,7 @@ int main() std::vector outputZ(reinterpret_cast(zHost), reinterpret_cast(zHost) + eleNum); CleanUp(zHost, xDevice, yDevice, zDevice); - CleanACL(context, stream, deviceId); + CleanACL(stream, context, deviceId); if (!VerifyResults(golden, outputZ)) { return -1; diff --git a/atvc/examples/broadcast_to/broadcast_to.cpp b/atvc/examples/broadcast_to/broadcast_to.cpp index 7bd28495..9e864c62 100644 --- a/atvc/examples/broadcast_to/broadcast_to.cpp +++ b/atvc/examples/broadcast_to/broadcast_to.cpp @@ -115,7 +115,7 @@ int32_t main(int32_t argc, char* argv[]) // 释放Acl资源 CleanUp(xDevice, yDevice, yHost); - CleanACL(context, stream, deviceId); + CleanACL(stream, context, deviceId); for (int32_t i = 0; i < outEleNum; i++) { if (!IsClose(golden[i], outputY[i])) { diff --git a/atvc/examples/common/example_common.h b/atvc/examples/common/example_common.h index b10155e5..9fd704bf 100644 --- a/atvc/examples/common/example_common.h +++ b/atvc/examples/common/example_common.h @@ -1,67 +1,67 @@ -/** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. - * - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. 
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -#ifndef ATVC_EXAMPLE_COMMON_H -#define ATVC_EXAMPLE_COMMON_H -#include "acl/acl.h" - -namespace { -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) - -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - -bool VerifyResults(const std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("[ERROR]: Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclInit(nullptr)); - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - CHECK_ACL(aclrtCreateStream(&stream)); -} - -void CleanACL(aclrtStream &stream, aclrtContext &context, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} -} // namespace - +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATVC_EXAMPLE_COMMON_H +#define ATVC_EXAMPLE_COMMON_H +#include "acl/acl.h" + +namespace { +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0) + +static constexpr float REL_TOL = 1e-3f; +static constexpr float ABS_TOL = 1e-5f; + +bool IsClose(float a, float b) +{ + const float eps = 1e-40f; + float diff = std::abs(a - b); + return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); +} + +bool VerifyResults(const std::vector &golden, const std::vector &output) +{ + for (int32_t i = 0; i < golden.size(); i++) { + if (!IsClose(golden[i], output[i])) { + printf("[ERROR]: Accuracy verification failed! 
The expected value of element " + "in index [%d] is %f, but actual value is %f.\n", + i, + golden[i], + output[i]); + return false; + } + } + return true; +} + +void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void CleanACL(aclrtStream &stream, aclrtContext &context, int32_t deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} +} // namespace + #endif \ No newline at end of file diff --git a/atvc/examples/sinh_custom/sinh_custom.cpp b/atvc/examples/sinh_custom/sinh_custom.cpp index d8e0914e..800665a7 100644 --- a/atvc/examples/sinh_custom/sinh_custom.cpp +++ b/atvc/examples/sinh_custom/sinh_custom.cpp @@ -118,7 +118,7 @@ int main() CHECK_ACL(aclrtFree(yDevice)); CHECK_ACL(aclrtFreeHost(yHost)); - CleanACL(context, stream, deviceId); + CleanACL(stream, context, deviceId); if (!VerifyResults(golden, outputY)) { return -1; diff --git a/atvc/include/broadcast/broadcast_compute.h b/atvc/include/broadcast/broadcast_compute.h index dbd86ecd..7c079219 100644 --- a/atvc/include/broadcast/broadcast_compute.h +++ b/atvc/include/broadcast/broadcast_compute.h @@ -16,6 +16,10 @@ #include "broadcast/common/broadcast_common.h" namespace ATVC { +/** + * BroadcastCompute: Device-side template that performs the actual broadcast operation + * on LocalTensor data. 
+ */ template class BroadcastCompute { public: @@ -62,7 +66,7 @@ private: __aicore__ inline void ComputeABByBrcbCopy(AscendC::LocalTensor& src, uint32_t inputOffset, AscendC::LocalTensor& dst, uint32_t dimA, uint32_t dimB) { - constexpr uint32_t brcbProcCnt = 8; // 一次brcb 处理8个元素 + constexpr uint32_t brcbProcCnt = 8; // Process 8 elements with BRCB at once constexpr uint32_t dSize = sizeof(DataType); AscendC::BrcbRepeatParams repeatParam(dimB * dSize / ATVC::UB_ALIGN_32, brcbProcCnt * dimB * dSize / ATVC::UB_ALIGN_32); @@ -70,13 +74,14 @@ private: uint32_t i = brcbProcCnt; uint16_t step; while (i < dimB) { - step = i * 2 > dimB ? (dimB - i) : i; // 2: 每次循环 将已拷贝长度为i的元素拷贝到下一个dst,要保证不超出dimB + // 2: Each iteration copies the i elements already copied to the next position in dst, without exceeding dimB + step = i * 2 > dimB ? (dimB - i) : i; step = step * dSize / ATVC::UB_ALIGN_32; uint16_t stride = static_cast(dimB * dSize / ATVC::UB_ALIGN_32 - step); AscendC::DataCopyParams repeatParam = {static_cast(dimA), // blockCount [1, 4095] - step, // 单位为32B - stride, // 取值范围不能超uint16_t - stride}; // 取值范围不能超uint16_t + step, // The unit is 32B + stride, // The value range cannot exceed uint16_t + stride}; // The value range cannot exceed uint16_t AscendC::DataCopy(dst[i], dst, repeatParam); i = i + step * ATVC::UB_ALIGN_32 / dSize; AscendC::PipeBarrier(); diff --git a/atvc/include/broadcast/broadcast_host.h b/atvc/include/broadcast/broadcast_host.h index da1a6adf..3225b5b1 100644 --- a/atvc/include/broadcast/broadcast_host.h +++ b/atvc/include/broadcast/broadcast_host.h @@ -24,6 +24,14 @@ namespace ATVC { namespace Host { +/*! + * \brief Generates tiling parameters and policy for the Broadcast Template. + * \param[in] shapeIn, source tensor shape (may be broadcast to match `shapeOut`). + * \param[in] shapeOut, destination tensor shape after broadcasting. + * \param[out] policy, static policy of Broadcast Template. + * \param[out] param, dynamic param of Broadcast Template. 
+ * \return true – successfully, false – error. + */ template bool CalcBroadcastTiling(std::vector shapeIn, std::vector shapeOut, diff --git a/atvc/include/broadcast/broadcast_op_template.h b/atvc/include/broadcast/broadcast_op_template.h index 1ec0d6b1..d4c1f7b0 100644 --- a/atvc/include/broadcast/broadcast_op_template.h +++ b/atvc/include/broadcast/broadcast_op_template.h @@ -28,16 +28,21 @@ struct BroadcastDataView { uint32_t dimBSize; uint32_t inShape[ATVC::MAX_DIM]; uint32_t outShape[ATVC::MAX_DIM]; - uint32_t copyInSize; // 单核拷入数据量 - uint32_t A11; // 实际参与计算的A11 - uint32_t A12; // 实际参与计算的A12 - uint32_t B1; // 实际参与计算的B1 - uint32_t dimAOffset; // 输入输出数据在A维度的偏移量 - uint32_t dimBOffset; // 输入输出数据在B维度的偏移量 - uint32_t copyOutBaseOffset; // 核间数据拷出基址 + uint32_t copyInSize; // Single core copying data volume + uint32_t A11; // A11 actually involved in the calculation + uint32_t A12; // A12 actually participated in the calculation + uint32_t B1; // B1 who actually participated in the calculation + uint32_t dimAOffset; // The offset of input and output data in dimension A + uint32_t dimBOffset; // The offset of input and output data in the B dimension + uint32_t copyOutBaseOffset; // Copy the base address of inter nuclear data }; namespace Kernel { +/*! + * BroadcastCompute: Used to perform element operations between tensors when their shapes are inconsistent. + * By copying data in a dimension of length 1, the shapes of two tensors are aligned to support element wise + * addition operations. +*/ template class BroadcastOpTemplate { public: @@ -67,12 +72,13 @@ public: __aicore__ inline BroadcastOpTemplate() {} - /* - BroadcastOpTemplate对外运行接口,主要完成资源初始化、数据搬入、计算调度、数据搬出操作 - @param src: 输入数据的gm指针 - @param dst: 输出数据的gm指针 - @broadcastParam: broadcast的动态参数,包含tiling data, workspace等 - */ + /*! 
+ * \brief The external running interface of BroadcastOpTemplate mainly completes resource initialization, + * data migration, calculation scheduling and data migration operations + * \param src, GM pointer for input data + * \param dst, GM pointer for output data + * \param broadcastParam, dynamic parameters of broadcast, including tiling data, workspace, etc + */ template __aicore__ inline void Run(Args&&... args) { @@ -96,6 +102,13 @@ public: ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] End to run template function.\n"); } + /*! + * \brief Late parameter injection helper. In some fusion scenarios the host side needs to pass additional + * runtime parameters (e.g. fused-activation coefficients) after the kernel has already been launched. + * SetParam allows such parameters to be forwarded to the pre- and/or post-compute functors. + * \param[in] param, pointer to the BroadcastParam structure (tiling, workspace, etc.) + * \param[in] args, optional extra parameters consumed by PreCompute / PostCompute + */ template __aicore__ inline void SetParam(BroadcastParam *param, Args... 
args) { @@ -150,10 +163,10 @@ private: outputCount_ = PostInputCount + PostTempCount + PostOutputCount; } bufPool_.template Init(GetTPipePtr(), - inputCount_, // doublebuff需要的输入个数 - outputCount_, // 计算结果的个数,一般与inputNum保持一致 - tilingData_->A2 * tilingData_->A12 * DATA_SIZE, // 输入Tensor大小 - tilingData_->A2 * tilingData_->B2 * DATA_SIZE); // 输出Tensor大小 + inputCount_, // The number of inputs required for double buffer + outputCount_, // The number of calculation results is generally consistent with inputNum + tilingData_->A2 * tilingData_->A12 * DATA_SIZE, // Input Tensor size + tilingData_->A2 * tilingData_->B2 * DATA_SIZE); // Output Tensor Size } template @@ -261,7 +274,8 @@ private: return; } if (copyInOffset + view.copyInSize > view.dimASize) { - // 剩下的数据不够一次完整计算, 根据实际数据重新计算 + // If the remaining data is not sufficient for a complete calculation, + // recalculate based on the actual data view.copyInSize = view.dimASize - copyInOffset; view.A12 = OpsUtils::CeilDiv(view.copyInSize, tilingData_->A2); } @@ -292,19 +306,19 @@ private: __aicore__ inline uint32_t CalcCopyOutBaseOffset(BroadcastDataView &view) { uint32_t copyOutBaseOffset = 0; - // 计算拷出偏移基址 + // Calculate the base offset for copying out if (SelectBroadcastPolicy.patternID == AB_PATTERN::ABA) { - if (tilingData_->A0 != 1) { // 核间A切分, 取部分A + if (tilingData_->A0 != 1) { // If A is split across cores, add the partial-A offset. copyOutBaseOffset += view.dimAOffset; } - if (tilingData_->B0 != 1) { // 核间B切分,取部分B + if (tilingData_->B0 != 1) { // If B is split across cores, add the partial-B offset. copyOutBaseOffset += view.dimBOffset * view.dimASize; } } else { - if (tilingData_->A0 != 1) { // 核间A切分, 取部分A + if (tilingData_->A0 != 1) { // If A is split across cores, add the partial-A offset. copyOutBaseOffset += view.dimAOffset * view.dimBSize; } - if (tilingData_->B0 != 1) { // 核间B切分,取部分B + if (tilingData_->B0 != 1) { // If B is split across cores, add the partial-B offset. 
copyOutBaseOffset += view.dimBOffset; } } @@ -336,10 +350,10 @@ private: uint32_t dimBIdx = blockId % tilingData_->factorBTotalCnt; view.dimAOffset = dimAIdx * tilingData_->factorACntPerCore; view.dimBOffset = dimBIdx * tilingData_->factorBCntPerCore; - // 计算一次计算的输入数据大小 - view.copyInSize = view.A12 * tilingData_->A2; // 一次拷贝A12份数据, for循环计算A12次 + view.copyInSize = view.A12 * tilingData_->A2; if (view.dimAOffset + tilingData_->factorACntPerCore > view.dimASize) { - // 剩下的A维度的数据不够每个核分到的A数目,重新计算实际的A维度切分 + // The remaining data in the A dimension is not enough to allocate the number of A values to each kernel. + // Recalculate the actual A dimension segmentation. uint32_t realShape = view.dimASize - view.dimAOffset; uint32_t dimA1 = OpsUtils::CeilDiv(realShape, tilingData_->A2); if (dimA1 < view.A12) { @@ -369,7 +383,7 @@ private: return; } AscendC::DataCopyPad(input, srcGlobal_[copyInOffset], copyInParams, padParams); - ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast][CopyIn] Offset is %u, block len is %u " + ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast][CopyIn] Offset is %u, block len is %u " "block count is %u.\n", copyInOffset, copyInParams.blockLen, copyInParams.blockCount); } @@ -385,13 +399,13 @@ private: copyOutParams.srcStride = 0; if (view.outShape[1] + copyOutOffset % dstShape > dstShape) { - // 列非对齐, 按实际数据拷贝 + // Column is not aligned, copy according to actual data copyOutParams.srcStride = OpsUtils::CeilAlign(view.outShape[1], UB_ALIGN_COUNT) * DATA_SIZE; copyOutParams.blockLen = (dstShape - copyOutOffset % dstShape) * DATA_SIZE; copyOutParams.srcStride = (copyOutParams.srcStride - copyOutParams.blockLen) / ATVC::UB_ALIGN_32; } if (view.outShape[0] + copyOutOffset / dstShape > tilingData_->dstShape[0]) { - // 行非对齐, 按实际数据拷贝 + // Rows are not aligned, copy according to actual data copyOutParams.blockCount = (tilingData_->dstShape[0] - copyOutOffset / dstShape); } copyOutParams.dstStride = dstShape * DATA_SIZE - copyOutParams.blockLen; @@ -402,7 
+416,7 @@ private: return; } AscendC::DataCopyPad(dstGlobal_[copyOutOffset], output, copyOutParams); - ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast][CopyOut] Offset is %u, block len is %u block count is %u.\n", + ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast][CopyOut] Offset is %u, block len is %u block count is %u.\n", copyOutOffset, copyOutParams.blockLen, copyOutParams.blockCount); } diff --git a/atvc/include/broadcast/common/patterns.h b/atvc/include/broadcast/common/patterns.h index 57e27a28..867a4f88 100644 --- a/atvc/include/broadcast/common/patterns.h +++ b/atvc/include/broadcast/common/patterns.h @@ -35,14 +35,10 @@ struct PatternConstInfo { constexpr static int32_t Dim = dim; }; -struct A : public PatternConstInfo { -}; -struct BA : public PatternConstInfo { -}; -struct AB : public PatternConstInfo { -}; -struct ABA : public PatternConstInfo { -}; -} -} -#endif // ATVC_BROADCAST_PATTERNS_H \ No newline at end of file +struct A : public PatternConstInfo {}; +struct BA : public PatternConstInfo {}; +struct AB : public PatternConstInfo {}; +struct ABA : public PatternConstInfo {}; +} // namespace BroadcastPattern +} // namespace ATVC +#endif // ATVC_BROADCAST_PATTERNS_H \ No newline at end of file diff --git a/atvc/include/broadcast/tiling/broadcast_tiling.h b/atvc/include/broadcast/tiling/broadcast_tiling.h index 09bf4f55..e73dca97 100644 --- a/atvc/include/broadcast/tiling/broadcast_tiling.h +++ b/atvc/include/broadcast/tiling/broadcast_tiling.h @@ -56,8 +56,9 @@ public: { compileInfo_ = ATVC::GetOpCompileInfo(); /* Built-in tiling only support allocate unified memory evenly, so all we need to know is the definition of - the complete operator. If you wants to allocate UB memory unevenly, you need to know the definitions of - broadcast, pre-compute, post-compute separately, and extend tiling to support non-uniform distribution. */ + * the complete operator. 
If you wants to allocate UB memory unevenly, you need to know the definitions of + * broadcast, pre-compute, post-compute separately, and extend tiling to support non-uniform distribution. + */ broadcast_basic_num_ = BROADCAST_UB_NUM * param->nBufferNum; } @@ -168,13 +169,13 @@ private: if (!haveB) { shapeIn.emplace_back(oriShapeIn[i]); shapeOut.emplace_back(oriShapeOut[i]); - } else { // 连续B轴 + } else { // Continuous B-axis shapeIn.back() = 1; shapeOut.back() *= oriShapeOut[i]; } isCurB = true; haveB = true; - } else { // A轴 + } else { // A-axis if (isCurB && haveA) { printf("[ERROR]: [ATVC][Broadcast] Only support AB/BA!\n"); return false; @@ -182,7 +183,7 @@ private: if (!haveA) { shapeIn.emplace_back(oriShapeIn[i]); shapeOut.emplace_back(oriShapeOut[i]); - } else { // 连续A轴 + } else { // Continuous A-axis shapeIn.back() *= oriShapeIn[i]; shapeOut.back() *= oriShapeOut[i]; } @@ -294,13 +295,13 @@ private: bool CalcSplitParam(const std::vector& shape) { /* - BASIC_BLOCK = UB_TOTAL / 4 根据UB总大小动态分配,输入2份输出2份 - A2 B2 32B对齐 - A2 * B2 * sizeof(T) <= BASIC_BLOCK - A2 * A12 * size(T) <= BASIC_BLOCK - AB场景:B2尽量大 - BA场景:A2尽量大 - */ + * BASIC_BLOCK = UB_TOTAL / 4 Dynamically allocate based on the total size of UB, with 2 inputs and 2 outputs + * A2 B2 32B alignment + * A2 * B2 * sizeof(T) <= BASIC_BLOCK + * A2 * A12 * size(T) <= BASIC_BLOCK + * AB scenario: B2 should be as large as possible + * BA scenario: A2 should be as large as possible + */ uint64_t basicBlock = CalcBasicBlock(); uint64_t dSize = ge::GetSizeByDataType(opInput_.inputDtype); if (dSize == 0) { @@ -311,19 +312,19 @@ private: uint64_t cacheSize = OpsUtils::FloorDiv(basicBlock, dSize); uint32_t dimA = Pattern::TailA ? Pattern::Dim - 1 : Pattern::Dim - 2; // A uint32_t dimB = Pattern::TailA ? 
Pattern::Dim - 2 : Pattern::Dim - 1; // B - uint64_t i = OpsUtils::FloorAlign(shape[dimA], dUint); // 32B对齐 - uint64_t j = OpsUtils::FloorAlign(shape[dimB], dUint); // 32B对齐 + uint64_t i = OpsUtils::FloorAlign(shape[dimA], dUint); // 32B alignment + uint64_t j = OpsUtils::FloorAlign(shape[dimB], dUint); // 32B alignment ATVC::BroadcastOpTilingData& tilingData = param_->tilingData; - if constexpr (Pattern::TailA) { // 优先A轴打满 - tilingData.B2 = dUint; // B2最小值 + if constexpr (Pattern::TailA) { // Priority A-axis + tilingData.B2 = dUint; // B2 min value tilingData.A2 = i > OpsUtils::FloorDiv(cacheSize, dUint) ? OpsUtils::FloorDiv(cacheSize, dUint) : i; tilingData.B2 = OpsUtils::FloorAlign(OpsUtils::FloorDiv(cacheSize, tilingData.A2), dUint); if (tilingData.B2 > j) { tilingData.B2 = j; } - } else { // 优先B轴打满 - tilingData.A2 = dUint; // A2最小值 + } else { // Priority B-axis + tilingData.A2 = dUint; // A2 min value tilingData.B2 = j > OpsUtils::FloorDiv(cacheSize, dUint) ? OpsUtils::FloorDiv(cacheSize, dUint) : j; tilingData.A2 = OpsUtils::FloorAlign(OpsUtils::FloorDiv(cacheSize, tilingData.B2), dUint); if (tilingData.A2 > i) { @@ -331,32 +332,32 @@ private: } } - // 1.优先多核 A0 B0打满核后再计算核内循环 + // 1. 
Prioritize multi-core A0 B0 to fill up the kernel before calculating the intra kernel loop tilingData.A0 = OpsUtils::CeilDiv(shape[dimA], tilingData.A2); tilingData.B0 = OpsUtils::CeilDiv(shape[dimB], tilingData.B2); - // A0*B0为实际的block num 必须小于vectorCoreNum + // A0 * B0 is the actual block num, which must be less than vectorCoreNum while (tilingData.A0 * tilingData.B0 > compileInfo_.vectorCoreNum) { - if (tilingData.B0 > 1) { // 优先A0切轴 + if (tilingData.B0 > 1) { // Priority A0 axis cutting --tilingData.B0; } else { --tilingData.A0; } } - // 2.核内循环优先A12,因为A12只需要copyIn 1次 + // 2.Kernel loop prioritizes A12, as A12 only needs to copy In once tilingData.A12 = OpsUtils::CeilDiv(shape[dimA], tilingData.A2 * tilingData.A0); if (tilingData.A12 * tilingData.A2 > cacheSize) { tilingData.A12 = OpsUtils::FloorDiv(cacheSize, tilingData.A2); } tilingData.A11 = - OpsUtils::CeilDiv(shape[dimA], (tilingData.A0 * tilingData.A2 * tilingData.A12)); // 计算精确A11 + OpsUtils::CeilDiv(shape[dimA], (tilingData.A0 * tilingData.A2 * tilingData.A12)); // Calculate Accurate A11 tilingData.B1 = OpsUtils::CeilDiv(shape[dimB], (tilingData.B0 * tilingData.B2)); - // 3.最后重新计算A0 B0避免空核 + // 3.Finally recalculate A0 B0 to avoid empty nuclei tilingData.A0 = OpsUtils::CeilDiv(shape[dimA], tilingData.A2 * tilingData.A11 * tilingData.A12); tilingData.B0 = OpsUtils::CeilDiv(shape[dimB], tilingData.B2 * tilingData.B1); - // 4.写Tiling结果 + // 4.Write Tiling Results ExpandTilingParam(basicBlock); return CheckTilingParam(shape[dimA], shape[dimB]); } diff --git a/atvc/include/broadcast/utils/broadcast_buf_pool.h b/atvc/include/broadcast/utils/broadcast_buf_pool.h index db1a3620..02161524 100644 --- a/atvc/include/broadcast/utils/broadcast_buf_pool.h +++ b/atvc/include/broadcast/utils/broadcast_buf_pool.h @@ -28,21 +28,21 @@ struct BrcPoolManagerUnit { int32_t offset = 0; }; -template +template class BroadcastBufPool { constexpr static int32_t MAX_INPUT_SIZE = 10; public: __aicore__ inline BroadcastBufPool() {}; 
- template - __aicore__ inline void Init(AscendC::TPipe* pipeIn, - int32_t inputNum, // doublebuff需要的输入个数 - int32_t computeNum, // 计算结果的个数,一般与inputNum保持一致 - int32_t inBlockLen, // 一次计算的输入基本块大小 - int32_t outBlockLen) + template + __aicore__ inline void Init(AscendC::TPipe *pipeIn, + int32_t inputNum, // The number of inputs required for doublebuff + int32_t computeNum, // The number of calculation results is generally consistent with inputNum + int32_t inBlockLen, // Basic input block size for one calculation + int32_t outBlockLen) { - // 一次计算的输出大小 + // Basic input block size for one calculation /* _______________________________________________________________________________________________________ | inputTensor 0 | inputTensor 1 | outputTensor 0 | outputTensor 0 | @@ -68,7 +68,7 @@ public: } template - __aicore__ inline const void AllocTensor(AscendC::LocalTensor& tensor) + __aicore__ inline const void AllocTensor(AscendC::LocalTensor &tensor) { if constexpr (IsInput) { int32_t idx = GetInputTensorId(); @@ -80,16 +80,16 @@ public: } template - __aicore__ inline const void FreeTensor(AscendC::LocalTensor& tensor) + __aicore__ inline const void FreeTensor(AscendC::LocalTensor &tensor) { if constexpr (!IsInput) { uint32_t idx = GetOutputTensorIdx(tensor); - isBusyOut_[idx] = false; // 恢复isBusy_状态 + isBusyOut_[idx] = false; // Restore isBusy_ state } } template - __aicore__ inline const void SetVecSync(AscendC::LocalTensor& tensor) + __aicore__ inline const void SetVecSync(AscendC::LocalTensor &tensor) { uint32_t idx = GetInputTensorIdx(tensor); event_t eventId = static_cast(pipe_->AllocEventID()); @@ -98,7 +98,7 @@ public: } template - __aicore__ inline const void WaitVecSync(AscendC::LocalTensor& tensor) + __aicore__ inline const void WaitVecSync(AscendC::LocalTensor &tensor) { uint32_t idx = GetInputTensorIdx(tensor); AscendC::WaitFlag(vecEventId_[idx]); @@ -106,7 +106,7 @@ public: } template - __aicore__ inline const void SetCopyOutSync(AscendC::LocalTensor& 
tensor) + __aicore__ inline const void SetCopyOutSync(AscendC::LocalTensor &tensor) { uint32_t idx = GetOutputTensorIdx(tensor); event_t eventId = static_cast(pipe_->AllocEventID()); @@ -115,7 +115,7 @@ public: } template - __aicore__ inline const void WaitCopyOutSync(AscendC::LocalTensor& tensor) + __aicore__ inline const void WaitCopyOutSync(AscendC::LocalTensor &tensor) { uint32_t idx = GetOutputTensorIdx(tensor); AscendC::WaitFlag(outEventId_[idx]); @@ -123,7 +123,7 @@ public: } template - __aicore__ inline uint32_t GetInputTensorIdx(AscendC::LocalTensor& tensor) + __aicore__ inline uint32_t GetInputTensorIdx(AscendC::LocalTensor &tensor) { uint64_t start = (uint64_t)qQue_.GetWithOffset(inputNum_, 0).GetPhyAddr(); uint64_t offset = (uint64_t)tensor.GetPhyAddr(); @@ -132,7 +132,7 @@ public: } template - __aicore__ inline uint32_t GetOutputTensorIdx(AscendC::LocalTensor& tensor) + __aicore__ inline uint32_t GetOutputTensorIdx(AscendC::LocalTensor &tensor) { uint64_t start = (uint64_t)qQue_.GetWithOffset(outputNum_, inputNum_).GetPhyAddr(); uint64_t offset = (uint64_t)tensor.GetPhyAddr(); @@ -155,9 +155,12 @@ private: break; } ++loopCnt; - } while (loopCnt < ATVC::CONST10); // 10: 最多找10次,实际上 每个循环计算和拷贝间有流水同步,这里基本循环1次即可 + } while ( + // 10: At most 10 times can be searched, in fact, there is pipeline synchronization between + // each cycle calculation and copying. 
Here, basically one cycle is enough + loopCnt < ATVC::CONST10); - isBusyOut_[computeUnit_.idx] = true; // 标识isBusy_状态为busy + isBusyOut_[computeUnit_.idx] = true; return computeUnit_.idx; } @@ -173,10 +176,10 @@ private: event_t outEventId_[MAX_INPUT_SIZE]; bool isBusyOut_[MAX_INPUT_SIZE] = {false}; AscendC::TBuf<> qQue_; - AscendC::TPipe* pipe_; + AscendC::TPipe *pipe_; int32_t inputNum_; int32_t outputNum_; }; -} // namespace KernelUtils -} // namespace ATVC -#endif // ATVC_BROADCAST_BUF_POOL_H +} // namespace KernelUtils +} // namespace ATVC +#endif // ATVC_BROADCAST_BUF_POOL_H diff --git a/atvc/include/common/ops_utils_device.h b/atvc/include/common/ops_utils_device.h index 0d06d0b5..416dc2ed 100644 --- a/atvc/include/common/ops_utils_device.h +++ b/atvc/include/common/ops_utils_device.h @@ -1,78 +1,78 @@ -/** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. - * - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! 
- * \file ops_utils_device.h - * \brief - */ - -#ifndef ATVC_OPS_UTILS_DEVICE_H -#define ATVC_OPS_UTILS_DEVICE_H - -#include "kernel_operator.h" -namespace OpsUtils { -template -__aicore__ inline T Ceil(T a, T b) -{ - if (b == 0) { - return a; - } - return (a + b - 1) / b; -} - -template -__aicore__ inline T CeilAlign(T a, T b) -{ - if (b == 0) { - return a; - } - return (a + b - 1) / b * b; -} - -template -__aicore__ inline T CeilDiv(T a, T b) -{ - if (b == 0) { - return a; - } - return (a + b - 1) / b; -} - -template -__aicore__ inline T FloorDiv(T a, U b) -{ - if (b == 0) { - return a; - } - return a / b; -} - -template -__aicore__ inline T Aligned(T value, T alignment) -{ - if (alignment == 0) { - return value; - } - return (value + alignment - 1) / alignment * alignment; -} - -/** - * if align is 0, return 0 - */ -template -__aicore__ inline typename std::enable_if::value, T>::type FloorAlign(T x, U align) -{ - return align == 0 ? 0 : x / align * align; -} - -} // namespace OpsUtils - +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file ops_utils_device.h + * \brief + */ + +#ifndef ATVC_OPS_UTILS_DEVICE_H +#define ATVC_OPS_UTILS_DEVICE_H + +#include "kernel_operator.h" +namespace OpsUtils { +template +__aicore__ inline T Ceil(T a, T b) +{ + if (b == 0) { + return a; + } + return (a + b - 1) / b; +} + +template +__aicore__ inline T CeilAlign(T a, T b) +{ + if (b == 0) { + return a; + } + return (a + b - 1) / b * b; +} + +template +__aicore__ inline T CeilDiv(T a, T b) +{ + if (b == 0) { + return a; + } + return (a + b - 1) / b; +} + +template +__aicore__ inline T FloorDiv(T a, U b) +{ + if (b == 0) { + return a; + } + return a / b; +} + +template +__aicore__ inline T Aligned(T value, T alignment) +{ + if (alignment == 0) { + return value; + } + return (value + alignment - 1) / alignment * alignment; +} + +/** + * if align is 0, return 0 + */ +template +__aicore__ inline typename std::enable_if::value, T>::type FloorAlign(T x, U align) +{ + return align == 0 ? 0 : x / align * align; +} + +} // namespace OpsUtils + #endif // ATVC_OPS_UTILS_DEVICE_H \ No newline at end of file diff --git a/atvc/include/elewise/common/elewise_common.h b/atvc/include/elewise/common/elewise_common.h index fddde8ef..e008636a 100644 --- a/atvc/include/elewise/common/elewise_common.h +++ b/atvc/include/elewise/common/elewise_common.h @@ -14,18 +14,21 @@ namespace ATVC { struct EleWiseTilingData { - uint32_t tailBlockCnt; // 需要额外执行一次循环的核的数量 - uint32_t tailElemCnt; // 尾块元素数量 - uint32_t numPerBlock; // 每个核需计算的基本块个数 - uint32_t tiledCnt; // 基本块元素个数 - uint32_t blockNum; // 执行核数 + uint32_t tailBlockCnt; // The number of cores that need to execute an additional loop + uint32_t tailElemCnt; // The number of tail block elements + uint32_t numPerBlock; // The number of basic blocks to be calculated for each core + uint32_t tiledCnt; // The number of basic block elements + uint32_t blockNum; // The number of cores used for execution }; +/*! + * \brief EleWiseParam dynamic param of EleWise Template. 
+ */ struct EleWiseParam { - EleWiseTilingData tilingData; // 影响数据搬运的相关参数 - uint32_t totalCnt = 0; // 单Tensor的元素个数 - uint32_t nBufferNum = 2; // 每个Queue中的Tensor数量 + EleWiseTilingData tilingData; // Related parameters affecting data handling + uint32_t totalCnt = 0; // The number of elements in a single Tensor + uint32_t nBufferNum = 2; // The number of Tensors in each queue }; -} +} // namespace ATVC #endif diff --git a/atvc/include/elewise/elewise_device.h b/atvc/include/elewise/elewise_device.h index e5bff70e..a35ee8f4 100644 --- a/atvc/include/elewise/elewise_device.h +++ b/atvc/include/elewise/elewise_device.h @@ -9,7 +9,6 @@ * See LICENSE in the root of the software repository for the full text of the License. */ - /*! * \file elewise_device.h * \brief @@ -25,4 +24,4 @@ #include "elewise/elewise_op_template.h" #include "kernel_operator.h" -#endif // ATVC_ELE_WISE_DEVICE_H \ No newline at end of file +#endif // ATVC_ELE_WISE_DEVICE_H \ No newline at end of file diff --git a/atvc/include/elewise/elewise_host.h b/atvc/include/elewise/elewise_host.h index ca91426e..ed5880af 100644 --- a/atvc/include/elewise/elewise_host.h +++ b/atvc/include/elewise/elewise_host.h @@ -9,7 +9,6 @@ * See LICENSE in the root of the software repository for the full text of the License. */ - /*! 
* \file elewise_host.h * \brief @@ -26,16 +25,16 @@ namespace ATVC { namespace Host { -constexpr uint32_t BASIC_CNT_MIN = 32; // Min number of basicBlock elements, because 32B alignment is required in UB -constexpr uint32_t MAX_SHAPE_NODE = 3; // number of Data segmentation node +constexpr uint32_t BASIC_CNT_MIN = 32; // Min number of basicBlock elements, because 32B alignment is required in UB +constexpr uint32_t MAX_SHAPE_NODE = 3; // number of Data segmentation node struct EleWiseTilingHyperParam { - uint32_t singleCoreBaseLine = 512; // data volume baseline for a core, the valid setting range: [256, 128 * 1024] - float ubSizeLimitThreshold = 0.95f; // UB memory usage upper limit,determines the maximum value of basicBlock - uint32_t nBufferNum = 2; // The number of parallelism buffer, the valid setting range: [1, 2] - uint32_t splitDataShape[MAX_SHAPE_NODE] = {1024, 32 * 1024, 64 * 1024}; // Segmentation nodes for shape - uint32_t dataSplitFactor[MAX_SHAPE_NODE + 1] = {4, 4, 8, 6}; // The split coefficient for each segmentation nodes - uint32_t rsvLiveCnt = 0; // Additional surviving nodes, means to reserve a portion of UB space. + uint32_t singleCoreBaseLine = 512; // data volume baseline for a core, the valid setting range: [256, 128 * 1024] + float ubSizeLimitThreshold = 0.95f; // UB memory usage upper limit,determines the maximum value of basicBlock + uint32_t nBufferNum = 2; // The number of parallelism buffer, the valid setting range: [1, 2] + uint32_t splitDataShape[MAX_SHAPE_NODE] = {1024, 32 * 1024, 64 * 1024}; // Segmentation nodes for shape + uint32_t dataSplitFactor[MAX_SHAPE_NODE + 1] = {4, 4, 8, 6}; // The split coefficient for each segmentation nodes + uint32_t rsvLiveCnt = 0; // Additional surviving nodes, means to reserve a portion of UB space. 
}; bool CheckEleWiseHyperParam(const EleWiseTilingHyperParam &hyperParam) @@ -87,17 +86,15 @@ bool CheckEleWiseHyperParam(const EleWiseTilingHyperParam &hyperParam) return true; } -int32_t GetEleWiseBasicCnt(const EleWiseTilingHyperParam &hyperParam, - int32_t totalCnt, - uint32_t blockNum, - uint32_t ubufLimitCnt) +int32_t GetEleWiseBasicCnt( + const EleWiseTilingHyperParam &hyperParam, int32_t totalCnt, uint32_t blockNum, uint32_t ubufLimitCnt) { uint32_t basicCnt = 0; if (blockNum == 0) { return 0; } uint32_t avgElePerBlock = totalCnt / blockNum; - for (uint32_t i =0; i < MAX_SHAPE_NODE; i++) { + for (uint32_t i = 0; i < MAX_SHAPE_NODE; i++) { if (avgElePerBlock <= hyperParam.splitDataShape[i]) { basicCnt = avgElePerBlock / hyperParam.dataSplitFactor[i]; break; @@ -106,28 +103,28 @@ int32_t GetEleWiseBasicCnt(const EleWiseTilingHyperParam &hyperParam, if (avgElePerBlock > hyperParam.splitDataShape[MAX_SHAPE_NODE - 1]) { basicCnt = avgElePerBlock / hyperParam.dataSplitFactor[MAX_SHAPE_NODE]; } - // 控制basicCnt不超上限 - if(basicCnt > ubufLimitCnt) { + // basicCnt must be smaller than the upper limit. + if (basicCnt > ubufLimitCnt) { basicCnt = ubufLimitCnt; } - // 保证UB数据块32B对齐 + // Ensure alignment of UB data block 32B basicCnt = basicCnt / BASIC_CNT_MIN * BASIC_CNT_MIN; - // 控制basicCnt不小于最小数据量 - if(basicCnt < BASIC_CNT_MIN) { + // Control basicCnt to not be less than the minimum data size + if (basicCnt < BASIC_CNT_MIN) { basicCnt = BASIC_CNT_MIN; } return basicCnt; } -/** - * @brief 计算EleWise的EleWiseParam运行态参数 - * @param totalCnt 单个输入的总元素个数 - * @param param 输出参数。 - * @return bool 返回true表示计算成功,false表示失败。 +/*! + * \brief Calculate the operational parameters of EleWiseParam for EleWise + * \param[in] totalCnt, the total number of elements in a single input + * \param[out] param, output parameters. + * \return Return true to indicate calculation success, false to indicate failure. 
*/ template -bool CalcEleWiseTiling(int32_t totalCnt, ATVC::EleWiseParam &param, - EleWiseTilingHyperParam hyperParam = EleWiseTilingHyperParam()) +bool CalcEleWiseTiling( + int32_t totalCnt, ATVC::EleWiseParam &param, EleWiseTilingHyperParam hyperParam = EleWiseTilingHyperParam()) { if (!CheckEleWiseHyperParam(hyperParam)) { return false; @@ -135,11 +132,11 @@ bool CalcEleWiseTiling(int32_t totalCnt, ATVC::EleWiseParam &param, using Inputs = typename OpTraits::In::types; using Outputs = typename OpTraits::Out::types; using Temps = typename OpTraits::Temp::types; - // xxTensroSumBytes表示TensorList里面所有数据类型长度的累加值, xxTensroSumBytes = sum(sizeof(Tensor_i::type)) + // XX_TENSOR_SUM_BYTES is the sum of sizeof(Tensor_i::type) over all tensors in the TensorList static constexpr size_t IN_TENSOR_SUM_BYTES = ATVC::TypeListReduce, SumSizes>::Type::VALUE; static constexpr size_t OUT_TENSOR_SUM_BYTES = ATVC::TypeListReduce, SumSizes>::Type::VALUE; static constexpr size_t TEMP_TENSOR_SUM_BYTES = ATVC::TypeListReduce, SumSizes>::Type::VALUE; - uint32_t tensorSumBytes = + uint32_t tensorSumBytes = (IN_TENSOR_SUM_BYTES + OUT_TENSOR_SUM_BYTES) * hyperParam.nBufferNum + TEMP_TENSOR_SUM_BYTES; if (hyperParam.rsvLiveCnt > 0) { tensorSumBytes = @@ -165,11 +162,11 @@ bool CalcEleWiseTiling(int32_t totalCnt, ATVC::EleWiseParam &param, uint32_t totalCopyCnt = totalCnt / basicCnt; param.tilingData.tailBlockCnt = (totalCopyCnt) % blockNum; param.tilingData.blockNum = blockNum; - param.tilingData.numPerBlock = totalCopyCnt / blockNum; // 每个block要搬运的基本块数量 - param.tilingData.tailElemCnt = totalCnt % basicCnt; // 尾块元素个数 + param.tilingData.numPerBlock = totalCopyCnt / blockNum; // The number of basic blocks to be transported per block + param.tilingData.tailElemCnt = totalCnt % basicCnt; // The number of tail block elements param.nBufferNum = hyperParam.nBufferNum; return true; }; -} -} // namespace ATVC -#endif // ATVC_ELE_WISE_HOST_H \ No newline at end of file +} // namespace Host +} // namespace ATVC +#endif //
ATVC_ELE_WISE_HOST_H \ No newline at end of file diff --git a/atvc/include/elewise/elewise_op_template.h b/atvc/include/elewise/elewise_op_template.h index 28c4311d..e1a78c45 100644 --- a/atvc/include/elewise/elewise_op_template.h +++ b/atvc/include/elewise/elewise_op_template.h @@ -21,27 +21,42 @@ #include "common/atvc_opdef.h" #include "common/const_def.h" #include "elewise/common/elewise_common.h" -#include "elewise/elewise_utils/elewise_util.h" +#include "elewise/utils/elewise_util.h" namespace ATVC { namespace Kernel { +/*! + * \brief EleWiseOpTemplate provides templates for element level operations on tensors, + * including but not limited to addition, subtraction, multiplication, division, as well as + * mathematical functions such as exponentiation, logarithm, trigonometric functions, etc. + * The characteristic of this type of operator is that it performs calculation operations + * element by element without changing the shape of the input data. + */ template class EleWiseOpTemplate { using EleWiseOpTraits = typename GetFunctionTraits::ComputeTraits; using Inputs = typename EleWiseOpTraits::In::types; using Outputs = typename EleWiseOpTraits::Out::types; using Temps = typename EleWiseOpTraits::Temp::types; - // xxCount表示TensorList里面有几个Tensor + // xCount represents how many tensors are in the tensorList static constexpr size_t INPUT_COUNT = ATVC::TypeListSize::VALUE; static constexpr size_t OUTPUT_COUNT = ATVC::TypeListSize::VALUE; static constexpr size_t TEMP_COUNT = ATVC::TypeListSize::VALUE; - // xxTensroSumBytes表示TensorList里面所有数据类型长度的累加值, xxTensroSumBytes = sum(sizeof(Tensor_i::type)) + // xxTensroSumbytes represents the cumulative length of all data types in tensorList, + // xxTensroSumBytes = sum(sizeof(Tensor_i::type)) static constexpr size_t IN_TENSOR_SUM_BYTES = ATVC::TypeListReduce, SumSizes>::Type::VALUE; static constexpr size_t OUT_TENSOR_SUM_BYTES = ATVC::TypeListReduce, SumSizes>::Type::VALUE; static constexpr size_t TEMP_TENSOR_SUM_BYTES = 
ATVC::TypeListReduce, SumSizes>::Type::VALUE; public: __aicore__ inline EleWiseOpTemplate() {} + /*! + * \brief The external running interface of EleWiseOpTemplate mainly completes resource initialization, + * data copy-in, calculation scheduling, and data copy-out operations + * \param src, GM pointer for input data + * \param dst, GM pointer for outputting data + * \param broadcastParam, dynamic parameters of broadcast, including tiling data, workspace, etc + */ template __aicore__ inline void Run(Args&&... args) { @@ -109,10 +124,11 @@ private: return true; } - // 申请LocalTensor等资源,初始化本核计算的GlobalTensor + // Allocate LocalTensor and other resources, and initialize the GlobalTensor computed by the current core __aicore__ inline void Init() { - // in/out/temp各自使用一个pipe进行管理,每个pipe里面管理的是ub地址连续的多个tensor + // Each in/out/temp uses a pipe for management, + // and each pipe manages multiple tensors with consecutive UB addresses if constexpr (INPUT_COUNT > 0) { g_pipe.InitBuffer(inQueue, param_->nBufferNum, param_->tilingData.tiledCnt * IN_TENSOR_SUM_BYTES); } @@ -123,8 +139,8 @@ private: g_pipe.InitBuffer(tempQueue, param_->tilingData.tiledCnt * TEMP_TENSOR_SUM_BYTES); } } - // 根据tiling循环调用CopyIn/CopyOut,以及外部传入的Compute计算 - // 如果有尾块,则处理尾块的CopyIn/Compute/CopyOut + // Loop according to the tiling, calling CopyIn/CopyOut and the externally supplied Compute + // If there is a tail block, process the CopyIn/Compute/CopyOut of the tail block template __aicore__ inline void Process(Args&&... 
args) { @@ -140,7 +156,7 @@ private: uint32_t tailCnt = curCoreCnt_ % param_->tilingData.tiledCnt; offsetCnt_ = 0; caclCnt_ = param_->tilingData.tiledCnt; - // 循环处理主块数据 + // Loop processing of main block data for (uint32_t i = 0; i < repeat; i++) { CopyIn(inTensors, ATVC::MakeIndexSequence{}); Compute(inTensors, outTensors, tempTensors, ATVC::MakeIndexSequence{}, @@ -150,7 +166,7 @@ private: offsetCnt_ += caclCnt_; } - // 如果有尾块,则处理尾块 + // If there is a tail block, process the tail block if (tailCnt > 0) { caclCnt_ = tailCnt; CopyIn(inTensors, ATVC::MakeIndexSequence{}); @@ -160,11 +176,11 @@ private: CopyOut(outTensors, ATVC::MakeIndexSequence{}); } } - // 模拟单个 Tensor 的处理逻辑:入参为类型 T 对应的 Tensor 变量 + // Simulate the processing logic of a single Tensor: input the Tensor variable corresponding to type T template __aicore__ inline void CopyInAllTensors(AscendC::LocalTensor& inLocal, int32_t i, T& tensorInfo) { - // 单个 Tensor 的处理逻辑 + // The processing logic of a single Tensor auto inLocalI = inLocal[tensorInfo.localOffset].template ReinterpretCast(); using DataType = typename T::DataType; @@ -185,10 +201,10 @@ private: } } - // 对应于没有 Tensor 时候的空处理逻辑 + // Corresponding to the empty processing logic without Tensor __aicore__ inline void CopyInAllTensors(AscendC::LocalTensor& inLocal, int32_t i) {} - // 所有 Tensor 的处理入口逻辑:递归完成对每个 Tensor 的处理 + // The entry logic for processing all Tensors: recursively completing the processing of each Tensor template __aicore__ inline void CopyInAllTensors(AscendC::LocalTensor& inLocal, int32_t i, T& first, Ts&... 
rest) { @@ -196,7 +212,7 @@ private: CopyInAllTensors(inLocal, ++i, rest...); } - // 将所有输入tensor从gm拷贝到local + // Copy all input tensors from gm to local template __aicore__ inline void CopyIn(InTuple& inTensors, ATVC::IndexSequence) { @@ -205,18 +221,18 @@ private: return; } AscendC::LocalTensor inLocal = inQueue.template AllocTensor(); - // TypeListGetOffset是从TypeList里面获得当前Tensor以Bytes为单位的偏移 - // 例如TypeList,tensor_0的偏移为0, tensor_1的偏移为sizeof(float), - // tensor_2的偏移为sizeof(float)+sizeof(half) + // TypeListGetOffset is the offset of the current Tensor in bytes obtained from the TypeList + // For example, TypeList, tensor_0 has an offset of 0, + // tensor_1 has an offset of sizeof(float), and tensor_2 has an offset of sizeof(float)+sizeof(half) CopyInAllTensors(inLocal, 0, ATVC::TupleElemGet(inTensors)...); inQueue.EnQue(inLocal); } - // 模拟单个 Tensor 的处理逻辑:入参为类型 T 对应的 Tensor 变量 + // Simulate the processing logic of a single Tensor: input the Tensor variable corresponding to type T template __aicore__ inline void CopyOutAllTensors(AscendC::LocalTensor& outLocal, int32_t i, T& tensorInfo) { - // 单个 Tensor 的处理逻辑 + // The processing logic of a single Tensor auto outLocalI = outLocal[tensorInfo.localOffset].template ReinterpretCast(); using DataType = typename T::DataType; constexpr uint32_t TYPE_ALIGN_CNT = 32 / sizeof(DataType); @@ -234,11 +250,11 @@ private: } } - // 对应于没有 Tensor 时候的空处理逻辑 + // Corresponding to the empty processing logic without Tensor __aicore__ inline void CopyOutAllTensors(AscendC::LocalTensor& outLocal, int32_t i) { } - // 所有 Tensor 的处理入口逻辑:递归完成对每个 Tensor 的处理 + // The entry logic for processing all Tensors: recursively completing the processing of each Tensor template __aicore__ inline void CopyOutAllTensors(AscendC::LocalTensor &outLocal, int32_t i, @@ -247,7 +263,7 @@ private: CopyOutAllTensors(outLocal, i, first); CopyOutAllTensors(outLocal, ++i, rest...); } - // 将所有输出tensor拷贝到gm + // Copy all output tensors to gm template __aicore__ inline 
void CopyOut(OutTuple& outTensors, ATVC::IndexSequence) { @@ -278,7 +294,7 @@ private: template __aicore__ inline void InitInputTensors(InTuple& tuple, std::size_t cnt, ATVC::IndexSequence) { - // 初始化每个 Tensor + // Initialize each Tensor [[maybe_unused]] int32_t dummy[] = {0, (InitInputTensor(ATVC::TupleElemGet(tuple), cnt, Is), 0)...}; } @@ -346,7 +362,6 @@ private: AscendC::TQue outQueue; AscendC::TBuf tempQueue; - // 全局变量 AscendC::GlobalTensor inGMAddrs_[INPUT_COUNT]; AscendC::GlobalTensor outGMAddrs_[OUTPUT_COUNT]; @@ -354,7 +369,7 @@ private: std::size_t outOffsets_[OUTPUT_COUNT]; std::size_t tempOffsets_[TEMP_COUNT]; - // 计算得到的tiling数据 + // Calculated tiling data ATVC::EleWiseParam* param_; uint32_t curCoreCnt_; @@ -362,7 +377,7 @@ private: int32_t offsetCnt_; int32_t caclCnt_; - // 算子开发者传入的计算对象 + // The calculation object passed in by user EleWiseCompute compute_; }; } diff --git a/atvc/include/elewise/elewise_utils/elewise_util.h b/atvc/include/elewise/utils/elewise_util.h similarity index 82% rename from atvc/include/elewise/elewise_utils/elewise_util.h rename to atvc/include/elewise/utils/elewise_util.h index 1a2d9d90..f5154281 100644 --- a/atvc/include/elewise/elewise_utils/elewise_util.h +++ b/atvc/include/elewise/utils/elewise_util.h @@ -22,11 +22,12 @@ namespace ATVC { namespace KernelUtils { template -__aicore__ inline void PrintParam(const T* param) +__aicore__ inline void PrintParam(const T *param) { ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] Tiling data: blockNum = %u\n", param->tilingData.blockNum); ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] Tiling data: tiledCnt = %u\n", param->tilingData.tiledCnt); - ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] Tiling data: tailBlockCnt = %u\n", param->tilingData.tailBlockCnt); + ATVC::Kernel::DebugPrintf( + "[INFO]:[ATVC][EleWise] Tiling data: tailBlockCnt = %u\n", param->tilingData.tailBlockCnt); ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] Tiling data: numPerBlock = %u\n", 
param->tilingData.numPerBlock); ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] Tiling data: tailElemCnt = %u\n", param->tilingData.tailElemCnt); ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] Param: nBufferNum = %u\n", param->nBufferNum); @@ -35,27 +36,32 @@ __aicore__ inline void PrintParam(const T* param) } template -__aicore__ inline bool CheckParam(const T* param) +__aicore__ inline bool CheckParam(const T *param) { auto *tilingData = &param->tilingData; if (tilingData->blockNum < AscendC::GetBlockIdx() + 1) { ATVC::Kernel::DebugPrintf("[ERROR]: [ATVC][EleWise] Tiling data[blockNum = %u] is invalid," - "it must be larger than current block number.\n", tilingData->blockNum); + "it must be larger than current block number.\n", + tilingData->blockNum); return false; } if (tilingData->tailElemCnt > tilingData->tiledCnt) { ATVC::Kernel::DebugPrintf("[ERROR]: [ATVC][EleWise] Tiling data[tailElemCnt = %u] is invalid," - "it must be smaller than tiledCnt(%u).\n", tilingData->blockNum, tilingData->tiledCnt); + "it must be smaller than tiledCnt(%u).\n", + tilingData->blockNum, + tilingData->tiledCnt); return false; } if (tilingData->tailBlockCnt > tilingData->blockNum) { ATVC::Kernel::DebugPrintf("[ERROR]: [ATVC][EleWise] Tiling data[tailBlockCnt = %u] is invalid," - "it must be smaller than blockNum(%u).\n", tilingData->blockNum, tilingData->blockNum); + "it must be smaller than blockNum(%u).\n", + tilingData->blockNum, + tilingData->blockNum); return false; } return true; } -} // namespace KernelUtils -} // namespace ATVC +} // namespace KernelUtils +} // namespace ATVC #endif \ No newline at end of file diff --git a/atvc/include/reduce/common/reduce_common.h b/atvc/include/reduce/common/reduce_common.h index 480c04d2..94c6e050 100644 --- a/atvc/include/reduce/common/reduce_common.h +++ b/atvc/include/reduce/common/reduce_common.h @@ -124,20 +124,22 @@ static constexpr ReducePolicy REDUCE_POLICY21 { AR_PATTERN::ARARA, AR_COUNT::A3R static constexpr ReducePolicy 
REDUCE_POLICY22 { AR_PATTERN::ARARA, AR_COUNT::A2R0, 0 }; struct ReduceTilingData { - uint64_t factorACntPerCore; - uint64_t factorATotalCnt; - uint64_t ubFactorA; - uint64_t factorRCntPerCore; - uint64_t factorRTotalCnt; - uint64_t ubFactorR; - uint64_t groupR; - uint64_t outSize; - uint64_t basicBlock; - int32_t coreNum; - float meanVar; - uint64_t shape[MAX_DIM]; - uint64_t stride[MAX_DIM]; - uint64_t dstStride[MAX_DIM]; + uint64_t factorACntPerCore; // The actual dimensions of non Reduce axes that do not participate + // in computation on each core + uint64_t factorATotalCnt; // The total dimension of non Reduce axes that do not participate in the calculation + uint64_t ubFactorA; // The amount of data on non Reduce axes within a single UB + uint64_t factorRCntPerCore; // The actual dimension of the Reduce axis involved in computation on each core + uint64_t factorRTotalCnt; // The total dimension of the Reduce axis involved in the calculation + uint64_t ubFactorR; // Reduce axis dimension involved in calculation within a single UB + uint64_t groupR; // The tangent axis is the R axis, and the relative data amount of R + // outside the tangent point on this axis + uint64_t outSize; // The total amount of AR data outside the cutting axis + uint64_t basicBlock; // The basic data block size + int32_t coreNum; // The number of running cores + float meanVar; // Reserved + uint64_t shape[MAX_DIM]; // Shape info + uint64_t stride[MAX_DIM]; // Input data transfer step size + uint64_t dstStride[MAX_DIM]; // Output data transfer step size }; struct ReduceParam { diff --git a/atvc/include/reduce/reduce_host.h b/atvc/include/reduce/reduce_host.h index 93348539..412f828c 100644 --- a/atvc/include/reduce/reduce_host.h +++ b/atvc/include/reduce/reduce_host.h @@ -22,7 +22,7 @@ namespace ATVC { namespace Host { /*! 
* \brief Validate the legitimacy of reduce tiling hyper param - * \param [in] hyperParam, reduce tiling hyper param + * \param[in] hyperParam, reduce tiling hyper param * \return bool result, return true if the hyper param is valid, otherwise return false. */ bool CheckReduceHyperParam(const ATVC::Host::ReduceTilingHyperParam &hyperParam) @@ -56,7 +56,7 @@ bool CheckReduceHyperParam(const ATVC::Host::ReduceTilingHyperParam &hyperParam) /*! * \brief Calculate the TilingData and policy parameters for Reduce. * \param[in] inputShape, shape of the tensor. - * \param[in] reduceDim, The dim that requires a Reduce operation. + * \param[in] reduceDim, the dim that requires a Reduce operation. * \param[out] policy, static policy of Reduce Template * \param[out] param, dynamic param of Reduce Template * \return bool Return true to indicate calculation success, false to indicate failure. diff --git a/atvc/include/reduce/reduce_op_template.h b/atvc/include/reduce/reduce_op_template.h index fe466a09..4cdde076 100644 --- a/atvc/include/reduce/reduce_op_template.h +++ b/atvc/include/reduce/reduce_op_template.h @@ -71,7 +71,8 @@ public: * \return void. */ template - __aicore__ inline void Run(GM_ADDR x, GM_ADDR y, ReduceParam* param) { + __aicore__ inline void Run(GM_ADDR x, GM_ADDR y, ReduceParam* param) + { ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Start to run template function.\n"); param_ = param; KernelUtils::PrintParam(param_); @@ -112,7 +113,7 @@ public: * \tparam IsInput, true – tensor will be used as input (read-only) * false – tensor will be used as output (read-write) * \tparam needDup, if true the buffer is duplicated (for double-buffering). - * \param[in] tensor, LocalTensor reference that receives the allocation. + * \param[in] tensor, localTensor reference that receives the allocation. */ template __aicore__ inline void AllocTensorAux(AscendC::LocalTensor& tensor) @@ -176,7 +177,7 @@ public: /*! 
* \brief Populate tiling data for the second (group) reduction phase. - * \param[in] groupTiling, Tiling structure to be filled. + * \param[in] groupTiling, tiling structure to be filled. * \return void. */ __aicore__ inline void SetGroupTiling(ATVC::ReduceTilingData& groupTiling) @@ -202,10 +203,10 @@ public: * \brief Copy input tensor to UB with optional padding. * \tparam isPadding, true – perform padding using PreCompute::GetPaddingValue * false – no padding - * \param[in] src, GlobalTensor source in GM. - * \param[in] view, View descriptor describing the copy geometry. - * \param[in] shape, Shape descriptor (modified when padding). - * \param[in] ubTensor, LocalTensor destination in UB. + * \param[in] src, globalTensor source in GM. + * \param[in] view, view descriptor describing the copy geometry. + * \param[in] shape, shape descriptor (modified when padding). + * \param[in] ubTensor, localTensor destination in UB. */ template __aicore__ inline void CopyInAux(const AscendC::GlobalTensor &src, diff --git a/atvc/include/reduce/reduce_sum.h b/atvc/include/reduce/reduce_sum.h index 08a24bf0..ac1ce5ea 100644 --- a/atvc/include/reduce/reduce_sum.h +++ b/atvc/include/reduce/reduce_sum.h @@ -41,7 +41,7 @@ namespace ATVC { template class ReduceSumCompute { public: - // 从OpTraits中萃取算子输入描述信息 + // Extract operator input description information from OpTraits using inputDTypeList = typename OpTraits::In::types; using DataType = typename ATVC::TypeListGet::Type; using PrompteDtype = typename KernelUtils::GetPromoteType::T; @@ -49,11 +49,11 @@ public: /*! * \brief Perform the actual reduction on a tile already resident in UB. - * \tparam needMask, True when UB alignment introduced invalid lanes. - * \tparam Pattern, One of ReducePattern::AR or ReducePattern::RA. + * \tparam needMask, true when UB alignment introduced invalid lanes. + * \tparam Pattern, one of ReducePattern::AR or ReducePattern::RA. * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. 
- * \param[out] dst, Destination tensor (length == dimA) - * \param[in] src, Source tensor (length == dimA * dimR) + * \param[out] dst, destination tensor (length == dimA) + * \param[in] src, source tensor (length == dimA * dimR) */ template __aicore__ inline void @@ -87,11 +87,11 @@ public: /*! * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis. - * \param[out] dst, Output tensor (length == dimA) - * \param[in] src, Input tensor (length == dimR * dimA), already resident in UB - * \param[in] dimA, Length of the non-reduced axis (A) - * \param[in] dimR, Length of the reduced axis (R) - * \param[in] mainR, Largest power-of-two ≤ dimR (computed by the caller) + * \param[out] dst, output tensor (length == dimA) + * \param[in] src, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ dimR (computed by the caller) */ __aicore__ inline void ReduceRA(const AscendC::LocalTensor &dst, @@ -153,12 +153,12 @@ public: /*! * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. 
- * \param[out] dstTensor, Output tensor (length == dimA) - * \param[in] srcTensor, Input tensor (length == dimR * dimA), already resident in UB - * \param[in] dimA, Length of the non-reduced axis (A) - * \param[in] dimR, Padded length of the reduced axis (R) - * \param[in] mainR, Largest power-of-two ≤ original R length - * \param[in] oriBurstLen, Original (un-padded) R length used to compute tail + * \param[out] dstTensor, output tensor (length == dimA) + * \param[in] srcTensor, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, padded length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ original R length + * \param[in] oriBurstLen, original (un-padded) R length used to compute tail */ __aicore__ inline void ReduceAR(const AscendC::LocalTensor &dstTensor, @@ -197,6 +197,15 @@ public: PerformFinalReduction(dstTensor, srcTensor, param); } + /*! + * \brief Merge the calculation results of different data base blocks within a single UB + * \tparam Pattern Compile-time pattern tag that decides A vs. B orientation. + * \tparam V Shape descriptor (encodes dimA and dimB at runtime). + * \param[in] index, logical index identifying the data-base block. + * \param[in] shape, runtime tensor shape (dimA, dimB). + * \param[in] tempBuf, UB tensor serving as the reduction cache. + * \param[in] computeRes, UB tensor holding the newest partial result. + */ template __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, const AscendC::LocalTensor& computeRes) @@ -229,9 +238,9 @@ public: /*! * \brief Binary reduction between two UB buffers. * \ Used for inter-core result merging when workspace staging is required. - * \param[in] ubTensorLeft, Left operand (in-place result). - * \param[in] ubTensorRight, Right operand (read-only). - * \param[in] calCount, Number of elements to reduce. 
+ * \param[in] ubTensorLeft, left operand (in-place result). + * \param[in] ubTensorRight, right operand (read-only). + * \param[in] calCount, number of elements to reduce. */ __aicore__ inline void ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, @@ -244,7 +253,7 @@ public: /*! * \brief Return the value used for padding when UB alignment is required. * For SUM-reduction the neutral element is 0. - * \tparam U, Scalar type identical to DataType or PromoteDataType. + * \tparam U, scalar type identical to DataType or PromoteDataType. * \return The padding value (always 0). */ template diff --git a/atvc/include/reduce/tiling/tiling_common.h b/atvc/include/reduce/tiling/tiling_common.h index 6d5c6dcf..e168b4cb 100644 --- a/atvc/include/reduce/tiling/tiling_common.h +++ b/atvc/include/reduce/tiling/tiling_common.h @@ -1,117 +1,117 @@ -/** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. - * - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- */ - -#ifndef ATVC_TILING_COMMON_H -#define ATVC_TILING_COMMON_H -#include "reduce/common/patterns.h" - -namespace OpTiling { -constexpr static int32_t CACHE_SIZE = 16 * 1024; // cahce size for ub reduce -constexpr static int32_t A_STEP_LEN = 4; - -struct ReduceTilingUnit { - int32_t idx = -1; // ub cut axis - uint64_t inner = 1; // inner size in ub - uint64_t outer = 1; // outer size of ub - uint64_t step = 1; // step of cacheline - void Update(int32_t idx, uint64_t inner, uint64_t outer, uint64_t step) - { - this->idx = idx; - this->inner = inner; - this->outer = outer; - this->step = step; - } -}; - -struct CacheLineBlock { - int32_t axis = -1; // cacheline cut axis - uint64_t size = 1; // cacheline size - uint64_t cacheLineStep = 1; // cacheline cut size for axis - uint64_t cacheLineOuter = 1; // relative to cacheLineStep, out size of cacheline cut axis - uint64_t aSize = 1; // A axis size in cacheline -}; - -struct ReduceTilingInputParam { - std::vector reduceDim = {}; - std::vector reduceShape = {}; - ge::DataType inputDtype = ge::DataType::DT_UNDEFINED; - ge::DataType promoteDtpye = ge::DataType::DT_UNDEFINED; - ReduceTilingInputParam(std::vector reduceDim_, std::vector reduceShape_, ge::DataType inputDtype_, - ge::DataType promoteDtpye_) - : reduceDim(reduceDim_), reduceShape(reduceShape_), inputDtype(inputDtype_), promoteDtpye(promoteDtpye_) - {} -}; - -void MakeWrapDim(const std::vector &shape, std::vector &axes) -{ - // EnsureNotScalar at least return 1-D Tensor, so shapeSize cannot be 0 - size_t shapeSize = shape.size(); - for (size_t i = 0; i < axes.size(); i++) { - if (axes[i] < 0) { - axes[i] += shapeSize; - } - } -} - -template -bool IsAxisA(int32_t idx) -{ - if (Pattern::FirstA) { - return idx % ATVC::CONST2 == 0; - } else { - return idx % ATVC::CONST2 == 1; - } -} - -int32_t IsAxesValid(const std::vector &shape, const std::vector &axes) -{ - size_t shapeSize = shape.size(); - size_t axesSize = axes.size(); - if (axesSize > shapeSize) { - 
printf("[ERROR]: [ATVC][Reduce] Axis size is greater than shape size.\n"); - return -1; - }; - - for (size_t i = 0; i < axesSize; i++) { - if (axes[i] >= static_cast(shapeSize) || axes[i] < 0) { - printf("[ERROR]: [ATVC][Reduce] Axis size incorrect.\n"); - return -1; - }; - } - return 0; -} - -template -bool IsEmtpyTensor(const std::vector &shape) -{ - for (int32_t i = 0; i < Pattern::Dim; i++) { - if (shape[i] == 0) { - return true; - } - } - return false; -} - -}; // namespace OpTiling - -namespace ATVC { -namespace Host { -// Hyper param for reduce tiling. -struct ReduceTilingHyperParam { - // Set the basic block memory size for Reduce, generally not exceeding 1/3 of the memory. It is recommended to set - // it between [48k-54k] - uint32_t basicBlock = 48 * 1024; - uint32_t maxInnerA = 128; // [128, 256] - double balanceThreshHold = 0.85; // Threshold level for multi-core equilibrium [0.8-0.95] -}; -} // namespace Host -} // namespace ATVC +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef ATVC_TILING_COMMON_H +#define ATVC_TILING_COMMON_H +#include "reduce/common/patterns.h" + +namespace OpTiling { +constexpr static int32_t CACHE_SIZE = 16 * 1024; // cahce size for ub reduce +constexpr static int32_t A_STEP_LEN = 4; + +struct ReduceTilingUnit { + int32_t idx = -1; // ub cut axis + uint64_t inner = 1; // inner size in ub + uint64_t outer = 1; // outer size of ub + uint64_t step = 1; // step of cacheline + void Update(int32_t idx, uint64_t inner, uint64_t outer, uint64_t step) + { + this->idx = idx; + this->inner = inner; + this->outer = outer; + this->step = step; + } +}; + +struct CacheLineBlock { + int32_t axis = -1; // cacheline cut axis + uint64_t size = 1; // cacheline size + uint64_t cacheLineStep = 1; // cacheline cut size for axis + uint64_t cacheLineOuter = 1; // relative to cacheLineStep, out size of cacheline cut axis + uint64_t aSize = 1; // A axis size in cacheline +}; + +struct ReduceTilingInputParam { + std::vector reduceDim = {}; + std::vector reduceShape = {}; + ge::DataType inputDtype = ge::DataType::DT_UNDEFINED; + ge::DataType promoteDtpye = ge::DataType::DT_UNDEFINED; + ReduceTilingInputParam(std::vector reduceDim_, std::vector reduceShape_, ge::DataType inputDtype_, + ge::DataType promoteDtpye_) + : reduceDim(reduceDim_), reduceShape(reduceShape_), inputDtype(inputDtype_), promoteDtpye(promoteDtpye_) + {} +}; + +void MakeWrapDim(const std::vector &shape, std::vector &axes) +{ + // EnsureNotScalar at least return 1-D Tensor, so shapeSize cannot be 0 + size_t shapeSize = shape.size(); + for (size_t i = 0; i < axes.size(); i++) { + if (axes[i] < 0) { + axes[i] += shapeSize; + } + } +} + +template +bool IsAxisA(int32_t idx) +{ + if (Pattern::FirstA) { + return idx % ATVC::CONST2 == 0; + } else { + return idx % ATVC::CONST2 == 1; + } +} + +int32_t IsAxesValid(const std::vector &shape, const std::vector &axes) +{ + size_t shapeSize = shape.size(); + size_t axesSize = axes.size(); + if (axesSize > shapeSize) { + 
printf("[ERROR]: [ATVC][Reduce] Axis size is greater than shape size.\n"); + return -1; + }; + + for (size_t i = 0; i < axesSize; i++) { + if (axes[i] >= static_cast(shapeSize) || axes[i] < 0) { + printf("[ERROR]: [ATVC][Reduce] Axis size incorrect.\n"); + return -1; + }; + } + return 0; +} + +template +bool IsEmtpyTensor(const std::vector &shape) +{ + for (int32_t i = 0; i < Pattern::Dim; i++) { + if (shape[i] == 0) { + return true; + } + } + return false; +} + +}; // namespace OpTiling + +namespace ATVC { +namespace Host { +// Hyper param for reduce tiling. +struct ReduceTilingHyperParam { + // Set the basic block memory size for Reduce, generally not exceeding 1/3 of the memory. It is recommended to set + // it between [48k-54k] + uint32_t basicBlock = 48 * 1024; + uint32_t maxInnerA = 128; // [128, 256] + double balanceThreshHold = 0.85; // Threshold level for multi-core equilibrium [0.8-0.95] +}; +} // namespace Host +} // namespace ATVC #endif // ATVC_TILING_COMMON_H \ No newline at end of file -- Gitee