diff --git a/LICENSE b/LICENSE index 72a0affe0e736781fc243ff6ec56571efd279641..b41cefbb51aa768c9ea00d3c92a6261fbaf04456 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ -CANN Open Software License Agreement Version 1.0 +CANN Open Software License Agreement Version 2.0 -This CANN Open Software License Agreement Version 1.0 (hereinafter referred to as this "Agreement") is a legal agreement between you and Huawei, and it governs your use, modification, or distribution of CANN Open Software (hereinafter referred to as "Software"). Please read this Agreement carefully. +This CANN Open Software License Agreement Version 2.0 (hereinafter referred to as this "Agreement") is a legal agreement between you and Huawei, and it governs your use, modification, or distribution of CANN Open Software (hereinafter referred to as "Software"). Please read this Agreement carefully. If you are entering into this Agreement on behalf of a company or other legal entity, you represent that you have the legal authority to bind that entity to this Agreement, in which case "you" will mean the entity you represent. @@ -10,28 +10,27 @@ BY DOWNLOADING, INSTALLING, OR USING THE SOFTWARE, YOU AGREE YOU HAVE FULLY UNDE 1.1 Software means the APIs, source code files, binaries, and related documents of Compute Architecture for Neural Networks("CANN") that are licensable by Huawei, and provided and licensed under this Agreement. -1.2 Ascend processors means the chipsets branded with "Ascend" that are manufactured and supplied by Huawei. +1.2 Huawei AI Processors mean AI chipsets (i) branded with "Ascend", "Kirin", "Yueying" or other brands owned or controlled by Huawei; or (ii) manufactured (including have manufactured), supplied (including have supplied) or designed (including have designed) by Huawei. 2. 
Grant of Intellectual Property Rights -Subject to the terms and conditions of this Agreement, including your full compliance thereof, Huawei hereby grants you a limited, worldwide, royalty-free, non-transferable, non-sublicensable, and revocable license for you to (i) download, use, modify, integrate, and distribute the Software or its derivative works for the purpose of developing software solely for use with Ascend processors, and (ii) distribute the software developed under (i) solely for use with Ascend processors. +2.1 Subject to the terms and conditions of this Agreement, including your full compliance thereof, Huawei hereby grants you a limited, worldwide, royalty-free, non-transferable, non-sublicensable, and revocable license for you to (i) download, use, modify, integrate, and distribute the Software or its derivative works for the purpose of developing software solely for use in systems with Huawei AI Processors and/or Software, and (ii) distribute any software developed based upon Software and/or its derivative works solely for use in systems with Huawei AI Processors and/or Software. 3. Restrictions -3.1 You are not authorized to, and shall not use, modify, or distribute this Software or its derivative works for any purpose except those expressly permitted by this Agreement. You shall not make any use of the Software or its derivative works to develop or distribute software for use in systems with processors other than Ascend processors. +3.1 You are not authorized to, and shall not use, modify, or distribute this Software or its derivative works for any other purposes than those expressly permitted by this Agreement. You shall not make any use of the Software or its derivative works to develop or distribute any software for use in systems with processors other than Huawei AI processors. All rights not expressly granted herein are expressly reserved by Huawei. 
3.2 You are not authorized to, and shall not remove, obscure, or alter any copyright or other notices in this Software or any part of it. -3.3 Distribution Restrictions. -You may distribute the Software or its derivative works in any medium, whether in source or executable forms, provided that you comply with the purpose restriction stipulated in Section 2, provide recipients with a copy of this Agreement, and retain all notices in the Software. +3.3 Distribution Restrictions +You may distribute the Software or its derivative works in any medium, whether in source or executable forms, for the purpose stipulated in Section 2; provided that you provide recipients with a copy of this Agreement, and retain all notices in the Software. 4. Disclaimer of Warranty and Limitation of Liability -THE SOFTWARE IS PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL HUAWEI OR ANY OTHER COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE, IN WHOLE OR IN PART, NO MATTER HOW IT’S CAUSED OR THE LEGAL THEORY IT IS BASED ON, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. +THE SOFTWARE IS PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL HUAWEI OR ANY OTHER COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE, IN WHOLE OR IN PART, NO MATTER HOW IT IS CAUSED OR THE LEGAL THEORY IT IS BASED ON, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 5. Termination 5.1 This Agreement will continue to apply until terminated by either you or Huawei as described below: a.You may terminate this Agreement by ceasing your use of the Software; b. 
Huawei may at any time, terminate this Agreement if: (i) you fail to comply with any term of this Agreement; or (ii) you directly or indirectly initiate any legal proceeding against any individual or entity by alleging that the Software or any part of it infringes your intellectual property rights. - -5.2 By termination, all the rights granted to you under this Agreement are terminated, and you shall cease to use and delete this Software or its derivative works immediately. +5.2 By termination, all the rights granted to you under this Agreement are terminated, and you shall cease to use and delete this Software or any derivative works immediately. All rights granted to you under this Agreement shall hereby be void ab initio in the event of termination in accordance with Section 5.1. b above. Huawei reserves the right to pursue any and all legal remedies available to enforce the terms and conditions of this Agreement or to protect Huawei’s intellectual property rights for such breach or violation. All provisions shall survive the termination of this Agreement except for Section 2 and Section 3.3. 6. MISCELLANEOUS If the application of any provision of this Agreement to any particular facts or circumstances is held to be invalid or unenforceable by a court of competent jurisdiction, then (a) the validity and enforceability of such provision as applied to any other particular facts or circumstances and the validity of other provisions of this Agreement shall not in any way be affected or impaired thereby and (b) such provision shall be enforced to the maximum extent possible so as to affect the intent of the you and Huawei and reformed without further action by you and Huawei to the extent necessary to make such provision valid and enforceable. diff --git a/OAT.xml b/OAT.xml index 38b6baddd7fb5f29a2a178c077fa29fdd76a4853..5566e6d583c8cba5f83d6fe1b4ef35d994e69ad4 100644 --- a/OAT.xml +++ b/OAT.xml @@ -34,6 +34,8 @@
@@ -20,7 +20,7 @@ Atlas A2训练系列产品/Atlas 800I A2推理产品/A200I A2 Box 异构组件 ``` ├── docs // 文档介绍 ├── examples // ATVC使用样例 -├── include // ATVC提供的头文件集合,用户使用前需将其置入其他工程的包含路径下 +├── include // ATVC提供的头文件集合,用户使用前需将其置入其他工程的包含路径下 └── README.md // 综述 ``` [Developer Guide](./docs/02_developer_guide.md)给出了ATVC框架各模板与API的使用细节。 @@ -31,7 +31,7 @@ Atlas A2训练系列产品/Atlas 800I A2推理产品/A200I A2 Box 异构组件 ## 环境准备 -参考[ascendc-api-adv仓通用环境准备章节](../../README.md),完成源码下载和CANN软件包及相关依赖的安装。 +参考[ascendc-api-adv仓通用环境准备章节](../README.md),完成源码下载和CANN软件包及相关依赖的安装。 ## ATVC模板库算子调试方式 - ATVC是一个头文件集合,只需要包含头文件目录即可使用ATVC模板能力进行算子开发。 @@ -44,7 +44,7 @@ cd ./atvc/examples bash run_examples.sh {op_name} ``` - 支持上板运行打开profiling获取性能数据, 运行命令为:bash run_examples.sh {op_name} --run-mode=profiling。 -- 支持上板打印ATVC模板库提供的DFX信息,运行命令为:bash run_examples.sh {op_name} --run-mode=debug_print +- 支持上板打印ATVC模板库提供的DFX信息,运行命令为:bash run_examples.sh {op_name} --run-mode=debug_print。 ## 模板选择 ATVC支持的模板和数据类型如下: diff --git a/atvc/docs/02_developer_guide.md b/atvc/docs/02_developer_guide.md index 62ba53e69f8f4640a4c6d134709f8357f11ba9f8..1bfff9ba2a956bf766ba6398c8c3d81ce505d766 100644 --- a/atvc/docs/02_developer_guide.md +++ b/atvc/docs/02_developer_guide.md @@ -11,7 +11,7 @@ ATVC将Vector算子开发流程中的可定制化模块抽象出了Host层和Ker
# 2 公共数据结构 -我们将对ATVC核函数定义以及调用涉及的三个公共数据概念:算子原型的编译态参数`OpTraits`, Tiling计算的运行态参数`Param`, 模板策略的编译态参数`Policy` 分别进行介绍。 +我们将对ATVC核函数定义以及调用涉及的三个公共数据概念:算子原型的编译态参数`OpTraits`,Tiling计算的运行态参数`Param`,模板策略的编译态参数`Policy` 分别进行介绍。 ## 2.1 OpTraits ATVC框架参考C++模板元编程的`type_list`实现,推出了`ATVC::OpInputs`、`ATVC::OpOutputs`、`ATVC::OpTemps`的模板结构体分别用于描述算子的计算输入、计算输出、计算过程的临时资源,支持C++基础类型作为不定长模板参数传入。它们三者组成了整个ATVC框架编译态参数`OpTraits`。`ATVC::OpTraits`的完整数据定义如下:
@@ -56,7 +56,7 @@ struct OpTraits { // Add算子计算原型 : c = a + b using AddInputs = ATVC::OpInputs; // Add对应两个输入,类型均为float using AddOutputs = ATVC::OpOutputs; // Add有一个输出,类型为float -using AddTemps = ATVC::OpTemps<>; // 运算过程中不需要临时buffer保存中间结果,模板参数为空 +using AddTemps = ATVC::OpTemps<>; // 运算过程中不需要临时buffer保存中间结果,模板参数为空 using AddOpTraits = ATVC::OpTraits; // Add算子的计算原型描述 ``` @@ -81,7 +81,7 @@ EleWiseKernel<<>>(x, y, z, paramDevi ``` ## 2.3 Policy -编译态参数`Policy`(`ATVC::ReducePolicy`, `ATVC::BroadcastPolicy`)是ATVC框架里Kernel层对部分算子模板的拓展描述,它对应算子模板类在不同场景的实例化实现。它由Tiling API计算出,并在策略分派API(`ATVC::Host::ReduceAdapter`)里将运行态的Policy结果转化为模板参数并调用该场景下的最佳模板实现来完成高效的数据计算。
+编译态参数`Policy`(`ATVC::ReducePolicy`, `ATVC::BroadcastPolicy`)是ATVC框架里Kernel层对部分算子模板的拓展描述,它对应算子模板类在不同场景的实例化实现。它由Tiling API计算出,并在策略分派API(`ATVC::Host::ReduceAdapter`)里将运行态的Policy结果转化为模板参数并调用该场景下的最佳模板实现来完成高效的数据计算。
以下为Reduce算子开发场景中`ATVC::ReducePolicy`参与计算的伪代码,详细过程请参考[3.2.2.3 Policy与Param的计算与传递](#3223-policy与param的计算与传递): ```cpp @@ -97,10 +97,10 @@ if (policy.patternId == 1 && policy.loopCnt == 2 && policy.loopInnerCnt == 3) { ReduceKernel<<<...>>>(...); } -// 自定义的ReduceKernel核函数内部调用了ReduceOpTemplate算子模板类, 该模板类内部实现了Policy对应的各种计算场景 +// 自定义的ReduceKernel核函数内部调用了ReduceOpTemplate算子模板类,该模板类内部实现了Policy对应的各种计算场景 template __global__ __aicore__ ReduceKernel(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam param) { - auto op = ATVC::Kernel::ReduceOpTemplate(); // 实例化算子Kernel模板, Policy作为模板参数传入 + auto op = ATVC::Kernel::ReduceOpTemplate(); // 实例化算子Kernel模板,Policy作为模板参数传入 op.Run(x, y, ¶m); // param作为运行态参数传入 } ``` @@ -157,7 +157,7 @@ __global__ __aicore__ void SinhCustom(GM_ADDR x, GM_ADDR y, ATVC::EleWiseParam p op.Run(x, y, ¶m); // 按照输入、输出、param的顺序传入Run函数中;OpTraits内部的ATVC::OpTemps将由EleWiseOpTemplate内部申请资源,开发无需关注 } ... -// 调用自定义的Kernel API, <<<>>>的BlockNum参数可通过param的TilingData获取 +// 调用自定义的Kernel API,<<<>>>的BlockNum参数可通过param的TilingData获取 SinhCustom<<>>(xDevice, yDevice, param); ... ``` @@ -185,9 +185,9 @@ struct SinhComputeFunc { }; ``` 计算模板类将在数据计算阶段被算子模板调用,因此计算模板类定义必须遵从以下约束: -1. 该模板类在实例化时固定传入`ATVC::OpTraits`类型的结构体作为模板参数,如 `ATVC::OpTraits,ATVC::OpOutputs, ATVC::OpTemps>`。 +1. 该模板类在实例化时固定传入`ATVC::OpTraits`类型的结构体作为模板参数,如 `ATVC::OpTraits,ATVC::OpOutputs, ATVC::OpTemps>`。 2. 开发必须完成公有仿函数`__aicore__ inline void operator()`的重载。`ATVC::Kernel::EleWiseOpTemplate`将在计算阶段调用仿函数完成计算。 -3. 开发定义的`operator()`仿函数的输入参数类型支持`AscendC::LocalTensor`以及C++其他基础数据类型。形式参数需按照`ATVC::OpInputs<>`,`ATVC::OpOutputs<>`, `ATVC::OpTemps<>`声明的顺序填入,其他标量参数放在最后,根据用户计算场景按需传入。 +3. 
开发定义的`operator()`仿函数的输入参数类型支持`AscendC::LocalTensor`以及C++其他基础数据类型。形式参数需按照`ATVC::OpInputs<>`,`ATVC::OpOutputs<>`, `ATVC::OpTemps<>`声明的顺序填入,其他标量参数放在最后,根据用户计算场景按需传入。 #### 3.1.2.2 实例化模板 @@ -270,7 +270,7 @@ if (!ATVC::Host::CalcEleWiseTiling(eleNum, param)) { ### 3.1.3 Elementwise模板说明 `ATVC::Kernel::EleWiseOpTemplate`为ATVC框架提供的内置Elementwise基本算子类,它实现了一套算子数据的搬运搬出、资源分配和释放的算子流程。它需要计算模板类作为模板参数传入来完成实例化。核函数通过调用它完成整套计算逻辑:1. 资源初始化; 2.将数据从GM搬运至UB; 3.按`OpTraits`的输入、输出、临时资源描述、其他标量的顺序传入计算模板类的仿函数完成数据的基块计算; 4.将结果从UB搬出至GM。 -下方为`ATVC::Kernel::EleWiseOpTemplate`模板类的外部接口介绍,完整模板类定义请参考[`elewise_op_template头文件`](../include/elewise/elewise_op_template.h)。 +下方为`ATVC::Kernel::EleWiseOpTemplate`模板类的外部接口介绍,完整模板类定义请参考[elewise_op_template头文件](../include/elewise/elewise_op_template.h)。 ```cpp /*! * \brief EleWiseOpTemplate provides templates for element level operations on tensors, @@ -310,10 +310,10 @@ using SinhOpTraits = ATVC::OpTraits, ATVC::OpOutputs> elewiseTemplate; -// 调用EleWiseOpTemplate的Run接口传入输入x,输出y,Host::CalcEleWiseTiling API的输出param -elewiseTemplate.Run(x, y, ¶m); + // 将计算模板类模板定义作为模板参数传入 + ATVC::Kernel::EleWiseOpTemplate> elewiseTemplate; + // 调用EleWiseOpTemplate的Run接口传入输入x,输出y,Host::CalcEleWiseTiling API的输出param + elewiseTemplate.Run(x, y, ¶m); } ``` @@ -321,7 +321,7 @@ elewiseTemplate.Run(x, y, ¶m); ### 3.1.4 切分策略算法说明 下方为`ATVC::Host::CalcEleWiseTiling`函数内部计算Tiling参数的步骤,详细代码请参考[EleWise Tiling 算法](../include/elewise/elewise_host.h): -- 计算`blockNum`:计算`blockNum` = 总的元素量(`totalCnt`) / 单核数据量基线(`singleCoreBaseLine`), `blockNum`最小值为1, 最大值为平台提供的最大`vectorCore`值。 +- 计算`blockNum`:计算`blockNum` = 总的元素量(`totalCnt`) / 单核数据量基线(`singleCoreBaseLine`),`blockNum`最小值为1, 最大值为平台提供的最大`vectorCore`值。 - 计算达到UB上限的单核单输入元素个数值`ubLimitCnt`:UB上限内存大小 / 所有输入输出及temp单个元素的内存之和。 - 计算`tiledCnt`: - 计算每个核需要处理的数据元素量`avgElePerBlock = totalCnt / blockNum`。 @@ -504,7 +504,7 @@ public: ``` Reduce计算模板类将在数据计算阶段被`ReduceOpTemplate`算子模板调用,因此Reduce计算模板类的实现必须遵从以下约束: -- 
该模板类在实例化时固定传入`ATVC::OpTraits`类型的结构体作为模板参数,如`ATVC::OpTraits,ATVC::OpOutputs`。 +- 该模板类在实例化时固定传入`ATVC::OpTraits`类型的结构体作为模板参数,如`ATVC::OpTraits,ATVC::OpOutputs`。 - 开发必须完成以下公有API的内部实现: 1. 计算单数据基块的Reduce结果 `__aicore__ inline void Compute(...)`。 2. 计算单UB内不同数据基块的计算结果 `__aicore__ inline void UpdateCache(...)`。 @@ -567,7 +567,7 @@ struct ReduceTilingData { struct ReduceParam { uint64_t workspaceAddr; // device侧的申请空间地址 - uint32_t workspaceSize = 0; // tiling侧需要申请的工作空间大小,单位比特 + uint32_t workspaceSize = 0; // tiling侧需要申请的工作空间大小,单位比特 ReduceTilingData tilingData; // 影响数据搬运的相关参数 uint32_t nBufferNum = 2; // 每个Queue中的Tensor数量 }; @@ -639,7 +639,7 @@ void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::Red } else if (policy == ATVC::REDUCE_POLICY3) { ReduceCustom<<>>(x, y, param); } else { - printf("[ERROR] Cannot find any matched policy.\n"); + printf("Cannot find any matched policy.\n"); } // 流同步后释放申请的param内存 CHECK_ACL(aclrtSynchronizeStream(stream)); @@ -821,12 +821,12 @@ Broadcast计算模板类将在数据计算阶段被`BroadcastOpTemplate`算子 * `inputOffset`: 表示本次compute需要计算的一段数据段位于`src`的起始位置偏移量。 * `dst`: 是存放计算后的UB上的输出数据。 * `dimA`: 表示此次计算需要处理的输入数据长度。 - * `dimB`: 表示此次计算需要扩充的长度,输入为dimA, 输出数据量为dimA*dimB。 + * `dimB`: 表示此次计算需要扩充的长度,输入为dimA,输出数据量为dimA*dimB。 - `Compute`需要完成AB和BA两种场景的计算: * AB场景的计算:输入`src`是一个Shape为(dimA, 1)的Tensor,需要将数据扩充到`dst`上,`dst`的Shape是(dimA, dimB)。 * BA场景的计算:输入`src`是一个Shape为(1, dimA)的Tensor,需要将src数据扩充到`dst`上,`dst`的Shape是(dimB, dimA)。 -- 该模板类在实例化时固定传入`ATVC::OpTraits`类型的结构体作为模板参数,如`ATVC::OpTraits,ATVC::OpOutputs`。 +- 该模板类在实例化时固定传入`ATVC::OpTraits`类型的结构体作为模板参数,如`ATVC::OpTraits,ATVC::OpOutputs`。 #### 3.3.2.2 实例化模板 在ATVC提供了Broadcast内部实现后,用户需要定义封装核函数接口。核函数内部通过`ATVC::Kernel::BroadcastOpTemplate`完成模板实例化。 @@ -857,7 +857,7 @@ __global__ __aicore__ void BroadcastCustom(GM_ADDR x, GM_ADDR y, ATVC::Broadcast Broadcast算子开发场景下,核函数定义必须遵从以下约束: 1. 核函数须预留一个GM_ADDR类型的形参用于传入`ATVC::BroadcastParam`运行态参数。 2. 核函数须加入`KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);`这段代码显示标注算子类型。 -3. 
核函数须实例化`ATVC::Kernel::BroadcastOpTemplate`变量,实例化时需传入对应的计算实现模板类`ATVC::BroadcastCompute`,并调用它的`Run(GM_ADDR x, GM_ADDR y, ATVC::BroadcastParam* broadcastParam)`接口来实现数据的调度运算。 +3. 核函数须实例化`ATVC::Kernel::BroadcastOpTemplate`变量,实例化时需传入对应的计算实现模板类`ATVC::BroadcastCompute`,并调用它的`Run(GM_ADDR x, GM_ADDR y, ATVC::BroadcastParam* broadcastParam)`接口来实现数据的调度运算。 #### 3.3.2.3 Policy与Param的计算与传递 ##### 3.3.2.3.1 CalcBroadcastTiling @@ -887,7 +887,7 @@ struct BroadcastOpTilingData { struct BroadcastParam { uint64_t workspaceAddr; // device侧的申请空间地址 - uint32_t workspaceSize = 0; // tiling计算出来的kernel计算所需的工作空间大小,单位比特 + uint32_t workspaceSize = 0; // tiling计算出来的kernel计算所需的工作空间大小,单位比特 BroadcastOpTilingData tilingData; // 数据切分策略的相关参数 int32_t nBufferNum = 2; // 并行流水数 }; @@ -917,7 +917,7 @@ bool CalcBroadcastTiling(std::vector shapeIn, BroadcastTilingInputParam opInput = {shapeIn, shapeOut, inputDtype}; OpTiling::BroadcastOpTiling tiling(opInput, policy, param); if(!tiling.Run()) { - printf("[ERROR] Tiling Error\n"); + printf("Tiling Error\n"); return false; } return true; @@ -967,7 +967,7 @@ void BroadcastOpAdapter(uint8_t* x, uint8_t* y, ATVC::BroadcastParam ¶m, ATV }else if (policy == ATVC::BROADCAST_POLICY1) { BroadcastCustom<<>>(x, y, paramDevice); } else { - printf("[ERROR] Cannot find any matched policy.\n"); + printf("Cannot find any matched policy.\n"); } // 流同步后释放申请的param内存 CHECK_ACL(aclrtSynchronizeStream(stream)); @@ -1006,7 +1006,7 @@ struct BroadcastPolicy { }; ``` -下方为`ATVC::Kernel::BroadcastOpTemplate`模板类的外部接口介绍,完整模板类定义请参考[`broadcast_op_template.h`](../include/broadcast/broadcast_op_template.h)。 +下方为`ATVC::Kernel::BroadcastOpTemplate`模板类的外部接口介绍,完整模板类定义请参考[broadcast_op_template.h](../include/broadcast/broadcast_op_template.h)。 ```cpp #ifndef ATVC_BROADCAST_OP_TEMPLATE_H #define ATVC_BROADCAST_OP_TEMPLATE_H @@ -1086,10 +1086,10 @@ ATVC框架支持Broadcast与Elementwise组合的算子通过扩展BroadcastOpTem 4.定义Kernel层算子入口API, 内部实例化计算模板类。 -下面将以Add算子(Broadcast + Elementwise, 
为区别Add单算字,命名为AddWithBroadcast算子)搭建为样例,按照组装顺序介绍组合算子类的开发流程。 +下面将以Add算子(Broadcast + Elementwise,为区别Add单算字,命名为AddWithBroadcast算子)搭建为样例,按照组装顺序介绍组合算子类的开发流程。 ### 3.4.2 组合算子开发步骤 -下面是用户利用Components模板实现自定义算子所需要实现的关键步骤,完整样例见[add_with_broadcast](..\examples\add_with_broadcast\add_with_broadcast.cpp) : +下面是用户利用Components模板实现自定义算子所需要实现的关键步骤,完整样例见[add_with_broadcast](../examples/add_with_broadcast/add_with_broadcast.cpp) : ```cpp // AddWithBroadcast算子的描述:两个输入,一个输出,类型均为float using BroadcastOpTraits = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; @@ -1238,7 +1238,7 @@ ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, & `BroadcastAdapter`的介绍可参考章节[3.3.2.3.2](#33232-broadcastopadapter)。 ### 3.4.3 BroadcastOpTemplate模板说明 -Broadcast与Elementwise组合的算子模板以BroadcastOpTemplate为基础进行扩展,`BroadcastOpTemplate`的介绍可以参考章节[3.3.3](#333-broadcast模板说明)。下面为组合算子场景`ATVC::Kernel::BroadcastOpTemplate`新引入的接口或定义,以及调用计算模板函数的示意代码,完整模板定义请参考[`broadcast_op_template.h`](../include/broadcast/broadcast_op_template.h): +Broadcast与Elementwise组合的算子模板以BroadcastOpTemplate为基础进行扩展,`BroadcastOpTemplate`的介绍可以参考章节[3.3.3](#333-broadcast模板说明)。下面为组合算子场景`ATVC::Kernel::BroadcastOpTemplate`新引入的接口或定义,以及调用计算模板函数的示意代码,完整模板定义请参考[broadcast_op_template.h](../include/broadcast/broadcast_op_template.h): ```cpp /*! @@ -1486,7 +1486,7 @@ ATVC::Host::CalcReduceTiling(shapeIn, dim, &policy, ¶m); `ReduceOpAdapter`的介绍可参考章节[3.2.2.3.2](#32232-reduceopadapter)。 ### 3.5.3 ReduceOpTemplate模板说明 -Reduce与Elementwise组合的算子模板以ReduceOpTemplate为基础进行扩展,`ReduceOpTemplate`的介绍可以参考章节[3.2.3](#323-reduce模板说明)。下面为组合算子场景`ATVC::Kernel::ReduceOpTemplate`新引入的接口或定义,以及调用计算模板函数的示意代码,完整模板定义请参考[`reduce_op_template.h`](../include/reduce/reduce_op_template.h)): +Reduce与Elementwise组合的算子模板以ReduceOpTemplate为基础进行扩展,`ReduceOpTemplate`的介绍可以参考章节[3.2.3](#323-reduce模板说明)。下面为组合算子场景`ATVC::Kernel::ReduceOpTemplate`新引入的接口或定义,以及调用计算模板函数的示意代码,完整模板定义请参考[reduce_op_template.h](../include/reduce/reduce_op_template.h): ```cpp /*! 
@@ -1561,7 +1561,7 @@ public: # 4 ATVC的调试调优功能 为了用户在使用ATVC进行算子开发时能快速进行精度调试和性能调优,ATVC支持多种调试调优能力。 ## 4.1 OpTraits校验接口 -用户可通过`DebugCheck()`接口校验不同模板的OpTraits功能, 接口在Host侧调用,无需额外的开关限制,接口定义如下: +用户可通过`DebugCheck()`接口校验不同模板的OpTraits功能, 接口在Host侧调用,无需额外的开关限制,接口定义如下: ```cpp namespace ATVC { namespace Host { @@ -1571,7 +1571,7 @@ bool DebugCheck() } ``` -其中,模板参数`OpTraits`是用户定义的待校验的输入输出描述信息, 模板参数`templateType`是校验规则分类的标识, 定义如下: +其中,模板参数`OpTraits`是用户定义的待校验的输入输出描述信息,模板参数`templateType`是校验规则分类的标识, 定义如下: ```cpp enum class TemplateType { ELE_WISE, // ElementWise模板的校验类型 @@ -1583,8 +1583,8 @@ DebugCheck主要校验项如下: | 模板类型 | OpTraits校验项 | | ----------- | --------------- | | ELE_WISE | 输入输出非空 | -| REDUCE | 输入输出个数均为1, 输入输出数据类型相同 | -| BROADCAST | 输入输出个数均为1, 输入输出数据类型相同 | +| REDUCE | 输入输出个数均为1,输入输出数据类型相同 | +| BROADCAST | 输入输出个数均为1,输入输出数据类型相同 | 接口使用示例: ```cpp @@ -1612,8 +1612,8 @@ DFX信息打印格式按照 [日志级别(`ERROR`/`INFO`)]:[`ATVC`][`Module`] ( namespace ATVC { namespace Kernel { template -__aicore__ inline void DebugPrintf(__gm__ const char* fmt, Args&&... args); -} + __aicore__ inline void DebugPrintf(__gm__ const char* fmt, Args&&... 
args); + } } // 调用示例 ATVC::Kernel::DebugPrintf("[ERROR]: [ATVC][EleWise] Input Count can not be 0!\n"); @@ -1636,7 +1636,7 @@ TanhGrad<<>>(dyDevice, yDevice, zDevice, ## 4.3 Tiling超参调优 ### 4.3.1 Elementwise模板算子Tiling超参调优 #### 4.3.1.1 Elementwise模板通用Tiling算法 -- 计算`blockNum`:计算`blockNum` = 总的元素量(`totalCnt`) / 单核数据量基线(`singleCoreBaseLine`), `blockNum`最小值为1, 最大值为平台提供的最大`vectorCore`值。 +- 计算`blockNum`:计算`blockNum` = 总的元素量(`totalCnt`) / 单核数据量基线(`singleCoreBaseLine`), `blockNum`最小值为1, 最大值为平台提供的最大`vectorCore`值。 - 计算达到UB上限的单核单输入元素个数值`ubLimitCnt`:UB上限内存大小 / 所有输入输出及temp单个元素的内存之和。 - 计算`tiledCnt`: - 计算每个核需要处理的数据元素量`avgElePerBlock = totalCnt / blockNum`。 @@ -1704,7 +1704,7 @@ bool CalcEleWiseTiling(int32_t totalCnt, ATVC::EleWiseParam ¶m, ATVC::Host::EleWiseTilingHyperParam hyperParam; hyperParam.singleCoreBaseLine = 1024; if (!ATVC::Host::CalcEleWiseTiling(eleNum, param, hyperParam=hyperParam)) { - printf("[ERROR]: Calculate eleWise tiling failed.\n"); + printf("Calculate eleWise tiling failed.\n"); return -1; }; ``` @@ -1772,4 +1772,4 @@ bool CalcReduceTiling(std::vector inputShape, } } ``` -其中,可选参数`hyperParam`在未传入用户自定义超参时,使用`ReduceTilingHyperParam`的默认值,若用户需要修改某个超参,可自定义`ReduceTilingHyperParam`后传入。 +其中,可选参数`hyperParam`在未传入用户自定义超参时,使用`ReduceTilingHyperParam`的默认值,若用户需要修改某个超参,可自定义`ReduceTilingHyperParam`后传入。 diff --git a/atvc/examples/README.md b/atvc/examples/README.md index ab2d58af4b5793a3f8b62f2aecb201154dff5b3d..aa5c513aa81ae5bed49ddebc7b2cb5cb59675e1c 100644 --- a/atvc/examples/README.md +++ b/atvc/examples/README.md @@ -1,12 +1,12 @@ # 样例介绍 | 样例名 | 描述 | 模板 | 算子调用方式| | ------------------------------------------------------------ | ------------------------------------------------------------- | --------------- | ------------------ | -| [add](./add/add.cpp) | 使用ATVC的Elementwise模板实现Add算子以及调用样例 | Elementwise | Kernel直调 | -| [sinh_custom](./sinh_custom/sinh_custom.cpp) | 临时Tensor参与计算的自定义Elementwise类算子以及调用样例 | Elementwise | Kernel直调 | -| 
[add_with_scalar](./add_with_scalar/add_with_scalar.cpp) | 输入带标量的自定义Elementwise类算子以及调用样例 | Elementwise | Kernel直调 | -| [reduce_sum](./reduce_sum/reduce_sum.cpp) | 使用ATVC的Reduce模板实现自定义ReduceSum算子以及调用样例 | Reduce | Kernel直调 | -| [broadcast_to](./broadcast_to/broadcast_to.cpp) | 使用ATVC的Broadcast模板实现自定义BroadcastTo算子以及调用样例 | Broadcast | Kernel直调 | -| [tanh_grad](./tanh_grad/tanh_grad.cpp) | 使用Tiling超参进行算子性能调优的ElementWise类算子调用样例 | Elementwise | Kernel直调 | +| [add](./add) | 使用ATVC的Elementwise模板实现Add算子以及调用样例 | Elementwise | Kernel直调 | +| [sinh_custom](./sinh_custom) | 临时Tensor参与计算的自定义Elementwise类算子以及调用样例 | Elementwise | Kernel直调 | +| [add_with_scalar](./add_with_scalar) | 输入带标量的自定义Elementwise类算子以及调用样例 | Elementwise | Kernel直调 | +| [reduce_sum](./reduce_sum) | 使用ATVC的Reduce模板实现自定义ReduceSum算子以及调用样例 | Reduce | Kernel直调 | +| [broadcast_to](./broadcast_to) | 使用ATVC的Broadcast模板实现自定义BroadcastTo算子以及调用样例 | Broadcast | Kernel直调 | +| [tanh_grad](./tanh_grad) | 使用Tiling超参进行算子性能调优的ElementWise类算子调用样例 | Elementwise | Kernel直调 | | [ops_aclnn](./ops_aclnn) | 使用ATVC基于自定义工程算子的实现以及调用样例 | | 单算子API调用 | | [ops_pytorch](./ops_pytorch) | 使用ATVC开发自定义算子,并实现从[PyTorch](https://gitee.com/ascend/pytorch)框架调用的样例 | | PyTorch框架调用 | | [add_with_broadcast](./add_with_broadcast) |使用ATVC的Elementwise和Broadcast组合模板实现Add算子以及调用样例 | Broadcast | Kernel直调 | diff --git a/atvc/examples/add/add.cpp b/atvc/examples/add/add.cpp index 1bb6c217172d1d7683fd23ae660f818b5e051914..18299e728b00a5d2a7525d6e8f4ba8943820dc67 100644 --- a/atvc/examples/add/add.cpp +++ b/atvc/examples/add/add.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. 
You may not use this file except in compliance with the License. @@ -81,7 +82,7 @@ __global__ __aicore__ void AddCustom(GM_ADDR a, GM_ADDR b, GM_ADDR c, ATVC::EleW int main() { if (!ATVC::Host::DebugCheck()) { - printf("[ERROR]: Element wise OpTraits check failed.\n"); + printf("Element wise OpTraits check failed.\n"); return -1; } // totalCnt描述EleWise单输入的元素个数 @@ -101,7 +102,7 @@ int main() ATVC::EleWiseParam param; if (!ATVC::Host::CalcEleWiseTiling(eleNum, param)) { - printf("[ERROR]: Calculate Element wise tiling Failed.\n"); + printf("Calculate Element wise tiling Failed.\n"); return -1; }; auto elementParamSize = sizeof(param); @@ -133,6 +134,6 @@ int main() if (!VerifyResults(golden, outputZ)) { return -1; } - printf("[INFO]: Accuracy verification passed.\n"); + printf("Accuracy verification passed.\n"); return 0; } \ No newline at end of file diff --git a/atvc/examples/add_with_broadcast/add_with_broadcast.cpp b/atvc/examples/add_with_broadcast/add_with_broadcast.cpp index 75e8a246b584687d1c38091db6e11df0eb568438..10342668ef4f19eee775b3371d958629ced33d42 100644 --- a/atvc/examples/add_with_broadcast/add_with_broadcast.cpp +++ b/atvc/examples/add_with_broadcast/add_with_broadcast.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
@@ -39,7 +40,7 @@ void BroadcastOpAdapter(uint8_t* x, uint8_t* y, uint8_t* z, ATVC::BroadcastParam }else if (policy == ATVC::BROADCAST_POLICY1) { AddWithBroadcastCustom<<>>(x, y, z, param); } else { - printf("[ERROR] Cannot find any matched policy.\n"); + printf("Cannot find any matched policy.\n"); } // 流同步后释放申请的param内存 CHECK_ACL(aclrtSynchronizeStream(stream)); diff --git a/atvc/examples/add_with_broadcast/add_with_broadcast.h b/atvc/examples/add_with_broadcast/add_with_broadcast.h index 3613ce33d499d348933427768e4daf6d21b32e24..078dabdc25fce052bc5b6f3f8e2f50beb35a7fa2 100644 --- a/atvc/examples/add_with_broadcast/add_with_broadcast.h +++ b/atvc/examples/add_with_broadcast/add_with_broadcast.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/examples/add_with_broadcast/post_compute_add_of_broadcast.h b/atvc/examples/add_with_broadcast/post_compute_add_of_broadcast.h index 4369ed0ce2f37ba1208ae61ad9e1f12115f8136f..df2d6e18f937a3c6888b6671f9b29c859649ca77 100644 --- a/atvc/examples/add_with_broadcast/post_compute_add_of_broadcast.h +++ b/atvc/examples/add_with_broadcast/post_compute_add_of_broadcast.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. 
You may not use this file except in compliance with the License. diff --git a/atvc/examples/add_with_scalar/add_with_scalar.cpp b/atvc/examples/add_with_scalar/add_with_scalar.cpp index 1245001e90c3fd70c7d740a432ee46b6573e77a7..41ce26a522d7edb9cde4a38ca432cf0144e15297 100644 --- a/atvc/examples/add_with_scalar/add_with_scalar.cpp +++ b/atvc/examples/add_with_scalar/add_with_scalar.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/examples/broadcast_to/broadcast_to.cpp b/atvc/examples/broadcast_to/broadcast_to.cpp index e4460791a3c00ab83f4f071c2407fd65a66caf35..e4f92f55aadceaf40d4d92e7c3c9d5593b7ea9bc 100644 --- a/atvc/examples/broadcast_to/broadcast_to.cpp +++ b/atvc/examples/broadcast_to/broadcast_to.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
@@ -60,7 +61,7 @@ void BroadcastOpAdapter(uint8_t* x, uint8_t* y, ATVC::BroadcastParam ¶m, ATV }else if (policy == ATVC::BROADCAST_POLICY1) { BroadcastCustom<<>>(x, y, param); } else { - printf("[ERROR]: Cannot find any matched policy.\n"); + printf("Cannot find any matched policy.\n"); } // 流同步后释放申请的param内存 CHECK_ACL(aclrtSynchronizeStream(stream)); @@ -71,7 +72,7 @@ void BroadcastOpAdapter(uint8_t* x, uint8_t* y, ATVC::BroadcastParam ¶m, ATV int32_t main(int32_t argc, char* argv[]) { if (!ATVC::Host::DebugCheck()) { - printf("[ERROR]: Broadcast opTraits check failed.\n"); + printf("Broadcast opTraits check failed.\n"); return -1; } int32_t eleNum = 1 * 1024; diff --git a/atvc/examples/common/example_common.h b/atvc/examples/common/example_common.h index 5c4b6399d405936c292d7d1f4de00047aed4e4ce..fae0bdbe2b01812a8c3d1e1dbad0ef223c5e9d8a 100644 --- a/atvc/examples/common/example_common.h +++ b/atvc/examples/common/example_common.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -35,7 +36,7 @@ bool VerifyResults(const std::vector &golden, const std::vector &o { for (int32_t i = 0; i < golden.size(); i++) { if (!IsClose(golden[i], output[i])) { - printf("[ERROR]: Accuracy verification failed! The expected value of element " + printf("Accuracy verification failed! 
The expected value of element " "in index [%d] is %f, but actual value is %f.\n", i, golden[i], diff --git a/atvc/examples/ops_aclnn/README.md b/atvc/examples/ops_aclnn/README.md index 79c609e6d8551f87bf5d6f769b1761fd0c386276..1aa569d725b57e54f76396023f1f15ac482f3fbc 100644 --- a/atvc/examples/ops_aclnn/README.md +++ b/atvc/examples/ops_aclnn/README.md @@ -58,9 +58,9 @@ ops_aclnn/ 将[func.cmake](./add/AddCustom/cmake/func.cmake)、[intf.cmake](./add/AddCustom/cmake/intf.cmake)、host侧的[CMakeLists.txt](./add/AddCustom/op_host/CMakeLists.txt)和kernel侧的[CMakeLists.txt](./add/AddCustom/op_kernel/CMakeLists.txt)分别复制到`步骤1`生成的工程文件的对应目录下。 - 2.2 修改对应的host文件 - - 引入对应的头文件,修改对应TilingFunc函数中tiling的生成,根据算子类型调用不同的tiling生成策略,更多ATVC的用法可参考atvc的[开发指南](../../docs/02_developer_guide.md)。 + - 引入对应的头文件,修改对应TilingFunc函数中tiling的生成,根据算子类型调用不同的tiling生成策略,更多ATVC的用法可参考atvc的[开发指南](../../docs/02_developer_guide.md)。 - elewise类,参考[add_custom.cpp](./add/AddCustom/op_host/add_custom.cpp) + Elementwise类,参考[add_custom.cpp](./add/AddCustom/op_host/add_custom.cpp) ```cpp // 引入头文件 #include "elewise/elewise_host.h" @@ -90,7 +90,7 @@ ops_aclnn/ ... 
``` - broadcast类 + Broadcast类 ```cpp #include "broadcast/broadcast_host.h" // 定义算子描述 @@ -117,7 +117,7 @@ ops_aclnn/ } else if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_INT32) { (void)ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, tiling); } - // 根据不同的policy设置不同的tilingkey,在kernel侧根据不同的tilingkey进行调用不同的算子模版 + // 根据不同的policy设置不同的tilingkey,在kernel侧根据不同的tilingkey进行调用不同的算子模版 if (policy == ATVC::BROADCAST_POLICY0) { context->SetTilingKey(0); } else if (policy == ATVC::BROADCAST_POLICY1) { @@ -129,7 +129,7 @@ ops_aclnn/ currentWorkspace[0] = 0; ``` - reduce_sum类,参考[reduce_sum_custom.cpp](./reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp) + ReduceSum类,参考[reduce_sum_custom.cpp](./reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp) ```cpp // 引入头文件 #include "reduce/reduce_host.h" @@ -169,7 +169,7 @@ ops_aclnn/ 用户需要通过AscendC API来搭建Add算子的核心计算逻辑,在ATVC框架中,这类算子的核心计算逻辑是通过定义一个结构体的仿函数来实现。它需要`ATVC::OpTraits`作为固定模板参数,并重载`operator()`来被提供的Kernel层算子模板类调用,更多ATVC的用法可参考atvc的[开发指南](../../docs/02_developer_guide.md)。 - elewise类[add_custom.cpp](./add/AddCustom/op_kernel/add_custom.cpp) + Elementwise类[add_custom.cpp](./add/AddCustom/op_kernel/add_custom.cpp) ```cpp // 头文件引入 #include "elewise/elewise_device.h" @@ -192,7 +192,7 @@ ops_aclnn/ template // 重载operator,提供给算子模板类调用 __aicore__ inline void operator()(AscendC::LocalTensor x, AscendC::LocalTensor y, AscendC::LocalTensor z) { - AscendC::Add(z, x, y, z.GetSize()); // 开发调用AscendC Api自行实现计算逻辑, 通过z.GetSize()获取单次计算的元素数量 + AscendC::Add(z, x, y, z.GetSize()); // 开发调用AscendC Api自行实现计算逻辑,通过z.GetSize()获取单次计算的元素数量 } }; @@ -203,7 +203,7 @@ ops_aclnn/ auto op = ATVC::Kernel::EleWiseOpTemplate>(); op.Run(x, y, z, ¶m); ``` - broadcast类 + Broadcast类 ```cpp // 头文件引入 #include "broadcast/broadcast_device.h" @@ -224,7 +224,7 @@ ops_aclnn/ ... 
} ``` - reduce_sum类[reduce_sum_custom.cpp](./reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp) + ReduceSum类[reduce_sum_custom.cpp](./reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp) ```cpp // 头文件引入 #include "reduce/reduce_device.h" @@ -257,7 +257,7 @@ ops_aclnn/ ### 步骤4. 部署自定义算子包 - 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH + 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH。 ```bash echo $ASCEND_OPP_PATH diff --git a/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/README.md b/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/README.md index de9e6772fbde260270c8d215aa2fb516120693b7..166fca255756800a0a7bff18bc314489e4699fc4 100644 --- a/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/README.md +++ b/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/README.md @@ -39,8 +39,3 @@ ```bash bash run.sh ``` - -## 更新说明 -| 时间 | 更新事项 | -| ---------- | ------------ | -| 2025/07/22 | 新增本readme | \ No newline at end of file diff --git a/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/main.cpp b/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/main.cpp index 2eaea333b99126fb45ae6d7620b5ef9ab8e922c9..c21466f0aaf00b6ba873ab607e7fcd517c2ee564 100644 --- a/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/main.cpp +++ b/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/main.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
diff --git a/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/run.sh b/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/run.sh index 828e573b71417f45531925bacd608458b5e3c499..6eaa830fbfd40ffb42abbdd8466f3a702e863829 100644 --- a/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/run.sh +++ b/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/run.sh @@ -1,4 +1,5 @@ #!/bin/bash +# This program is free software, you can redistribute it and/or modify it. # Copyright (c) 2025 Huawei Technologies Co., Ltd. # This file is a part of the CANN Open Software. # Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). diff --git a/atvc/examples/ops_aclnn/add/AddCustom/op_host/add_custom.cpp b/atvc/examples/ops_aclnn/add/AddCustom/op_host/add_custom.cpp index c20ed6745c205df83e22552ccc761fca9a54e814..6c3ed6bc1795215d9262be1f100bf35321ae34b5 100644 --- a/atvc/examples/ops_aclnn/add/AddCustom/op_host/add_custom.cpp +++ b/atvc/examples/ops_aclnn/add/AddCustom/op_host/add_custom.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/examples/ops_aclnn/add/AddCustom/op_kernel/add_custom.cpp b/atvc/examples/ops_aclnn/add/AddCustom/op_kernel/add_custom.cpp index 74eb30ccb409fadd3455c23e9af918544bfa427c..e01ebc16b7168d2f7f5125081ad1c8e167397ebc 100644 --- a/atvc/examples/ops_aclnn/add/AddCustom/op_kernel/add_custom.cpp +++ b/atvc/examples/ops_aclnn/add/AddCustom/op_kernel/add_custom.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. 
+ * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/examples/ops_aclnn/add/README.md b/atvc/examples/ops_aclnn/add/README.md index 7574702dc8384e56d19481b34762e893bb358309..11bea4fcc7ea343118078e63caab36720661eece 100644 --- a/atvc/examples/ops_aclnn/add/README.md +++ b/atvc/examples/ops_aclnn/add/README.md @@ -57,18 +57,18 @@ CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通 编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 ### 2. 生成自定义算子工程,复制host和kernel实现并编译算子 - - 导入ATVC环境变量 + - 导入ATVC环境变量。 ```bash # 如果不导入,默认使用./atvc/include路径 export ATVC_PATH=${atvc}/include ``` - - 切换到msOpGen脚本install.sh所在目录 + - 切换到msOpGen脚本install.sh所在目录。 ```bash # 若开发者以git命令行方式clone了master分支代码,并切换目录 cd atvc/examples/ops_aclnn/add ``` - - 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子 + - 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子。 运行install.sh脚本 ```bash @@ -82,7 +82,7 @@ CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通 脚本运行成功后,会在当前目录下创建CustomOp目录,编译完成后,会在CustomOp/build_out中,生成自定义算子安装包custom_opp_\_\.run,例如“custom_opp_ubuntu_x86_64.run”。 ### 3. 部署自定义算子包 -- 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH +- 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH。 ```bash echo $ASCEND_OPP_PATH # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp @@ -92,9 +92,9 @@ CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通 # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash ``` 参数说明: - - ASCEND_INSTALL_PATH:CANN软件包安装路径,一般和上一步中指定的路径保持一致 + - ASCEND_INSTALL_PATH:CANN软件包安装路径,一般和上一步中指定的路径保持一致。 -- 在自定义算子安装包所在路径下,执行如下命令安装自定义算子包 +- 在自定义算子安装包所在路径下,执行如下命令安装自定义算子包。 ```bash cd CustomOp/build_out ./custom_opp__.run @@ -104,7 +104,3 @@ CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通 ### 4. 
调用执行算子工程 - [aclnn调用AddCustom算子工程](./AclNNInvocationNaive/README.md) -## 更新说明 -| 时间 | 更新事项 | -| ---------- | ---------------------------- | -| 2025/07/24 | 新增readme | diff --git a/atvc/examples/ops_aclnn/add/install.sh b/atvc/examples/ops_aclnn/add/install.sh index 69da0e25d69a7e8d750a3ef60ec1746c632c4593..c0f155309e1d5354069d8742fd62e142aefc5f35 100644 --- a/atvc/examples/ops_aclnn/add/install.sh +++ b/atvc/examples/ops_aclnn/add/install.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. +# This program is free software, you can redistribute it and/or modify it. +# Copyright (c) 2025 Huawei Technologies Co., Ltd. # This file is a part of the CANN Open Software. # Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). # Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/README.md b/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/README.md index 239ea433571ebe3f43e00a333dee6b8fe876f99c..9851b1cf389ead0a15eb9418ceda8a0abc832acf 100644 --- a/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/README.md +++ b/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/README.md @@ -39,8 +39,3 @@ ```bash bash run.sh ``` - -## 更新说明 -| 时间 | 更新事项 | -| ---------- | ------------ | -| 2025/07/22 | 新增本readme | \ No newline at end of file diff --git a/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/main.cpp b/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/main.cpp index 1dce25b668f2436077631338b228db5dcb6d7674..f43a219e907d0464d76fdbfa7cc95ed5473c8824 100644 --- a/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/main.cpp +++ b/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/main.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. 
+ * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/run.sh b/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/run.sh index 96b90851c2d4fcae579b8cd590308e895cbd9ce8..86543e401475be1ff2370823dc507a39bf3c6df1 100644 --- a/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/run.sh +++ b/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/run.sh @@ -1,4 +1,5 @@ #!/bin/bash +# This program is free software, you can redistribute it and/or modify it. # Copyright (c) 2025 Huawei Technologies Co., Ltd. # This file is a part of the CANN Open Software. # Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). diff --git a/atvc/examples/ops_aclnn/reduce_sum/README.md b/atvc/examples/ops_aclnn/reduce_sum/README.md index 54485bd665aa562b8e358c9114c2ad95949abccf..92bb9743a2fdb20c0ea8884cefe589942c5e89a0 100644 --- a/atvc/examples/ops_aclnn/reduce_sum/README.md +++ b/atvc/examples/ops_aclnn/reduce_sum/README.md @@ -99,8 +99,3 @@ CANN软件包中提供了工程创建工具msOpGen,ReduceSumCustom算子工程 ### 4. 
调用执行算子工程 - [aclnn调用ReduceSumCustom算子工程](./AclNNInvocationNaive/README.md) - -## 更新说明 -| 时间 | 更新事项 | -| ---------- | ---------------------------- | -| 2025/07/24 | 新增readme | diff --git a/atvc/examples/ops_aclnn/reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp b/atvc/examples/ops_aclnn/reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp index 0bfebe90c1fa1bc4ce7ef05d684573668e447b4e..573d0e24ef28bbc5a779b12f5ccda7a9dfbdb1e3 100644 --- a/atvc/examples/ops_aclnn/reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp +++ b/atvc/examples/ops_aclnn/reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/examples/ops_aclnn/reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp b/atvc/examples/ops_aclnn/reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp index 9bce6c8e75c9f933a2e44fede2403db84ed66889..2bd076fdbbecae6fde9af0e37423cee240fe0b09 100644 --- a/atvc/examples/ops_aclnn/reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp +++ b/atvc/examples/ops_aclnn/reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. 
You may not use this file except in compliance with the License. diff --git a/atvc/examples/ops_aclnn/reduce_sum/install.sh b/atvc/examples/ops_aclnn/reduce_sum/install.sh index f170159d2fe429531a6d78f522e49e7f8c5a1a4e..5e65a87adba36ea23f885d56fb7bf482b0b3e552 100644 --- a/atvc/examples/ops_aclnn/reduce_sum/install.sh +++ b/atvc/examples/ops_aclnn/reduce_sum/install.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. +# This program is free software, you can redistribute it and/or modify it. +# Copyright (c) 2025 Huawei Technologies Co., Ltd. # This file is a part of the CANN Open Software. # Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). # Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/examples/ops_pytorch/README.md b/atvc/examples/ops_pytorch/README.md index 2a71783eef06cddcae696a0725cc255a251aeb05..7226a947ed8074e4f1609bda8b4f2dbe90da5ce4 100644 --- a/atvc/examples/ops_pytorch/README.md +++ b/atvc/examples/ops_pytorch/README.md @@ -27,7 +27,7 @@ ops_pytorch/ 不同的算子类型可参考[快速入门](../../docs/01_quick_start.md)中的模版选择模版进行选择,用户在此处通过`<<<>>>`的方式调用核函数,更多ATVC的用法可参考atvc的[开发指南](../../docs/02_developer_guide.md)。 -### 步骤1. 引入头文件。需要注意的是,需要保护对应核函数调用接口声明所在的头文件{kernel_name}_impl.h,kernel_name为算子的核函数名称。 +### 步骤1. 引入头文件。需要注意的是,需要保护对应核函数调用接口声明所在的头文件{kernel_name}_impl.h,kernel_name为算子的核函数名称。 ```cpp // 头文件引入 #include @@ -37,8 +37,8 @@ ops_pytorch/ ### 步骤2. 应用程序框架编写,需要注意的是,本样例输入x,y的内存是在python调用脚本[run_op.py](./add/run_op.py)中分配的。 ```cpp namespace ascendc_elewise_ops { - at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) { - } + at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) { + } } ``` ### 步骤3. 
NPU侧运行验证。通过`<<<>>>`的方式调用核函数完成指定的运算。 diff --git a/atvc/examples/ops_pytorch/add/README.md b/atvc/examples/ops_pytorch/add/README.md index 90c2fe5e7672317b92a5fd98ec289a43a81892c4..bdd35b6bd989db4de2e574457fef1e68cfbd0792 100644 --- a/atvc/examples/ops_pytorch/add/README.md +++ b/atvc/examples/ops_pytorch/add/README.md @@ -39,7 +39,7 @@ z = x + y ### 1. 获取源码包及环境配置 编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 ### 2. 安装PyTorch环境 -参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境 +参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境。 ### 3. 基于ATVC编写PyTorch算子的实现 - 算子kernel侧实现 @@ -139,11 +139,3 @@ z = x + y ```bash cd ./atvc/examples/ops_pytorch/add bash run.sh - ... - OK - ``` - -## 更新说明 -| 时间 | 更新事项 | -| ---------- | ---------------------------- | -| 2025/07/24 | 新增readme | diff --git a/atvc/examples/ops_pytorch/add/add_custom_impl.h b/atvc/examples/ops_pytorch/add/add_custom_impl.h index 62eea16a0beb47e2868fceeada387fb77f35ab67..f2029d5bbb2e001d4e8ee26a61e60d3c3a432c57 100644 --- a/atvc/examples/ops_pytorch/add/add_custom_impl.h +++ b/atvc/examples/ops_pytorch/add/add_custom_impl.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
diff --git a/atvc/examples/ops_pytorch/add/pytorch_ascendc_extension.cpp b/atvc/examples/ops_pytorch/add/pytorch_ascendc_extension.cpp index f7b538067cedad7dc3c5215ed8ad2b91910de669..86cc2b50ea2b27bb26a5cfc3600b61561afe0e99 100644 --- a/atvc/examples/ops_pytorch/add/pytorch_ascendc_extension.cpp +++ b/atvc/examples/ops_pytorch/add/pytorch_ascendc_extension.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/examples/ops_pytorch/add/run.sh b/atvc/examples/ops_pytorch/add/run.sh index f08ce377999b7c644f6d33a11239d6ee9113f897..d5c120d4238ffb685a70cc1f6681b010dffa9f1c 100644 --- a/atvc/examples/ops_pytorch/add/run.sh +++ b/atvc/examples/ops_pytorch/add/run.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. +# This program is free software, you can redistribute it and/or modify it. +# Copyright (c) 2025 Huawei Technologies Co., Ltd. # This file is a part of the CANN Open Software. # Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). # Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/examples/ops_pytorch/add/run_op.py b/atvc/examples/ops_pytorch/add/run_op.py index 52ee702116e83f8f13ec22406c3090f1e17529ae..830d59a48a558b660f10bc395d6edbb5fa04942d 100644 --- a/atvc/examples/ops_pytorch/add/run_op.py +++ b/atvc/examples/ops_pytorch/add/run_op.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 # -*- coding:utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. 
-# +# This program is free software, you can redistribute it and/or modify it. +# Copyright (c) 2025 Huawei Technologies Co., Ltd. # This file is a part of the CANN Open Software. -# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). # Please refer to the License for details. You may not use this file except in compliance with the License. # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. diff --git a/atvc/examples/ops_pytorch/reduce_sum/README.md b/atvc/examples/ops_pytorch/reduce_sum/README.md index 2d3cdbc046d511b76794215cc1b4def31c3c9d47..2fd67d6fef81711c32a1f379074ebac30d1d42d4 100644 --- a/atvc/examples/ops_pytorch/reduce_sum/README.md +++ b/atvc/examples/ops_pytorch/reduce_sum/README.md @@ -36,7 +36,7 @@ ReduceSum是对输入tensor的指定轴进行规约累加的计算并输出结 ### 1. 获取源码包及环境配置 编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 ### 2. 安装PyTorch环境 - 参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境 + 参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境。 ### 3. 基于ATVC编写PyTorch算子的实现 - 算子kernel侧实现 @@ -95,7 +95,7 @@ ReduceSum是对输入tensor的指定轴进行规约累加的计算并输出结 } // namespace reduce } // namespace AscendC ``` - - 编写python调用函数,并调用PyTorch入口函数,参考[run_op.py](./run_op.py) + - 编写python调用函数,并调用PyTorch入口函数,参考[run_op.py](./run_op.py) ```python # 引入头文件 @@ -166,12 +166,5 @@ ReduceSum是对输入tensor的指定轴进行规约累加的计算并输出结 - 调用脚本,生成PyTorch算子,并运行测试用例 ```bash cd ./atvc/examples/ops_pytorch/reduce_sum - bash run.sh - ... 
- OK - ``` - -## 更新说明 -| 时间 | 更新事项 | -| ---------- | ---------------------------- | -| 2025/07/24 | 新增readme | + bash run.sh + diff --git a/atvc/examples/ops_pytorch/reduce_sum/pytorch_ascendc_extension.cpp b/atvc/examples/ops_pytorch/reduce_sum/pytorch_ascendc_extension.cpp index 17500d18f92705f705d9a0fdb0fc6a54444fc10b..d7a4bdd5b3b0d26090129587fbbca233ad1fb127 100644 --- a/atvc/examples/ops_pytorch/reduce_sum/pytorch_ascendc_extension.cpp +++ b/atvc/examples/ops_pytorch/reduce_sum/pytorch_ascendc_extension.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/examples/ops_pytorch/reduce_sum/reduce_sum_impl.h b/atvc/examples/ops_pytorch/reduce_sum/reduce_sum_impl.h index ac90494e06e3ce6dd997c7b84b30d8c48f063cf9..0180bf0569ea5a1d939579836b517de638976529 100644 --- a/atvc/examples/ops_pytorch/reduce_sum/reduce_sum_impl.h +++ b/atvc/examples/ops_pytorch/reduce_sum/reduce_sum_impl.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
diff --git a/atvc/examples/ops_pytorch/reduce_sum/run.sh b/atvc/examples/ops_pytorch/reduce_sum/run.sh index d0d27305d23756de9ada19d33f66e976957434a2..d5c120d4238ffb685a70cc1f6681b010dffa9f1c 100644 --- a/atvc/examples/ops_pytorch/reduce_sum/run.sh +++ b/atvc/examples/ops_pytorch/reduce_sum/run.sh @@ -1,4 +1,5 @@ #!/bin/bash +# This program is free software, you can redistribute it and/or modify it. # Copyright (c) 2025 Huawei Technologies Co., Ltd. # This file is a part of the CANN Open Software. # Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). diff --git a/atvc/examples/ops_pytorch/reduce_sum/run_op.py b/atvc/examples/ops_pytorch/reduce_sum/run_op.py index 67c5e0ad5fb7db1d4970717dcb533d975bb54229..f1ec470120e537073b6788c9be07db9a0d6d082c 100644 --- a/atvc/examples/ops_pytorch/reduce_sum/run_op.py +++ b/atvc/examples/ops_pytorch/reduce_sum/run_op.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding:utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. -# +# This program is free software, you can redistribute it and/or modify it. +# Copyright (c) 2025 Huawei Technologies Co., Ltd. # This file is a part of the CANN Open Software. # Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). # Please refer to the License for details. You may not use this file except in compliance with the License. 
diff --git a/atvc/examples/reduce_max/README.md b/atvc/examples/reduce_max/README.md new file mode 100644 index 0000000000000000000000000000000000000000..49ff5939d981ad883ffac6b29cf4c069ec8e7171 --- /dev/null +++ b/atvc/examples/reduce_max/README.md @@ -0,0 +1,41 @@ + + +## 概述 + +本样例介绍了利用ATVC实现ReduceMax单算子并完成功能验证 + + +## 样例支持产品型号: +- Atlas A2训练系列产品 + +## 目录结构 + +| 文件名 | 描述 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [reduce_max.cpp](./reduce_max.cpp) | ReduceMax算子代码实现以及调用样例 | + +## 算子描述 + +ReduceMax是对输入tensor的指定轴进行规约求最大值的计算并输出结果的Reduce类算子。 + +ReduceMax算子规格: + + + + + + + + + + + + + +
算子类型(OpType)ReduceMax
算子输入
nameshapedata typeformat
x8 * 1024floatND
算子输出
y1 * 1024floatND
核函数名ReduceCustom
+ +## 算子运行 +在ascendc-api-adv代码仓目录下执行: +```bash +cd ./atvc/examples +bash run_examples.sh reduce_max \ No newline at end of file diff --git a/atvc/examples/reduce_max/reduce_max.cpp b/atvc/examples/reduce_max/reduce_max.cpp new file mode 100644 index 0000000000000000000000000000000000000000..660e3f85d238549c0a66b9e757059450a50eb8a5 --- /dev/null +++ b/atvc/examples/reduce_max/reduce_max.cpp @@ -0,0 +1,178 @@ +/** + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" +#include "example_common.h" + +namespace { +// ReduceSum算子的描述:一个输入,一个输出,类型均为float +using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; + +void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); +} +} + +/* + * 该函数为ReduceCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam 指向运行态ATVC::ReduceParam数据 +*/ +template +__global__ __aicore__ void ReduceCustom( + GM_ADDR x, + GM_ADDR y, + ATVC::ReduceParam reduceParam +) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + auto op = ATVC::Kernel::ReduceOpTemplate, Policy>(); + op.Run(x, y, &reduceParam); +} + +namespace { +// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 +template +void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::ReducePolicy &policy, aclrtStream& stream) +{ + // 申请临时空间workspace + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + param.workspaceAddr = reinterpret_cast(workspaceDevice); + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY1) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY2) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY3) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY4) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY5) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY6) { + ReduceCustom<<>>(x, y, param); + } else if (policy == 
ATVC::REDUCE_POLICY7) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY8) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY9) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY10) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY11) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY12) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY13) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY14) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY15) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY16) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY17) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY18) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY19) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY20) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY21) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY22) { + ReduceCustom<<>>(x, y, param); + } else { + (void)printf("Cannot find any matched policy.\n"); + } + // 流同步后释放申请的param内存 + CHECK_ACL(aclrtSynchronizeStream(stream)); + CHECK_ACL(aclrtFree(workspaceDevice)); +} +} + +int32_t main(int32_t argc, char* argv[]) +{ + if (!ATVC::Host::DebugCheck()) { + (void)printf("Reduce OpTraits check failed.\n"); + return -1; + } + int32_t eleNum = 8 * 1024; + int32_t outEleNum = 1 * 1024; + size_t inputByteSize = static_cast(eleNum) * sizeof(float); + size_t outputByteSize = static_cast(outEleNum) * sizeof(float); + std::vector dim{0}; // 对第0轴执行reduce操作 + std::vector shape{8, 1024}; // 测试输入shape + std::vector inputX(eleNum, 1.0f); + std::vector golden(outEleNum, 1.0f); + (void)printf("Generate golden data 
successfully.\n"); + // 初始化Acl资源 + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + uint8_t *yHost; + uint8_t *xDevice; + uint8_t *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::Host::ReduceTilingHyperParam hyperParam; + hyperParam.maxInnerA = 256;// 设置maxInnerA为256 + // Host侧调用Tiling API完成相关运行态参数的运算 + if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m, hyperParam=hyperParam)) { + (void)printf("Reduce tiling error.\n"); + return -1; + }; + + // 调用Adapter调度接口,完成核函数的模板调用 + ReduceOpAdapter(xDevice, yDevice, param, policy, stream); + + CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); + + // 释放Acl资源 + CleanUp(xDevice, yDevice, yHost); + CleanACL(stream, context, deviceId); + + if (!VerifyResults(golden, outputY)) { + return -1; + } + + (void)printf("Accuracy verification passed.\n"); + return 0; +} diff --git a/atvc/examples/reduce_min/README.md b/atvc/examples/reduce_min/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dd8a24248f1ed6a69523ad83e0848e22f7c41822 --- /dev/null +++ b/atvc/examples/reduce_min/README.md @@ -0,0 +1,41 @@ + + +## 概述 + +本样例介绍了利用ATVC实现ReduceMin单算子并完成功能验证 + + +## 样例支持产品型号: +- Atlas A2训练系列产品 + +## 目录结构 + +| 文件名 | 描述 | +| 
------------------------------------------------------------ | ------------------------------------------------------------ | +| [reduce_min.cpp](./reduce_min.cpp) | ReduceMin算子代码实现以及调用样例 | + +## 算子描述 + +ReduceMin是对输入tensor的指定轴进行规约累加的计算并输出结果的Reduce类算子。 + +ReduceMin算子规格: + + + + + + + + + + + + + +
算子类型(OpType)ReduceMin
算子输入
nameshapedata typeformat
x8 * 2048floatND
算子输出
y1 * 2048floatND
核函数名ReduceCustom
+ +## 算子运行 +在ascendc-api-adv代码仓目录下执行: +```bash +cd ./atvc/examples +bash run_examples.sh reduce_min \ No newline at end of file diff --git a/atvc/examples/reduce_min/reduce_min.cpp b/atvc/examples/reduce_min/reduce_min.cpp new file mode 100644 index 0000000000000000000000000000000000000000..83d301504c8bf588bdc79096584711ec76082e7d --- /dev/null +++ b/atvc/examples/reduce_min/reduce_min.cpp @@ -0,0 +1,178 @@ +/** + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" +#include "example_common.h" + +namespace { +// ReduceSum算子的描述:一个输入,一个输出,类型均为float +using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; + +void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); +} +} + +/* + * 该函数为ReduceCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam 指向运行态ATVC::ReduceParam数据 +*/ +template +__global__ __aicore__ void ReduceCustom( + GM_ADDR x, + GM_ADDR y, + ATVC::ReduceParam reduceParam +) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + auto op = ATVC::Kernel::ReduceOpTemplate, Policy>(); + op.Run(x, y, &reduceParam); +} + +namespace { +// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 +template +void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::ReducePolicy &policy, aclrtStream& stream) +{ + // 申请临时空间workspace + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + param.workspaceAddr = reinterpret_cast(workspaceDevice); + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY1) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY2) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY3) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY4) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY5) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY6) { + ReduceCustom<<>>(x, y, param); + } else if (policy == 
ATVC::REDUCE_POLICY7) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY8) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY9) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY10) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY11) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY12) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY13) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY14) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY15) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY16) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY17) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY18) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY19) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY20) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY21) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY22) { + ReduceCustom<<>>(x, y, param); + } else { + (void)printf("Cannot find any matched policy.\n"); + } + // 流同步后释放申请的param内存 + CHECK_ACL(aclrtSynchronizeStream(stream)); + CHECK_ACL(aclrtFree(workspaceDevice)); +} +} + +int32_t main(int32_t argc, char* argv[]) +{ + if (!ATVC::Host::DebugCheck()) { + (void)printf("Reduce OpTraits check failed.\n"); + return -1; + } + int32_t eleNum = 8 * 1024; + int32_t outEleNum = 1 * 1024; + size_t inputByteSize = static_cast(eleNum) * sizeof(float); + size_t outputByteSize = static_cast(outEleNum) * sizeof(float); + std::vector dim{0}; // 对第0轴执行reduce操作 + std::vector shape{8, 1024}; // 测试输入shape + std::vector inputX(eleNum, 1.0f); + std::vector golden(outEleNum, 1.0f); + (void)printf("Generate golden data 
successfully.\n"); + // 初始化Acl资源 + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + uint8_t *yHost; + uint8_t *xDevice; + uint8_t *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::Host::ReduceTilingHyperParam hyperParam; + hyperParam.maxInnerA = 256;// 设置maxInnerA为256 + // Host侧调用Tiling API完成相关运行态参数的运算 + if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m, hyperParam=hyperParam)) { + (void)printf("Reduce tiling error.\n"); + return -1; + }; + + // 调用Adapter调度接口,完成核函数的模板调用 + ReduceOpAdapter(xDevice, yDevice, param, policy, stream); + + CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); + + // 释放Acl资源 + CleanUp(xDevice, yDevice, yHost); + CleanACL(stream, context, deviceId); + + if (!VerifyResults(golden, outputY)) { + return -1; + } + + (void)printf("Accuracy verification passed.\n"); + return 0; +} diff --git a/atvc/examples/reduce_sum/reduce_sum.cpp b/atvc/examples/reduce_sum/reduce_sum.cpp index 9fb9bf4b7c709b9bbfc79c2720569b763add9904..0a7a4aa7b60aefb6da9efd39781fae63f5686da0 100644 --- a/atvc/examples/reduce_sum/reduce_sum.cpp +++ b/atvc/examples/reduce_sum/reduce_sum.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. 
+ * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -102,7 +103,7 @@ void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::Red } else if (policy == ATVC::REDUCE_POLICY22) { ReduceCustom<<>>(x, y, param); } else { - printf("[ERROR]: Cannot find any matched policy.\n"); + printf("Cannot find any matched policy.\n"); } // 流同步后释放申请的param内存 CHECK_ACL(aclrtSynchronizeStream(stream)); @@ -113,7 +114,7 @@ void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::Red int32_t main(int32_t argc, char* argv[]) { if (!ATVC::Host::DebugCheck()) { - printf("[ERROR]: Reduce OpTraits check failed.\n"); + printf("Reduce OpTraits check failed.\n"); return -1; } int32_t eleNum = 8 * 1024; diff --git a/atvc/examples/relu_with_reduce_sum/post_compute_relu_with_reduce_sum.h b/atvc/examples/relu_with_reduce_sum/post_compute_relu_with_reduce_sum.h index 874b1dba96e3ee2283c2b1bde767fbf4375a7d35..61a6f4bc899e99c3b5be2458acfaa87b1dfcd80c 100644 --- a/atvc/examples/relu_with_reduce_sum/post_compute_relu_with_reduce_sum.h +++ b/atvc/examples/relu_with_reduce_sum/post_compute_relu_with_reduce_sum.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
diff --git a/atvc/examples/relu_with_reduce_sum/pre_compute_add_with_reduce_sum.h b/atvc/examples/relu_with_reduce_sum/pre_compute_add_with_reduce_sum.h index 382e019b26c1cf92f4771899f06239438687e021..394419581199d403ce1cf384d5fcb4d59d749a47 100644 --- a/atvc/examples/relu_with_reduce_sum/pre_compute_add_with_reduce_sum.h +++ b/atvc/examples/relu_with_reduce_sum/pre_compute_add_with_reduce_sum.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/examples/relu_with_reduce_sum/relu_with_reduce_sum.cpp b/atvc/examples/relu_with_reduce_sum/relu_with_reduce_sum.cpp index 595e6ddd91d020446b4cd8e87ed73617b5c7d197..35e282e51b5e855db009610d0467e958dcc27c2e 100644 --- a/atvc/examples/relu_with_reduce_sum/relu_with_reduce_sum.cpp +++ b/atvc/examples/relu_with_reduce_sum/relu_with_reduce_sum.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
@@ -112,7 +113,7 @@ void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) int32_t main(int32_t argc, char *argv[]) { if (!ATVC::Host::DebugCheck()) { - printf("[ERROR]: Reduce OpTraits check failed.\n"); + printf("Reduce OpTraits check failed.\n"); return -1; } int32_t eleNum = 8 * 1024; diff --git a/atvc/examples/relu_with_reduce_sum/relu_with_reduce_sum.h b/atvc/examples/relu_with_reduce_sum/relu_with_reduce_sum.h index 117f27ed7e939c50751e98484fcb3ad9bbac05eb..e4e4c657fc59aeae4b42140a41daa15d828b6537 100644 --- a/atvc/examples/relu_with_reduce_sum/relu_with_reduce_sum.h +++ b/atvc/examples/relu_with_reduce_sum/relu_with_reduce_sum.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/examples/run_examples.sh b/atvc/examples/run_examples.sh index 4cd360339c5c431da4997ff35fd450529d2db45f..358b9dc741c181397195efe19dcf31773298cd0c 100644 --- a/atvc/examples/run_examples.sh +++ b/atvc/examples/run_examples.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. +# This program is free software, you can redistribute it and/or modify it. +# Copyright (c) 2025 Huawei Technologies Co., Ltd. # This file is a part of the CANN Open Software. # Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). # Please refer to the License for details. You may not use this file except in compliance with the License. 
@@ -63,7 +64,7 @@ function compile_operator(){ ${COMPILE_TOOL} -x cce -cce-enable-plugin --cce-aicore-arch=dav-c220 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common -ltiling_api -lplatform -lm -ldl -L${_ASCEND_INSTALL_PATH}/lib64 -DATVC_DEBUG_MODE=1 elif [ "$RUN_MODE" = "profiling" ]; then echo "Executing with profiling mode" - ${COMPILE_TOOL} -x cce -cce-enable-plugin --cce-aicore-arch=dav-c220 -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common -ltiling_api -lplatform -lm -ldl -L${_ASCEND_INSTALL_PATH}/lib64 -DATVC_DEBUG_MODE=2 + ${COMPILE_TOOL} -x cce -cce-enable-plugin --cce-aicore-arch=dav-c220 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common -ltiling_api -lplatform -lm -ldl -L${_ASCEND_INSTALL_PATH}/lib64 -DATVC_DEBUG_MODE=2 else echo "--npu-mode is an optional parameter and can be left unset. If set, the value must be debug_print or profiling." echo "Execution example: 'bash run_examples.sh $TEST_NAME --run-mode=debug_print'" diff --git a/atvc/examples/sinh_custom/sinh_custom.cpp b/atvc/examples/sinh_custom/sinh_custom.cpp index 3f0b85eba066e0904826e9606e91883d8173fe27..e6d317e426290151b6ea3b8069cf90af3bcc1cc1 100644 --- a/atvc/examples/sinh_custom/sinh_custom.cpp +++ b/atvc/examples/sinh_custom/sinh_custom.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
diff --git a/atvc/examples/tanh_grad/tanh_grad.cpp b/atvc/examples/tanh_grad/tanh_grad.cpp index d631a14064f37880931d332ea3ecc9368aac5d72..6d6838a582639c41891da5c68dd293aa8bae4b80 100644 --- a/atvc/examples/tanh_grad/tanh_grad.cpp +++ b/atvc/examples/tanh_grad/tanh_grad.cpp @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -106,7 +107,7 @@ __global__ __aicore__ void TanhGrad(GM_ADDR dy, GM_ADDR y, GM_ADDR z, ATVC::EleW int main() { if (!ATVC::Host::DebugCheck()) { - printf("[ERROR]: ElementWise OpTraits check failed.\n"); + printf("ElementWise OpTraits check failed.\n"); return -1; } // totalCnt描述EleWise单输入的元素个数 @@ -124,7 +125,7 @@ int main() ATVC::Host::EleWiseTilingHyperParam hyperParam; hyperParam.singleCoreBaseLine = 1024; // set base count for single core 为1024. if (!ATVC::Host::CalcEleWiseTiling(eleNum, param, hyperParam=hyperParam)) { - printf("[ERROR]: Calculate eleWise tiling failed.\n"); + printf("Calculate eleWise tiling failed.\n"); return -1; }; aclrtContext context; diff --git a/atvc/include/atvc.h b/atvc/include/atvc.h index 066054c24377ac5d4c2ace93c3507602dbd69e7c..f379d2035dcc58add8f03bd39a150758d1d3879c 100644 --- a/atvc/include/atvc.h +++ b/atvc/include/atvc.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). 
* Please refer to the License for details. You may not use this file except in compliance with the License. @@ -28,6 +29,8 @@ #include "common/kernel_utils.h" #include "elewise/elewise_op_template.h" #include "reduce/reduce_sum.h" +#include "reduce/reduce_max.h" +#include "reduce/reduce_min.h" #include "reduce/reduce_op_template.h" #include "broadcast/broadcast_compute.h" #include "broadcast/broadcast_op_template.h" diff --git a/atvc/include/broadcast/broadcast_compute.h b/atvc/include/broadcast/broadcast_compute.h index 58d23a5632b50d49219e9ae765211770007b82c5..7d3bb12059359e17707c2e8ae194fe8d99ef57fd 100644 --- a/atvc/include/broadcast/broadcast_compute.h +++ b/atvc/include/broadcast/broadcast_compute.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/broadcast/broadcast_device.h b/atvc/include/broadcast/broadcast_device.h index 4165c7fd907be7eb4fe81c07263b7b56fe6258d8..21364d6a1ee687af5ffa542a4dbb7553d6fe7df7 100644 --- a/atvc/include/broadcast/broadcast_device.h +++ b/atvc/include/broadcast/broadcast_device.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
diff --git a/atvc/include/broadcast/broadcast_host.h b/atvc/include/broadcast/broadcast_host.h index d025e717650b761becec49621c1c2e960e6daf22..6fa9b3c26be6cfa1c4e060b769eaea7fb589a361 100644 --- a/atvc/include/broadcast/broadcast_host.h +++ b/atvc/include/broadcast/broadcast_host.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/broadcast/broadcast_op_template.h b/atvc/include/broadcast/broadcast_op_template.h index e00c9c5f55c9065caf25d8b7557d5b46cc56e550..1dfb50b38a582fabc290f9064063a03f7a5aa6e0 100644 --- a/atvc/include/broadcast/broadcast_op_template.h +++ b/atvc/include/broadcast/broadcast_op_template.h @@ -1,6 +1,6 @@ - /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/broadcast/common/broadcast_common.h b/atvc/include/broadcast/common/broadcast_common.h index d2f237ca5f849eb66fb31b8c7b1d7fe4a5209be5..33408dd2ae365d17d2a76f16fc3303e46e5a4bd5 100644 --- a/atvc/include/broadcast/common/broadcast_common.h +++ b/atvc/include/broadcast/common/broadcast_common.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. 
+ * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/broadcast/common/patterns.h b/atvc/include/broadcast/common/patterns.h index 57040a6b69e2f86fbc3cc859d109cf5b57aeddbe..5132acd216b0cfbda7f532bb00a6f3c54c452284 100644 --- a/atvc/include/broadcast/common/patterns.h +++ b/atvc/include/broadcast/common/patterns.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/broadcast/tiling/broadcast_tiling.h b/atvc/include/broadcast/tiling/broadcast_tiling.h index 9cac7e523a02077bc029ad31495ed0a80fe2095c..a0981bffde813ab284e2ff237ff05fa29463ce25 100644 --- a/atvc/include/broadcast/tiling/broadcast_tiling.h +++ b/atvc/include/broadcast/tiling/broadcast_tiling.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
diff --git a/atvc/include/broadcast/utils/broadcast_buf_pool.h b/atvc/include/broadcast/utils/broadcast_buf_pool.h index 8fb5a60f0e6689d5e7fb4f9edf4790be94c815fd..9fd9ed8d7ac9a5883ccf56ba47094164f418a325 100644 --- a/atvc/include/broadcast/utils/broadcast_buf_pool.h +++ b/atvc/include/broadcast/utils/broadcast_buf_pool.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/broadcast/utils/broadcast_util.h b/atvc/include/broadcast/utils/broadcast_util.h index 062e4dd3de595f350f9a128514298305862faf16..bba66dfcd5e2f97a4a054c1b072c654273085478 100644 --- a/atvc/include/broadcast/utils/broadcast_util.h +++ b/atvc/include/broadcast/utils/broadcast_util.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/common/atvc_op_check.h b/atvc/include/common/atvc_op_check.h index 3aab4f4a7958186963dedd555a9dcd414a626d9f..a373414c39e565f026caec68aead78250c4fbfbc 100644 --- a/atvc/include/common/atvc_op_check.h +++ b/atvc/include/common/atvc_op_check.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. 
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/common/atvc_opdef.h b/atvc/include/common/atvc_opdef.h index 1417dee725838359ff3b6ba06a2f911bb26d3578..80effed5ecd76ba86538f1f9faa27f7a7ee76aa3 100644 --- a/atvc/include/common/atvc_opdef.h +++ b/atvc/include/common/atvc_opdef.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/common/compile_info.h b/atvc/include/common/compile_info.h index 8fd830b5d1505ed685c8b92ca6df2780a6d1c9a1..03aecdb403a84572e72ae5f47aae0b0f0cbd66ec 100644 --- a/atvc/include/common/compile_info.h +++ b/atvc/include/common/compile_info.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
diff --git a/atvc/include/common/const_def.h b/atvc/include/common/const_def.h index 55b5ac129a2d39d9cbc96c6edb2267d41c08829b..85f5d9d7947d18fcfec068faad4c91df3c171ec1 100644 --- a/atvc/include/common/const_def.h +++ b/atvc/include/common/const_def.h @@ -1,6 +1,6 @@ - /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/common/dtype_utils.h b/atvc/include/common/dtype_utils.h index 3decfb1d8aff7b503340ded64157d99574f67e77..2341f315c3445a99d8592c97fa3b3c49ca248002 100644 --- a/atvc/include/common/dtype_utils.h +++ b/atvc/include/common/dtype_utils.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/common/forward.h b/atvc/include/common/forward.h index aa7922c8380ade8557aba355db2115fee918fbb2..dcbf34beb0c52205c994e577b4d8d6d414d49d27 100644 --- a/atvc/include/common/forward.h +++ b/atvc/include/common/forward.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. 
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/common/index_seq.h b/atvc/include/common/index_seq.h index 1287296d66e0ba591123c7cb55267cf99f3001dc..547a2cc251b4e0f425498cde58c525695b534f5c 100644 --- a/atvc/include/common/index_seq.h +++ b/atvc/include/common/index_seq.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/common/kernel_check_debug.h b/atvc/include/common/kernel_check_debug.h index 6d16bc5034c6e6f5393aa1149cd032fb8562a01c..cfa817e33b7746576129cc82e543dba66346f7e7 100644 --- a/atvc/include/common/kernel_check_debug.h +++ b/atvc/include/common/kernel_check_debug.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/common/kernel_utils.h b/atvc/include/common/kernel_utils.h index 3db939ac33973a65322eccb7a5b0bfce4d535ee2..c639c68f6ed8535ab3f2cb55e8b356e93cde6eb9 100644 --- a/atvc/include/common/kernel_utils.h +++ b/atvc/include/common/kernel_utils.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. 
All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/common/ops_utils_device.h b/atvc/include/common/ops_utils_device.h index 24d36bc1cddc1c2b5e9c488f5659483017553b54..97a2be180eceb17fb2f273346fe24caa449730c0 100644 --- a/atvc/include/common/ops_utils_device.h +++ b/atvc/include/common/ops_utils_device.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/common/ops_utils_host.h b/atvc/include/common/ops_utils_host.h index f37a9e5a7295728ff3713541d6f5e8aabd6399d6..0b2f2f1e0de5c52e9efecfc644b313a3836bf2ae 100644 --- a/atvc/include/common/ops_utils_host.h +++ b/atvc/include/common/ops_utils_host.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
diff --git a/atvc/include/common/platform.h b/atvc/include/common/platform.h index 7db199e4f423b3a4aa1360f7abfdeb44ee38daed..924464b27a9aa6a0bae90f8c65e1e0070643012c 100644 --- a/atvc/include/common/platform.h +++ b/atvc/include/common/platform.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/common/tensor_info.h b/atvc/include/common/tensor_info.h index 711c90dc97eac0f39446c07d91997eb8b804e5f8..1c5575e6a41d0b552b4dccc0346899ccac7e6699 100644 --- a/atvc/include/common/tensor_info.h +++ b/atvc/include/common/tensor_info.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/common/tuple.h b/atvc/include/common/tuple.h index 0bd9a9e62dc15c15f33c28b6147207c9a33cbfd8..110914ac3760354beda0a77a8eb37145d472e1be 100644 --- a/atvc/include/common/tuple.h +++ b/atvc/include/common/tuple.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. 
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/common/type_list.h b/atvc/include/common/type_list.h index 67eac948838bab1ba671d7309faf0661c4175aa0..ff43af9879889d76ec7533a2a7f114ce713e9024 100644 --- a/atvc/include/common/type_list.h +++ b/atvc/include/common/type_list.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/elewise/common/elewise_common.h b/atvc/include/elewise/common/elewise_common.h index 126ab4c8786458ca4bbcf3af7e1781a9f80b87fb..2f37468c3d6edbf628b778d2c92db98f68143cb7 100644 --- a/atvc/include/elewise/common/elewise_common.h +++ b/atvc/include/elewise/common/elewise_common.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
diff --git a/atvc/include/elewise/elewise_device.h b/atvc/include/elewise/elewise_device.h index 8901f2642d92977b00eb12285052a67e209e4bdd..5bb55b37fc3f23cb5aef833e3a033f1be55d6756 100644 --- a/atvc/include/elewise/elewise_device.h +++ b/atvc/include/elewise/elewise_device.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/elewise/elewise_host.h b/atvc/include/elewise/elewise_host.h index 9f73f09f8c69ce844244906f62b60d9c13bfa436..7d6301f0571bc86b2f33f3f34f659101fabb0a31 100644 --- a/atvc/include/elewise/elewise_host.h +++ b/atvc/include/elewise/elewise_host.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/elewise/elewise_op_template.h b/atvc/include/elewise/elewise_op_template.h index 708a560d0b4b52e495f9f560347ed273af42a991..95f81efcae03e4252765e58edf9d9298eba93e34 100644 --- a/atvc/include/elewise/elewise_op_template.h +++ b/atvc/include/elewise/elewise_op_template.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. 
* This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/elewise/utils/elewise_util.h b/atvc/include/elewise/utils/elewise_util.h index 0ff57fb29683f049842ee3e47584712ddbb7166f..223b9f12a16734504afca640b4aa1b8a0e65d198 100644 --- a/atvc/include/elewise/utils/elewise_util.h +++ b/atvc/include/elewise/utils/elewise_util.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/reduce/common/patterns.h b/atvc/include/reduce/common/patterns.h index 2718ed609592742317cf793db1c5c7e8a30048f4..af290ffb85564fedbad35ed1471b6351d2a297b1 100644 --- a/atvc/include/reduce/common/patterns.h +++ b/atvc/include/reduce/common/patterns.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
diff --git a/atvc/include/reduce/common/reduce_common.h b/atvc/include/reduce/common/reduce_common.h index d85353e8f249462d1e9089931c1f6301f6c44311..aa40bbb74c1657e454a81272ce39c36be3811334 100644 --- a/atvc/include/reduce/common/reduce_common.h +++ b/atvc/include/reduce/common/reduce_common.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -28,6 +29,18 @@ enum ShapeDim { DIM_BROADCAST // Broadcast axis }; +struct ReduceARParam { + uint32_t repStride = 0; + uint16_t dimA = 0; + uint16_t dimMax = 0; + uint16_t mainR = 0; + uint16_t tailR = 0; + uint64_t maskAddRNum = 0; + uint16_t loopRNum = 0; + uint16_t dtypeSize = 0; + uint16_t dimR = 0; +}; + namespace AR_PATTERN { static constexpr uint32_t A = 100; static constexpr uint32_t AR = 11; diff --git a/atvc/include/reduce/reduce_device.h b/atvc/include/reduce/reduce_device.h index ac454343399c9e9b730f0e4476852fd3112ea306..2fc4004f4fe240a4b6078c557e41af433fe66de4 100644 --- a/atvc/include/reduce/reduce_device.h +++ b/atvc/include/reduce/reduce_device.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
@@ -24,6 +25,8 @@ #include "common/kernel_utils.h" #include "reduce/reduce_sum.h" +#include "reduce/reduce_max.h" +#include "reduce/reduce_min.h" #include "reduce/reduce_op_template.h" #endif // ATVC_REDUCE_DEVICE_H \ No newline at end of file diff --git a/atvc/include/reduce/reduce_host.h b/atvc/include/reduce/reduce_host.h index 2e2d5ea64ae2839f621d15bcc78c3aad43e881a1..a3e4fc9544d9df831d32f4a0de7bfec15083e423 100644 --- a/atvc/include/reduce/reduce_host.h +++ b/atvc/include/reduce/reduce_host.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/reduce/reduce_max.h b/atvc/include/reduce/reduce_max.h new file mode 100644 index 0000000000000000000000000000000000000000..886266d1cae27addca9d5721540a30795a203e6f --- /dev/null +++ b/atvc/include/reduce/reduce_max.h @@ -0,0 +1,350 @@ +/** + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef ATVC_REDUCE_MAX_H +#define ATVC_REDUCE_MAX_H + +#include "common/kernel_utils.h" +#include "reduce/common/patterns.h" +#include "reduce/utils/reduce_block_aux_util.h" +#include "reduce/common/reduce_common.h" + +namespace ATVC { +/*! + * ReduceMaxCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ +template +class ReduceMaxCompute { +public: + // Extract operator input description information from OpTraits + using inputDTypeList = typename OpTraits::In::types; + using DataType = typename ATVC::TypeListGet::Type; + using PrompteDtype = typename KernelUtils::GetPromoteType::T; + __aicore__ inline ReduceMaxCompute() {} + + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, true when UB alignment introduced invalid lanes. + * \tparam Pattern, one of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. + * \param[out] dst, destination tensor (length == dimA) + * \param[in] src, source tensor (length == dimA * dimR) + */ + template + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) + { + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. Alignment with zero padding (padding value is determined by the GetAddingValue() interface + // implemented by the user) + if (std::is_same::value) { + if constexpr (needMask) { // 1. 
High performance alignment mode + // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary + // length (number of elements), where dimR is the original number of elements + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); + } else { + // MainR: The framework provides the calculation of the R-axis binary length (number of elements), + // where dimR is the number of elements after completion + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); + } + } + if (std::is_same::value) { + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false); + ReduceRA(dst, src, shape.value[1], shape.value[0], mainR); + } + } + + /*! + * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis. + * \param[out] dst, output tensor (length == dimA) + * \param[in] src, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ dimR (computed by the caller) + */ + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) + { + uint32_t totalNum = dimR * dimA; + uint32_t mainNum = dimA * mainR; + constexpr uint32_t dtypeSize = sizeof(PrompteDtype); + uint32_t tailNum = totalNum - mainNum; + // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes + constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; + uint16_t repeatTimes = tailNum / maskAddNum; + uint16_t repeatNum = repeatTimes * maskAddNum; + uint16_t repTailNum = tailNum - repeatNum; + // Same data block step size between different iterations + uint32_t repStride = dtypeSize * 
maskAddNum / UB_ALIGN_32; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); + if (repeatTimes > 0) { + AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + uint16_t loopRNum = mainR; + while (loopRNum > 1) { + loopRNum = loopRNum >> 1; + mainNum = loopRNum * dimA; // The first half of LoopR's data volume + repeatTimes = mainNum / maskAddNum; + repeatNum = repeatTimes * maskAddNum; + repTailNum = mainNum - repeatNum; + if (repeatTimes > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + } + AscendC::DataCopy(dst, src, dimA); + } + + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. 
+ * \param[out] dstTensor, output tensor (length == dimA) + * \param[in] srcTensor, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, padded length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ original R length + * \param[in] oriBurstLen, original (un-padded) R length used to compute tail + */ + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) + { + uint16_t tailR = oriBurstLen - mainR; + constexpr uint16_t dtypeSize = sizeof(PrompteDtype); + uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32; + uint16_t dimMax = dimA * dimR; + constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + + if (mainR > 0 && tailR > 0) { + PerformInitialMax(srcTensor, param); + } + + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2U; + PerformBinaryReduction(srcTensor, param); + } + if (param.loopRNum == 0) { // small shape, directly reduce + param.loopRNum = tailR; + } + PerformFinalReduction(dstTensor, srcTensor, param); + } + + /*! + * \brief Merge the calculation results of different data base blocks within a single UB + * \tparam Pattern Compile-time pattern tag that decides A vs. B orientation. + * \tparam V Shape descriptor (encodes dimA and dimB at runtime). + * \param[in] index, logical index identifying the data-base block. + * \param[in] shape, runtime tensor shape (dimA, dimB). + * \param[in] tempBuf, UB tensor serving as the reduction cache. + * \param[in] computeRes, UB tensor holding the newest partial result. 
+ */ + template + __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, + const AscendC::LocalTensor& computeRes) + { + int64_t cacheID = KernelUtils::Reduce::GetCacheID(index); + int64_t dimA = Pattern::TailA ? shape.value[1] : shape.value[0]; + int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); + int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; + uint16_t outerLoopTimes = OpsUtils::CeilDiv( + static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); + uint16_t innerLoopTimes = cacheID; + uint32_t outerLoopStride = element_one_repeat; + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis + AscendC::LocalTensor dstTensor = tempBuf; + AscendC::LocalTensor srcTensor = computeRes; + uint32_t cah = cacheID * stride; + + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA + uint32_t srcIdx = i * outerLoopStride; + for (uint16_t j = 0; j < innerLoopTimes; ++j) { + AscendC::Max(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); + AscendC::PipeBarrier(); + } + DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); + } + } + + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. + * \param[in] ubTensorLeft, left operand (in-place result). + * \param[in] ubTensorRight, right operand (read-only). + * \param[in] calCount, number of elements to reduce. + */ + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) + { + AscendC::Max(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); + } + + /*! + * \brief Return the value used for padding when UB alignment is required. 
+ * For MAX-reduction the neutral element is -∞ or 0. + * \tparam U, scalar type identical to DataType or PromoteDataType. + * \return The padding value (-∞ or 0). + */ + template + __aicore__ inline U GetPaddingValue() + { + // Due to the fact that ReduceMax accumulates R-axis data, the values of the supplemented elements + // are set to -∞ or 0 to ensure that the accumulated result is not affected + if(AscendC::IsSameType::value){ + return INT32_MIN; + }else if(AscendC::IsSameType::value){ + return 0; + }else{ + return -1.0f / 0.0f; + } + } + +private: + __aicore__ inline void PerformInitialMax(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Max(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Max(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * 
param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Max(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Max(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMax repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? 
reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMax( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // Cast to float for higher-precision accumulation + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMax repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMax( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } +}; +} // namespace ATVC + +#endif // ATVC_REDUCE_MAX_H diff --git a/atvc/include/reduce/reduce_min.h b/atvc/include/reduce/reduce_min.h new file mode 100644 index 0000000000000000000000000000000000000000..af2481faddc6aca35934d99c9adf816ab1fca331 --- /dev/null +++ b/atvc/include/reduce/reduce_min.h @@ -0,0 +1,350 @@ +/** + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATVC_REDUCE_MIN_H +#define ATVC_REDUCE_MIN_H + +#include "common/kernel_utils.h" +#include "reduce/common/patterns.h" +#include "reduce/utils/reduce_block_aux_util.h" +#include "reduce/common/reduce_common.h" + +namespace ATVC { +/*! + * ReduceMinCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ +template +class ReduceMinCompute { +public: + // Extract operator input description information from OpTraits + using inputDTypeList = typename OpTraits::In::types; + using DataType = typename ATVC::TypeListGet::Type; + using PrompteDtype = typename KernelUtils::GetPromoteType::T; + __aicore__ inline ReduceMinCompute() {} + + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, true when UB alignment introduced invalid lanes. + * \tparam Pattern, one of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. + * \param[out] dst, destination tensor (length == dimA) + * \param[in] src, source tensor (length == dimA * dimR) + */ + template + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) + { + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. 
High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. Alignment with zero padding (padding value is determined by the GetAddingValue() interface + // implemented by the user) + if (std::is_same::value) { + if constexpr (needMask) { // 1. High performance alignment mode + // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary + // length (number of elements), where dimR is the original number of elements + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); + } else { + // MainR: The framework provides the calculation of the R-axis binary length (number of elements), + // where dimR is the number of elements after completion + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); + } + } + if (std::is_same::value) { + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false); + ReduceRA(dst, src, shape.value[1], shape.value[0], mainR); + } + } + + /*! + * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis. 
+ * \param[out] dst, output tensor (length == dimA) + * \param[in] src, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ dimR (computed by the caller) + */ + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) + { + uint32_t totalNum = dimR * dimA; + uint32_t mainNum = dimA * mainR; + constexpr uint32_t dtypeSize = sizeof(PrompteDtype); + uint32_t tailNum = totalNum - mainNum; + // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes + constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; + uint16_t repeatTimes = tailNum / maskAddNum; + uint16_t repeatNum = repeatTimes * maskAddNum; + uint16_t repTailNum = tailNum - repeatNum; + // Same data block step size between different iterations + uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); + if (repeatTimes > 0) { + AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + uint16_t loopRNum = mainR; + while (loopRNum > 1) { + loopRNum = loopRNum >> 1; + mainNum = loopRNum * dimA; // The first half of LoopR's data volume + repeatTimes = mainNum / maskAddNum; + repeatNum = repeatTimes * maskAddNum; + repTailNum = 
mainNum - repeatNum; + if (repeatTimes > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + } + AscendC::DataCopy(dst, src, dimA); + } + + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. + * \param[out] dstTensor, output tensor (length == dimA) + * \param[in] srcTensor, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, padded length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ original R length + * \param[in] oriBurstLen, original (un-padded) R length used to compute tail + */ + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) + { + uint16_t tailR = oriBurstLen - mainR; + constexpr uint16_t dtypeSize = sizeof(PrompteDtype); + uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32; + uint16_t dimMax = dimA * dimR; + constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + + if (mainR > 0 && tailR > 0) { + 
PerformInitialMin(srcTensor, param); + } + + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2U; + PerformBinaryReduction(srcTensor, param); + } + if (param.loopRNum == 0) { // small shape, directly reduce + param.loopRNum = tailR; + } + PerformFinalReduction(dstTensor, srcTensor, param); + } + + /*! + * \brief Merge the calculation results of different data base blocks within a single UB + * \tparam Pattern Compile-time pattern tag that decides A vs. B orientation. + * \tparam V Shape descriptor (encodes dimA and dimB at runtime). + * \param[in] index, logical index identifying the data-base block. + * \param[in] shape, runtime tensor shape (dimA, dimB). + * \param[in] tempBuf, UB tensor serving as the reduction cache. + * \param[in] computeRes, UB tensor holding the newest partial result. + */ + template + __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, + const AscendC::LocalTensor& computeRes) + { + int64_t cacheID = KernelUtils::Reduce::GetCacheID(index); + int64_t dimA = Pattern::TailA ? 
shape.value[1] : shape.value[0]; + int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); + int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; + uint16_t outerLoopTimes = OpsUtils::CeilDiv( + static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); + uint16_t innerLoopTimes = cacheID; + uint32_t outerLoopStride = element_one_repeat; + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis + AscendC::LocalTensor dstTensor = tempBuf; + AscendC::LocalTensor srcTensor = computeRes; + uint32_t cah = cacheID * stride; + + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA + uint32_t srcIdx = i * outerLoopStride; + for (uint16_t j = 0; j < innerLoopTimes; ++j) { + AscendC::Min(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); + AscendC::PipeBarrier(); + } + DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); + } + } + + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. + * \param[in] ubTensorLeft, left operand (in-place result). + * \param[in] ubTensorRight, right operand (read-only). + * \param[in] calCount, number of elements to reduce. + */ + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) + { + AscendC::Min(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); + } + + /*! + * \brief Return the value used for padding when UB alignment is required. + * For MIN-reduction the neutral element is +∞ or 0. + * \tparam U, scalar type identical to DataType or PromoteDataType. + * \return The padding value (+∞ or 0). 
+ */ + template + __aicore__ inline U GetPaddingValue() + { + // Due to the fact that ReduceMin accumulates R-axis data, the values of the supplemented elements + // are set to +∞ or 0 to ensure that the accumulated result is not affected + if(AscendC::IsSameType::value){ + return INT32_MAX; + }else if(AscendC::IsSameType::value){ + return (uint32_t)INT32_MAX - (uint32_t)INT32_MIN; + }else{ + return 1.0f / 0.0f; + } + } + +private: + __aicore__ inline void PerformInitialMin(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Min(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Min(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, 
dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Min(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Min(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMin repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? 
reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMin( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // Cast to float for higher-precision accumulation + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMin repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMin( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } +}; +} // namespace ATVC + +#endif // ATVC_REDUCE_MIN_H diff --git a/atvc/include/reduce/reduce_op_template.h b/atvc/include/reduce/reduce_op_template.h index 37a19be9dd94e9b1900e89cc1ec0bb4656f7666e..ded068f0a0486cdf1129d303feadf241b9272ae0 100644 --- a/atvc/include/reduce/reduce_op_template.h +++ b/atvc/include/reduce/reduce_op_template.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. 
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -148,7 +149,9 @@ public: template __aicore__ inline void AllocTensorAux(AscendC::LocalTensor& tensor) { - bufPool_.AllocTensor(tensor); + T DupValue = needDup ? compute_.template GetPaddingValue() : 0; + bufPool_.AllocTensor(tensor, DupValue); + // bufPool_.AllocTensor(tensor); } /*! diff --git a/atvc/include/reduce/reduce_sum.h b/atvc/include/reduce/reduce_sum.h index b94ebad57be1d72b7b8a75ce93af0c218d9e9640..2ef9024ad64f267e5cdc532b8acecb167ddbc847 100644 --- a/atvc/include/reduce/reduce_sum.h +++ b/atvc/include/reduce/reduce_sum.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -14,20 +15,7 @@ #include "common/kernel_utils.h" #include "reduce/common/patterns.h" #include "reduce/utils/reduce_block_aux_util.h" - -namespace { -struct ReduceARParam { - uint32_t repStride = 0; - uint16_t dimA = 0; - uint16_t dimMax = 0; - uint16_t mainR = 0; - uint16_t tailR = 0; - uint64_t maskAddRNum = 0; - uint16_t loopRNum = 0; - uint16_t dtypeSize = 0; - uint16_t dimR = 0; -}; -} +#include "reduce/common/reduce_common.h" namespace ATVC { /*! 
diff --git a/atvc/include/reduce/tiling/reduce_tiling.h b/atvc/include/reduce/tiling/reduce_tiling.h index 0ff53d130fcd84877fbf8bb92da5e2fa8fb4c249..98dfdea75f245c621b9c9881a16d349fb888d1b8 100644 --- a/atvc/include/reduce/tiling/reduce_tiling.h +++ b/atvc/include/reduce/tiling/reduce_tiling.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/reduce/tiling/tiling_common.h b/atvc/include/reduce/tiling/tiling_common.h index a0d55d690f4a7b89c5fa67a90f6aa0ddf5c7f2bc..e5bd5dde1f8604217ef710d9c22ddc8b22c77a14 100644 --- a/atvc/include/reduce/tiling/tiling_common.h +++ b/atvc/include/reduce/tiling/tiling_common.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/reduce/utils/reduce_block_aux.h b/atvc/include/reduce/utils/reduce_block_aux.h index 2fa2359c22fa90e2d5a7b1f0df42f7e2bdb073d0..c86ddfdbc3cc3789bfce89d25e6b6d8701f85666 100644 --- a/atvc/include/reduce/utils/reduce_block_aux.h +++ b/atvc/include/reduce/utils/reduce_block_aux.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. 
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -235,7 +236,7 @@ public: computeTensor = ubTensor; } else { // The index of AlloccomputeTensorAux does not require external perception - op_->ReduceOp::template AllocTensorAux(computeTensor); + op_->ReduceOp::template AllocTensorAux(computeTensor); CopyIn(view, shape, ubTensor); SetEvent(AscendC::HardEvent::MTE2_V); AscendC::Cast(computeTensor, ubTensor, AscendC::RoundMode::CAST_NONE, shape.value[0] * shape.value[1]); @@ -247,14 +248,21 @@ public: __aicore__ inline void LinearComputeR(int64_t& tmpBufOffest, V& shape, Args... args) { SliceView view; + bool needDup = false; for (int64_t i = 0; i < bisectionTail; i++) { AscendC::LocalTensor tensorLeft; - op_->ReduceOp::template AllocTensorAux(tensorLeft); + op_->ReduceOp::template AllocTensorAux(tensorLeft); AscendC::LocalTensor computeLeft; PrePareReduce<(!InnerPattern::TailA), false>(i, view, shape, tensorLeft, computeLeft); AscendC::LocalTensor tensorRight; - op_->ReduceOp::template AllocTensorAux(tensorRight); + needDup = i == bisectionTail - 1; + if(needDup){ + op_->ReduceOp::template AllocTensorAux(tensorRight); + }else{ + op_->ReduceOp::template AllocTensorAux(tensorRight); + } + // op_->ReduceOp::template AllocTensorAux(tensorRight); AscendC::LocalTensor computeRight; PrePareReduce<(!InnerPattern::TailA), true>(i, view, shape, tensorRight, computeRight); ComputeMerge(shape, computeLeft, computeRight, args...); @@ -267,7 +275,13 @@ public: for (int64_t i = bisectionTail; i < bisectionPos; i++) { AscendC::LocalTensor tensor; - op_->ReduceOp::template AllocTensorAux(tensor); + needDup = i == bisectionPos -1; + if(needDup){ + op_->ReduceOp::template AllocTensorAux(tensor); + }else{ + op_->ReduceOp::template 
AllocTensorAux(tensor); + } + // op_->ReduceOp::template AllocTensorAux(tensor); AscendC::LocalTensor computeLeft; PrePareReduce<(!InnerPattern::TailA && Pattern::Dim > 2), false>(i, view, shape, tensor, computeLeft); Compute(shape, computeLeft, args...); diff --git a/atvc/include/reduce/utils/reduce_block_aux_util.h b/atvc/include/reduce/utils/reduce_block_aux_util.h index e0c625e79c7128358e958d64b417b5da3c678af8..83f0321f87da722a25979ac801d5dd4459aade71 100644 --- a/atvc/include/reduce/utils/reduce_block_aux_util.h +++ b/atvc/include/reduce/utils/reduce_block_aux_util.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. diff --git a/atvc/include/reduce/utils/reduce_buf_pool.h b/atvc/include/reduce/utils/reduce_buf_pool.h index 26d2ef4ab8f991d1276db8da26090cd66bc860be..e4e49910a6f532c4a02d506cb6157cdae865daab 100644 --- a/atvc/include/reduce/utils/reduce_buf_pool.h +++ b/atvc/include/reduce/utils/reduce_buf_pool.h @@ -1,5 +1,6 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
@@ -62,7 +63,10 @@ public: // Init buffer GetTPipePtr()->InitBuffer(qQue_, poolSize); AscendC::LocalTensor inputUb = qQue_.GetWithOffset(basicNum_ * inputNum, 0); - AscendC::Duplicate(inputUb, 0, basicNum_ * inputNum); + // AscendC::Duplicate(inputUb, 0, basicNum_ * inputNum); + for(int16_t i =0;i - __aicore__ inline const void AllocTensor(AscendC::LocalTensor& tensor) { + __aicore__ inline const void AllocTensor(AscendC::LocalTensor& tensor, T DupValue) { if constexpr (temp){ if constexpr (IsInput) { int32_t idx = GetPreTensorId(); @@ -96,11 +100,16 @@ public: int32_t idx = GetInputTensorId(); tensor = qQue_.GetWithOffset(basicNum_, inputUnit_.offset + idx * basicNum_ * sizeof(T)); if constexpr (needDup) { - AscendC::Duplicate(tensor, 0, basicNum_); + AscendC::PipeBarrier(); + AscendC::Duplicate(tensor, DupValue, basicNum_); + } + if(usedTBuf_[idx] || needDup){ event_t allocEventId = static_cast(GetTPipePtr()->FetchEventID()); eventIdV2Mte2_[idx] = allocEventId; + needWaitFlag_[idx] = true; AscendC::SetFlag(allocEventId); } + usedTBuf_[idx] = true; } else { int32_t idx = GetComputeTensorId(); tensor = qQue_.GetWithOffset(basicNum_, computeUnit_.offset + idx * basicNum_ * sizeof(T)); @@ -119,7 +128,11 @@ public: uint64_t offset = (uint64_t)(tensor.GetPhyAddr()); if (offset - start < computeUnit_.offset) { int32_t idx = (offset - start) / sizeof(T) / basicNum_; - AscendC::WaitFlag(eventIdV2Mte2_[idx]); + // AscendC::WaitFlag(eventIdV2Mte2_[idx]); + if(needWaitFlag_[idx]){ + AscendC::WaitFlag(eventIdV2Mte2_[idx]); + needWaitFlag_[idx] = false; + } } } @@ -153,6 +166,8 @@ private: PoolManagerUnit computeUnit_; PoolManagerUnit postUnit_; event_t eventIdV2Mte2_[MAX_INPUT_SIZE]; + bool needWaitFlag_[MAX_INPUT_SIZE]; + bool usedTBuf_[MAX_INPUT_SIZE]; AscendC::TBuf<> qQue_; int32_t basicNum_; }; // class ReduceBufPool diff --git a/atvc/include/reduce/utils/reduce_util.h b/atvc/include/reduce/utils/reduce_util.h index 
f0458bc220a08be289abe39d5cd46a9a27fa203d..7405037b02b86bc644d208de7228b68cfe26b8c9 100644 --- a/atvc/include/reduce/utils/reduce_util.h +++ b/atvc/include/reduce/utils/reduce_util.h @@ -1,10 +1,11 @@ /** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO + * NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
*/ diff --git a/cmake/scripts/gen_kernel_tiling_data_def.py b/cmake/scripts/gen_kernel_tiling_data_def.py index 22394286241ddb8ffcf9efdb19691739372f9a16..c8e08618ba74b0f1e8738196aaeba3b29c94a5d3 100644 --- a/cmake/scripts/gen_kernel_tiling_data_def.py +++ b/cmake/scripts/gen_kernel_tiling_data_def.py @@ -15,13 +15,52 @@ import os import re +_NAMESPACE = "AscendC::tiling" +_LEGACY_TILING_STRUCTS = [ + "LogSoftMaxTiling", + "SoftMaxTiling", + "TConv3DApiTiling", + "TConv3DBpFilterTiling", + "Conv3DBpFilterParams", + "TConv3DBpFilterBasicBlockTiling", + "Conv3DBackpropFilterTilingData", + "TConv3DBackpropInputTiling", + "Conv3DBackpropInputTilingData", + "Mc2ServerCfg", + "Mc2HcommCfg", + "Mc2InitTiling", + "Mc2CcTiling", + "TCubeTiling", + "BatchNormTiling", + "DeepNormTiling", + "GroupNormTiling", + "LayerNormGradBetaTiling", + "LayerNormGradTiling", + "LayerNormTiling", + "LayerNormSeparateTiling", + "RmsNormTiling", + "UnPadTiling", + "PadTiling", + "TopkTiling", + "ConfusionTransposeTiling" +] + + def gen_tiling(tiling_header_file): single_tiling_source = "" + single_legacy_tiling_export = "" if not os.path.exists(tiling_header_file): print("warning: no userdef tiling header file: ", tiling_header_file) return single_tiling_source print("generate tiling def header file: ", tiling_header_file) pattern = re.compile(r'[(](.*)[)]', re.S) + + def parse_legacy_tiling(struct_def): + # export legacy tiling structs with 'using namespace' to ensure compatibility + nonlocal single_legacy_tiling_export + if struct_def in _LEGACY_TILING_STRUCTS: + single_legacy_tiling_export += f"using {_NAMESPACE}::{struct_def};\n" + with open(tiling_header_file, 'r') as fd: lines = fd.readlines() for line in lines: @@ -29,8 +68,9 @@ def gen_tiling(tiling_header_file): if (line.startswith('BEGIN_TILING_DATA_DEF')): single_tiling_source += '#pragma pack(push, 8)\n' single_tiling_source += 'struct ' - struct_def = re.findall(pattern, line)[0] + struct_def = re.findall(pattern, line)[0] 
single_tiling_source += struct_def + ' {\n' + parse_legacy_tiling(struct_def) elif (line.startswith('TILING_DATA_FIELD_DEF_ARR')): field_params = re.findall(pattern, line)[0] fds = field_params.split(',') @@ -46,7 +86,7 @@ def gen_tiling(tiling_header_file): elif (line.startswith('END_TILING_DATA_DEF')): single_tiling_source += '};\n' single_tiling_source += '#pragma pack(pop)\n' - return single_tiling_source + return single_tiling_source, single_legacy_tiling_export @@ -62,6 +102,7 @@ if __name__ == '__main__': #endif """ + res += "namespace AscendC {\nnamespace tiling {\n" print("[LOG]: ", sys.argv[1], sys.argv[2], sys.argv[3]) src_tiling_data_path = sys.argv[1] file_list = [] @@ -79,8 +120,17 @@ if __name__ == '__main__': if file.endswith("tilingdata.h") and file not in file_set: file_list.append(os.path.join(root, file)) file_list.sort() + + tiling_source = "" + legacy_tiling_export = "" for file in file_list: - res += gen_tiling(file) + src, exp = gen_tiling(file) + tiling_source += src + legacy_tiling_export += exp + + res += tiling_source + "} // namespace tiling\n} // namespace AscendC\n\n" + res += legacy_tiling_export + res += '#endif\n' generate_file = sys.argv[3] diff --git a/impl/activation/softmax/membase/common/simple_softmax_common_impl.h b/impl/activation/softmax/membase/common/simple_softmax_common_impl.h index 3c7507a6c9fcb8cf2ec32e33254b13227e5c7260..2d9b00fceb135d01774d604039acff0d76770885 100644 --- a/impl/activation/softmax/membase/common/simple_softmax_common_impl.h +++ b/impl/activation/softmax/membase/common/simple_softmax_common_impl.h @@ -308,17 +308,11 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, PipeBarrier(); Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } else { - uint32_t splitK = 0; - if constexpr (config.oriSrcK % HALF_NUM_PER_BLK == 0) { - splitK = config.oriSrcK; - } else { - splitK = AlignUp(config.oriSrcK, HALF_NUM_PER_BLK); - } Cast(tmpBuffer0, src[offset1], 
RoundMode::CAST_NONE, splitSize); Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize); PipeBarrier(); - GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, splitK, + GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.splitK, DEFAULT_REPEAT_STRIDE * HALF_FACTOR); PipeBarrier(); @@ -326,7 +320,7 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize); PipeBarrier(); - GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, splitK, + GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.splitK, DEFAULT_REPEAT_STRIDE * HALF_FACTOR); PipeBarrier(); @@ -348,18 +342,12 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, PipeBarrier(); GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); } else { - uint32_t splitK = 0; - if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { - splitK = config.oriSrcK; - } else { - splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); - } - GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, splitK, + GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, tiling.splitK, DEFAULT_REPEAT_STRIDE); PipeBarrier(); Exp(dst[offset1], dst[offset1], splitSize); PipeBarrier(); - GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, splitK, + GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.splitK, DEFAULT_REPEAT_STRIDE); } } @@ -461,20 +449,14 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, PipeBarrier(); Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } else { - uint32_t splitK = 0; - if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { - splitK = config.oriSrcK; - } else { - splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); - } 
Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); PipeBarrier(); - GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, splitK, + GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, tiling.splitK, DEFAULT_REPEAT_STRIDE); PipeBarrier(); Exp(tmpBuffer0, tmpBuffer0, tiling.splitSize); PipeBarrier(); - GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, splitK, + GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, tiling.splitK, DEFAULT_REPEAT_STRIDE); PipeBarrier(); Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); diff --git a/impl/math/exp/exp_common_impl.h b/impl/math/exp/exp_common_impl.h index 6b4b16f6c072853e147f43b248bc02b280919867..69d7a98855c5917615eb22431c16a326e5c0383a 100644 --- a/impl/math/exp/exp_common_impl.h +++ b/impl/math/exp/exp_common_impl.h @@ -166,7 +166,7 @@ __aicore__ inline void ExpHighPrecisionExec(const ExpParams& params, uint PipeBarrier(); // FloorXPow: (x ^ (n+1)) / n! * [1 / (n+1)] -> (x ^ (n+1)) / (n+1)! 
- Muls(params.tempTensorFloorXPow, params.tempTensorFloorXPow, float(1.0) / float(i), + Muls(params.tempTensorFloorXPow, params.tempTensorFloorXPow, static_cast(1.0) / static_cast(i), MASK_PLACEHOLDER, 1, unaryParams); PipeBarrier(); diff --git a/impl/matmul/kfc/matmul_server.h b/impl/matmul/kfc/matmul_server.h index 3745d9dfc0d1c3459fa2f9b0cc36f23980463015..d6469de58e91245ed15e1f95d90dcd7566522411 100644 --- a/impl/matmul/kfc/matmul_server.h +++ b/impl/matmul/kfc/matmul_server.h @@ -17,6 +17,7 @@ #include "matmul_server_utils.h" #include "../utils/matmul_config_utils.h" +#include "../utils/matmul_utils.h" namespace AscendC { @@ -131,56 +132,6 @@ public: mul.SetBatchNum(body->batchA, body->batchB); } - __aicore__ inline void CalcBatchoffset(uint32_t batchA, uint32_t batchB, uint32_t batchC, BmmOffset& batchOffset) - { - if constexpr (A_TYPE::layout == LayoutMode::BNGS1S2 || A_TYPE::layout == LayoutMode::NORMAL) { - batchOffset.offA = tiling_.GetALayoutInfoD() * tiling_.GetALayoutInfoS() * batchA * sizeof(typename A_TYPE::T); - } else if constexpr (A_TYPE::layout == LayoutMode::SBNGD) { - batchOffset.offA = tiling_.GetALayoutInfoD() * batchA * sizeof(typename A_TYPE::T); - } - if constexpr (B_TYPE::layout == LayoutMode::BNGS1S2 || B_TYPE::layout == LayoutMode::NORMAL) { - batchOffset.offB = tiling_.GetBLayoutInfoD() * tiling_.GetBLayoutInfoS() * batchB * sizeof(typename B_TYPE::T); - } else { - batchOffset.offB = tiling_.GetBLayoutInfoD() * batchB * sizeof(typename B_TYPE::T); - } - - if constexpr (C_TYPE::layout == LayoutMode::BNGS1S2 || C_TYPE::layout == LayoutMode::NORMAL) { - batchOffset.offC = tiling_.GetCLayoutInfoS2() * tiling_.GetCLayoutInfoS1() * batchC * sizeof(typename C_TYPE::T); - } else { - batchOffset.offC = tiling_.GetCLayoutInfoS2() * batchC * sizeof(typename C_TYPE::T); - } - } - - __aicore__ inline void CalcNBatchoffset(uint32_t batchA, uint32_t batchB, uint32_t batchC, uint32_t loopIdx, BmmOffset batchOffset, BmmOffset& batchLoopOffset) - { - if 
constexpr (A_TYPE::layout == LayoutMode::BSNGD) { - uint64_t aLayoutBIdx = loopIdx * batchA / (tiling_.GetALayoutInfoN() * tiling_.GetALayoutInfoG()); - uint64_t aLayoutNGOff = loopIdx * batchA % (tiling_.GetALayoutInfoN() * tiling_.GetALayoutInfoG()); - batchLoopOffset.offA = (aLayoutBIdx * tiling_.GetALayoutInfoD() * tiling_.GetALayoutInfoS() * tiling_.GetALayoutInfoN() * tiling_.GetALayoutInfoG() + - aLayoutNGOff * tiling_.GetALayoutInfoD()) * sizeof(typename A_TYPE::T); - } else { - batchLoopOffset.offA = batchOffset.offA * loopIdx; - } - - if constexpr (B_TYPE::layout == LayoutMode::BSNGD) { - uint64_t bLayoutBIdx = loopIdx * batchB / (tiling_.GetBLayoutInfoN() * tiling_.GetBLayoutInfoG()); - uint64_t bLayoutNGOff = loopIdx * batchB % (tiling_.GetBLayoutInfoN() * tiling_.GetBLayoutInfoG()); - batchLoopOffset.offB = (bLayoutBIdx * tiling_.GetBLayoutInfoD() * tiling_.GetBLayoutInfoS() * tiling_.GetBLayoutInfoN() * tiling_.GetBLayoutInfoG() + - bLayoutNGOff * tiling_.GetBLayoutInfoD()) * sizeof(typename B_TYPE::T); - } else { - batchLoopOffset.offB = batchOffset.offB * loopIdx; - } - - if constexpr (C_TYPE::layout == LayoutMode::BSNGD) { - uint64_t cLayoutBIdx = loopIdx * batchC / (tiling_.GetCLayoutInfoN() * tiling_.GetCLayoutInfoG()); - uint64_t cLayoutNGOff = loopIdx * batchC % (tiling_.GetCLayoutInfoN() * tiling_.GetCLayoutInfoG()); - batchLoopOffset.offC = (cLayoutBIdx * tiling_.GetCLayoutInfoS2() * tiling_.GetCLayoutInfoS1() * tiling_.GetCLayoutInfoN() * tiling_.GetCLayoutInfoG() + - cLayoutNGOff * tiling_.GetCLayoutInfoS2()) * sizeof(typename C_TYPE::T); - } else { - batchLoopOffset.offC = batchOffset.offC * loopIdx; - } - } - #if defined(__DAV_C310__) __aicore__ inline void SetUserDefInfo(MSG_POS KfcMsg* msg) { @@ -310,6 +261,7 @@ public: } } } + __aicore__ inline bool IterateBatch(MSG_POS KfcMsg* msg); __aicore__ inline void StartIterateNBatch(MsgTmpPos MatmulConfigParams* body, uint32_t &cntIterator); __aicore__ inline bool IterateNBatch(MSG_POS KfcMsg* 
msg); diff --git a/impl/matmul/kfc/matmul_server_aux.h b/impl/matmul/kfc/matmul_server_aux.h index 3b220c918096a75b60ca34595eef96cbe303cc89..66bf5ac5b476c721084d87124b26f9b192fa5771 100644 --- a/impl/matmul/kfc/matmul_server_aux.h +++ b/impl/matmul/kfc/matmul_server_aux.h @@ -248,6 +248,14 @@ public: "Iterate not support when enableMixDualMaster is enabled"); return false; }; + + template __aicore__ inline bool Iterate(bool enPartialSum, + const LocalTensor& localCmatrix) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "Iterate not support when enableMixDualMaster is enabled"); + return false; + }; template __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false) @@ -328,7 +336,7 @@ public: template __aicore__ inline void IterateBatch(const GlobalTensor& gm, uint32_t batchA, uint32_t batchB, bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, - const uint32_t matrixStrideC = 0) + const uint32_t matrixStrideC = 0, const bool enPartialSum = false, const uint8_t enAtomic = 0) { ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && "IterateBatch not support when enableMixDualMaster is enabled"); @@ -336,11 +344,28 @@ public: template __aicore__ inline void IterateBatch(const LocalTensor& ubCmatrix, uint32_t batchA, uint32_t batchB, bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, - const uint32_t matrixStrideC = 0) + const uint32_t matrixStrideC = 0, const bool enPartialSum = false, const uint8_t enAtomic = 0) { ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && "IterateBatch not support when enableMixDualMaster is enabled"); } + + __aicore__ inline void IterateBatch(const GlobalTensor& gm, + bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0, + const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0) + { + 
ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "IterateBatch not support when enableMixDualMaster is enabled"); + } + + __aicore__ inline void IterateBatch(const LocalTensor& ubCmatrix, + bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0, + const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "IterateBatch not support when enableMixDualMaster is enabled"); + } + template __aicore__ inline void IterateNBatch(const uint32_t batchLoop, uint32_t batchA, uint32_t batchB, bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, diff --git a/impl/matmul/kfc/matmul_server_impl.h b/impl/matmul/kfc/matmul_server_impl.h index b1a3ee2b3f9db89dbdefd79198a89c4a6d6a6383..f0d2d16fd98ef48dd78e1df841c7bfb8de71da31 100644 --- a/impl/matmul/kfc/matmul_server_impl.h +++ b/impl/matmul/kfc/matmul_server_impl.h @@ -343,13 +343,13 @@ __aicore__ inline void MatmulServicebatchA, body->batchB, batchC, batchOffset); - BmmOffset batchLoopOffset; for (uint32_t loopIdx = 0U; loopIdx < body->batchLoop; loopIdx++) { const uint64_t biasOffset = batchOffsetBias * loopIdx; - CalcNBatchoffset(body->batchA, body->batchB, batchC, loopIdx, batchOffset, batchLoopOffset); + batchLoopOffset.offA = CalcNBatchoffset(body->batchA, loopIdx, tiling_.GetALayoutInfoN(), tiling_.GetALayoutInfoG(), tiling_.GetALayoutInfoD(), tiling_.GetALayoutInfoS()); + batchLoopOffset.offB = CalcNBatchoffset(body->batchB, loopIdx, tiling_.GetBLayoutInfoN(), tiling_.GetBLayoutInfoG(), tiling_.GetBLayoutInfoD(), tiling_.GetBLayoutInfoS()); + batchLoopOffset.offC = CalcNBatchoffset(batchC, loopIdx, tiling_.GetCLayoutInfoN(), tiling_.GetCLayoutInfoG(), tiling_.GetCLayoutInfoS2(), tiling_.GetCLayoutInfoS1()); + IterateSetMessage(body, singleBatchASize, singleBatchBSize, batchLoopOffset.offA, batchLoopOffset.offB, biasOffset); GlobalTensor cGlobal; 
cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(body->cAddr + batchLoopOffset.offC), size); diff --git a/impl/matmul/kfc/matmul_server_impl_c310.h b/impl/matmul/kfc/matmul_server_impl_c310.h index a5f7054e60331512c3d56275c65d035725f117d6..a0042d3c668a243f310fefde9b4ab8d9c0eccd59 100644 --- a/impl/matmul/kfc/matmul_server_impl_c310.h +++ b/impl/matmul/kfc/matmul_server_impl_c310.h @@ -190,22 +190,15 @@ __aicore__ inline void MatmulServicebatchA > body->batchB ? body->batchA : body->batchB; - bool layoutGCondition = tiling_.GetCLayoutInfoG() == 1 && - (tiling_.GetBLayoutInfoG() != 1 || tiling_.GetALayoutInfoG() != 1); - if (layoutGCondition) { - int32_t layoutG = tiling_.GetBLayoutInfoG() > tiling_.GetALayoutInfoG() ? tiling_.GetBLayoutInfoG() : tiling_.GetALayoutInfoG(); - batchC = batchC / layoutG; - } + uint32_t batchC = GetBatchCNum(body->batchA, body->batchB, tiling_.GetALayoutInfoG(), tiling_.GetBLayoutInfoG(), tiling_.GetCLayoutInfoG()); uint64_t batchOffsetBias = tiling_.GetCLayoutInfoS2() * batchC * sizeof(typename BIAS_TYPE::T); - - BmmOffset batchOffset; - CalcBatchoffset(body->batchA, body->batchB, batchC, batchOffset); - BmmOffset batchLoopOffset; for (uint32_t loopIdx = 0U; loopIdx < body->batchLoop; loopIdx++) { const uint64_t biasOffset = batchOffsetBias * loopIdx; - CalcNBatchoffset(body->batchA, body->batchB, batchC, loopIdx, batchOffset, batchLoopOffset); + batchLoopOffset.offA = CalcNBatchoffset(body->batchA, loopIdx, tiling_.GetALayoutInfoN(), tiling_.GetALayoutInfoG(), tiling_.GetALayoutInfoD(), tiling_.GetALayoutInfoS()); + batchLoopOffset.offB = CalcNBatchoffset(body->batchB, loopIdx, tiling_.GetBLayoutInfoN(), tiling_.GetBLayoutInfoG(), tiling_.GetBLayoutInfoD(), tiling_.GetBLayoutInfoS()); + batchLoopOffset.offC = CalcNBatchoffset(batchC, loopIdx, tiling_.GetCLayoutInfoN(), tiling_.GetCLayoutInfoG(), tiling_.GetCLayoutInfoS2(), tiling_.GetCLayoutInfoS1()); + IterateSetMessage(body, singleBatchASize, singleBatchBSize, 
batchLoopOffset.offA, batchLoopOffset.offB, biasOffset); GlobalTensor cGlobal; cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(body->cAddr + batchLoopOffset.offC), size); diff --git a/impl/matmul/kfc/matmul_server_utils.h b/impl/matmul/kfc/matmul_server_utils.h index fe0000b701208a1579bb12ea32bb4dfe26791d52..cdd9a9671221ccc6e19988e3ec9891c856bc7730 100644 --- a/impl/matmul/kfc/matmul_server_utils.h +++ b/impl/matmul/kfc/matmul_server_utils.h @@ -55,12 +55,6 @@ struct MatmulMsg { uint32_t orgKc; }; -struct BmmOffset { - uint64_t offA; - uint64_t offB; - uint64_t offC; -}; - struct ShareMatmulBase { __aicore__ inline ShareMatmulBase() {}; }; diff --git a/impl/matmul/scheduler/batch/batch_scheduler.h b/impl/matmul/scheduler/batch/batch_scheduler.h index 32822d167b3cd564694e8902fa3d144c364363c5..993b5c26bd9be3e0e2bb1994799b6a22c9443602 100644 --- a/impl/matmul/scheduler/batch/batch_scheduler.h +++ b/impl/matmul/scheduler/batch/batch_scheduler.h @@ -93,9 +93,12 @@ public: event_t eventIDMToMte1 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::M_MTE1)); auto batchLoop = MATMUL_MODULE(BatchLoop); for (batchLoop->SplitStart(); !batchLoop->SplitEnd(); batchLoop->SplitNext()) { - MATMUL_MODULE(BatchCopyCubeInA)->BatchLoad(a1, matrixStrideA, batchLoop->GetOuterIndex(), + uint32_t outerIdxA; + uint32_t outerIdxB; + batchLoop->CalcBatchOuterIdx(outerIdxA, outerIdxB); + MATMUL_MODULE(BatchCopyCubeInA)->BatchLoad(a1, matrixStrideA, outerIdxA, batchLoop->GetSplitIndex(), batchLoop->GetSplitSize()); - MATMUL_MODULE(BatchCopyCubeInB)->BatchLoad(b1, matrixStrideB, batchLoop->GetOuterIndex(), + MATMUL_MODULE(BatchCopyCubeInB)->BatchLoad(b1, matrixStrideB, outerIdxB, batchLoop->GetSplitIndex(), batchLoop->GetSplitSize()); SetFlag(eventIDMte2ToMte1); WaitFlag(eventIDMte2ToMte1); @@ -293,6 +296,7 @@ private: ComputeMDb(a1, b1, bias, ctx, sL0CInit, sL0CLast, enPartialSum); } else { ComputeNDb(a1, b1, bias, ctx, sL0CInit, sL0CLast, enPartialSum); + 
MATMUL_MODULE(BiasScheduler)->Free(); } } while(MATMUL_MODULE(KLoop)->OuterNext()); } @@ -336,7 +340,6 @@ private: cmatrixInitVal, false); bufferPool.Free(); - MATMUL_MODULE(BiasScheduler)->Free(); axisL1DbOffset += MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(); } } diff --git a/impl/matmul/scheduler/iterator/batch_loop/batch_loop.h b/impl/matmul/scheduler/iterator/batch_loop/batch_loop.h index 1dc55f7fecf8a46358387b99426c96b53151b582..001ae22d4f712019ba34c06c067865db3d815941 100644 --- a/impl/matmul/scheduler/iterator/batch_loop/batch_loop.h +++ b/impl/matmul/scheduler/iterator/batch_loop/batch_loop.h @@ -14,7 +14,8 @@ #ifndef IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LOOP_H #define IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LOOP_H -#include "batch_loop_multi.h" #include "batch_loop_single.h" +#include "batch_loop_batch_less.h" +#include "batch_loop_batch_large.h" #endif // _BATCH_LOOP_H_ \ No newline at end of file diff --git a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_batch_base.h similarity index 58% rename from impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h rename to impl/matmul/scheduler/iterator/batch_loop/batch_loop_batch_base.h index 72d8bfdf680d906fb8243ae0dcd435e15dcf0667..ca00011443382e8a89be5037af1a46191a5fe3c7 100644 --- a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h +++ b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_batch_base.h @@ -1,7 +1,8 @@ /** + * This program is free software, you can redistribute it and/or modify it. * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. @@ -9,28 +10,26 @@ */ /*! - * \file batch_loop_multi.h + * \file file batch_loop_batch_base.h * \brief */ -#ifndef IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LOOP_MULTI_H -#define IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LOOP_MULTI_H + #ifndef IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LOOP_BASE_H + #define IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LOOP_BASE_H -#include "batch_loop_intf.h" +#include "../../../utils/matmul_module.h" namespace AscendC { namespace Impl { namespace Detail { /* - BatchLoop is considered entirely experimental. + BatchLoopBase is considered entirely experimental. We retain the freedom to make incompatible changes, but do not guarantee the stability. - BatchLoop is only for internal usage, does not support extension or customized specialization! + BatchLoopBase is only for internal usage, does not support extension or customized specialization! */ template -class BatchLoop() == Impl::Detail::CopyCubeInType::BMM) || - (Impl::Detail::IsBMMFromL1())>> +class BatchLoopBase { MATMUL_USE_MODULE(MatmulShapeTiling); MATMUL_USE_MODULE(MatmulShapeInfo); @@ -38,27 +37,26 @@ class BatchLoopGetTiling(); - CalcBatchNum(tiling.GetALayoutInfoB(), tiling.GetBLayoutInfoB(), tiling.GetBatchNum(), tiling.GetBatchNum()); - if constexpr (IsBmmDoubleBuffer()) { auto batchNum = tiling.GetBatchNum(); splitSize_ = (batchNum % DB_FACTOR == 0) ? DB_FACTOR : 1; splitBatchNum_ = batchNum / splitSize_; } - UpdateBatchNumParams(); + CalcBatchNum(tiling.GetBatchNum(), tiling.GetBatchNum()); + batchNum_ = batchA_ > batchB_ ? 
batchA_ : batchB_; } __aicore__ inline void SetBatchNum(int32_t batchNumA, int32_t batchNumB) { - CalcBatchNum(batchNumA, batchNumB, batchNumA, batchNumB); - UpdateBatchNumParams(); + CalcBatchNum(batchNumA, batchNumB); + batchNum_ = batchA_ > batchB_ ? batchA_ : batchB_; } __aicore__ inline void SetNBatchOutNum(int32_t nBatchOutNum) @@ -79,16 +77,6 @@ public: { outerIdx_++; dstOffset_ += batchCalcSize_; - if (oddAndLargeThanL1_ && outerIdx_ == batchOuter_ - 1) { - const int32_t tail = inputBatchNum_ % batchA_; - batchA_ = tail == 0 ? mainBatchInner_ : tail; - batchB_ = batchA_; - batchNum_ = batchA_; - batchCalcSize_ = batchNum_ * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM() * - MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); - splitSize_ = (batchA_ >= DB_FACTOR) ? DB_FACTOR : 1; - splitBatchNum_ = batchNum_ / splitSize_; - } } __aicore__ inline bool OuterEnd() @@ -96,16 +84,6 @@ public: return outerIdx_ >= batchOuter_; } - __aicore__ inline int32_t GetMainBatchBlockA() const - { - return oddAndLargeThanL1_ ? mainBatchInner_ : batchA_; // batchNum main block in outLoop - } - - __aicore__ inline int32_t GetMainBatchBlockB() const - { - return oddAndLargeThanL1_ ? mainBatchInner_ : batchB_; // batchNum main block in outLoop - } - __aicore__ inline uint32_t GetOuterIndex() const { return outerIdx_; @@ -147,9 +125,19 @@ public: return batchB_; } + __aicore__ inline int32_t GetMainBatchBlockA() const + { + return batchA_; // batchNum main block in outLoop + } + + __aicore__ inline int32_t GetMainBatchBlockB() const + { + return batchB_; // batchNum main block in outLoop + } + __aicore__ inline int32_t GetBiasBatchSrcOffset() const { - return outerIdx_ * (oddAndLargeThanL1_ ? 
mainBatchInner_ : batchNum_) * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); + return outerIdx_ * batchNum_ * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); } // Double Buffer Loop @@ -221,18 +209,10 @@ public: __aicore__ inline bool InnerEnd() { - if ((!oddAndLargeThanL1_) || (batchNum_ % DB_FACTOR == 0) || (splitSize_ < DB_FACTOR)) { - if constexpr (IsBmmDoubleBuffer()) { - return (innerIdx_ >= splitBatchNum_) || (splitOuterIdx_ * splitBatchNum_ >= batchNum_) || (innerBatchIdx_ >= batchNum_); - } else { - return (innerIdx_ >= splitBatchNum_) || (splitOuterIdx_ * splitBatchNum_ >= batchNum_); - } - } - const auto firstBatchNum = batchNum_ / splitSize_; - if (splitOuterIdx_ < 1) { - return innerIdx_ >= firstBatchNum; + if constexpr (IsBmmDoubleBuffer()) { + return (innerIdx_ >= splitBatchNum_) || (splitOuterIdx_ * splitBatchNum_ >= batchNum_) || (innerBatchIdx_ >= batchNum_); } else { - return innerIdx_ >= batchNum_ - firstBatchNum; + return (innerIdx_ >= splitBatchNum_) || (splitOuterIdx_ * splitBatchNum_ >= batchNum_); } } @@ -288,49 +268,27 @@ public: } private: - __aicore__ inline void CalcBatchNum(int32_t layoutBatchNumA, int32_t layoutBatchNumB, - int32_t batchNumA, int32_t batchNumB) + __aicore__ inline void CalcBatchNum(int32_t batchNumA, int32_t batchNumB) { totalBatchNum_ = batchNumA > batchNumB ? 
batchNumA : batchNumB; - if constexpr (ToMatmulConfig(MM_CFG).batchMode != BatchMode::BATCH_LARGE_THAN_L1) { - ASSERT(batchNumA > 0 && batchNumB > 0 && - (batchNumA % batchNumB == 0 || batchNumB % batchNumA == 0)); - batchA_ = batchNumA; - batchB_ = batchNumB; - mainBatchInner_ = 0; - return; - } + ASSERT(batchNumA > 0 && batchNumB > 0 && + (batchNumA % batchNumB == 0 || batchNumB % batchNumA == 0)); + batchA_ = batchNumA; + batchB_ = batchNumB; + } - ASSERT(layoutBatchNumA > 0 && layoutBatchNumB > 0 && - (layoutBatchNumA % layoutBatchNumB == 0 || layoutBatchNumB % layoutBatchNumA == 0)); - int32_t aMatrixSingleBatchSize = GetSingleSizeAlignA(); - int32_t bMatrixSingleBatchSize = GetSingleSizeAlignB(); - if ((layoutBatchNumA * aMatrixSingleBatchSize + layoutBatchNumB * bMatrixSingleBatchSize + - MATMUL_MODULE(MatmulShapeTiling)->GetTiling().IsBias() * - MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreN() * sizeof(BiasT)) <= TOTAL_L1_SIZE) { - batchOuter_ = 1; - batchA_ = layoutBatchNumA; - batchB_ = layoutBatchNumB; - return; - } - int32_t batchNumLarge; - int32_t batchNumLess; - int32_t largeMatrixSingleBatchSize; - int32_t lessMatrixSingleBatchSize; - if (layoutBatchNumA >= layoutBatchNumB) { - batchNumLarge = layoutBatchNumA; - batchNumLess = layoutBatchNumB; - largeMatrixSingleBatchSize = aMatrixSingleBatchSize; - lessMatrixSingleBatchSize = bMatrixSingleBatchSize; - } else { - batchNumLarge = layoutBatchNumB; - batchNumLess = layoutBatchNumA; - largeMatrixSingleBatchSize = bMatrixSingleBatchSize; - lessMatrixSingleBatchSize = aMatrixSingleBatchSize; - } - CalcBatchAB(batchNumLarge, batchNumLess, largeMatrixSingleBatchSize, lessMatrixSingleBatchSize, layoutBatchNumA >= layoutBatchNumB); + __aicore__ inline void UpdateSplitParams() + { + splitBatchIdx_ += splitBatchNum_; } + __aicore__ inline void UpdateInnerParams() + { + innerBatchIdx_ = innerIdx_ + splitBatchIdx_; + } + +protected: + __aicore__ inline int32_t GetSingleSizeAlignA() { const auto 
matmulShapeInfo = MATMUL_MODULE(MatmulShapeInfo); @@ -365,65 +323,6 @@ private: } } - __aicore__ inline void CalcBatchAB(int32_t batchNumLarge, int32_t batchNumLess, - int32_t largeMatrixSingleBatchSize, int32_t lessMatrixSingleBatchSize, bool isBatchALarger) - { - int32_t multiples = batchNumLarge / batchNumLess; - int32_t singleBatchSize = multiples * largeMatrixSingleBatchSize + lessMatrixSingleBatchSize + - MATMUL_MODULE(MatmulShapeTiling)->GetTiling().IsBias() * - MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreN() * sizeof(BiasT); - int32_t batchInner = TOTAL_L1_SIZE / singleBatchSize; - inputBatchNum_ = batchNumLarge; - - ASSERT(batchInner > 0); - oddAndLargeThanL1_ = (multiples == 1) && (inputBatchNum_ % DB_FACTOR != 0); - if (oddAndLargeThanL1_) { - mainBatchInner_ = batchInner; - batchOuter_ = CeilT(batchNumLess, batchInner); - batchA_ = batchInner; - batchB_ = batchInner; - } else { - while (batchNumLess % batchInner != 0 && batchInner > 0) { - --batchInner; - } - mainBatchInner_ = batchInner; - batchOuter_ = batchNumLess / batchInner; - if (isBatchALarger) { - batchA_ = multiples * batchInner; - batchB_ = batchInner; - } else { - batchA_ = batchInner; - batchB_ = multiples * batchInner; - } - } - } - - __aicore__ inline void UpdateBatchNumParams() - { - batchNum_ = batchA_ > batchB_ ? batchA_ : batchB_; - if constexpr (!IsBmmDoubleBuffer()) { - if (batchOuter_ > 1 && batchA_ == batchB_) { - splitSize_ = (batchA_ >= DB_FACTOR) ? DB_FACTOR : 1; - splitBatchNum_ = batchNum_ / splitSize_; - } else { - splitSize_ = (batchNum_ >= DB_FACTOR) && (batchA_ % DB_FACTOR == 0) && (batchB_ % DB_FACTOR == 0) - ? 
DB_FACTOR - : 1; - splitBatchNum_ = batchNum_ / splitSize_; - } - } - } - - __aicore__ inline void UpdateSplitParams() - { - splitBatchIdx_ += splitBatchNum_; - } - - __aicore__ inline void UpdateInnerParams() - { - innerBatchIdx_ = innerIdx_ + splitBatchIdx_; - } - int32_t batchA_; // outerLoop main/tail block int32_t batchB_; // outerLoop main/tail block int32_t batchNum_; // outerLoop main/tail block @@ -452,10 +351,8 @@ private: int32_t batchOutOffsetNum_ = 0; int32_t inputBatchNum_ = 0; - bool oddAndLargeThanL1_ = false; // new logical judgment condition for handling odd batchNum && large than L1 - int32_t mainBatchInner_ = 0; // outerLoop main block }; } // namespace Detail } // namespace Impl } // namespace AscendC -#endif // IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LOOP_MULTI_H +#endif // IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LOOP_BASE_H diff --git a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_batch_large.h b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_batch_large.h new file mode 100644 index 0000000000000000000000000000000000000000..a3752a78331ea81d693a8680feeb7d9f5d5d9f5e --- /dev/null +++ b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_batch_large.h @@ -0,0 +1,220 @@ +/* + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file batch_loop_batch_large.h + * \brief + */ +#ifndef IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LARGE_THAN_L1_H +#define IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LARGE_THAN_L1_H + +#include "batch_loop_intf.h" +#include "batch_loop_batch_base.h" + +namespace AscendC { +namespace Impl { +namespace Detail { +/* + BatchLoop is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + BatchLoop is only for internal usage, does not support extension or customized specialization! +*/ +template +class BatchLoop> + : public BatchLoopBase +{ + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(MatmulShapeInfo); + using SrcT = typename INPUT_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + +public: + using BASE_MODULE = AscendC::Impl::Detail::BatchLoopBase; + __aicore__ inline BatchLoop() = default; + __aicore__ inline ~BatchLoop() = default; + + __aicore__ inline void Init() + { + const auto tiling = MATMUL_MODULE(MatmulShapeTiling)->GetTiling(); + CalcBatchNum(tiling.GetALayoutInfoB(), tiling.GetBLayoutInfoB(), tiling.GetBatchNum(), tiling.GetBatchNum()); + UpdateBatchNumParams(); + } + + __aicore__ inline void SetBatchNum(int32_t batchNumA, int32_t batchNumB) + { + CalcBatchNum(batchNumA, batchNumB, batchNumA, batchNumB); + UpdateBatchNumParams(); + } + + __aicore__ inline void OuterNext() + { + BASE_MODULE::outerIdx_++; + BASE_MODULE::dstOffset_ += BASE_MODULE::batchCalcSize_; + if (oddAndLargeThanL1_ && BASE_MODULE::outerIdx_ == BASE_MODULE::batchOuter_ - 1) { + const int32_t tail = BASE_MODULE::inputBatchNum_ % BASE_MODULE::batchA_; + BASE_MODULE::batchA_ = tail == 0 ? 
mainBatchInner_ : tail; + BASE_MODULE::batchB_ = BASE_MODULE::batchA_; + BASE_MODULE::batchNum_ = BASE_MODULE::batchA_; + BASE_MODULE::batchCalcSize_ = BASE_MODULE::batchNum_ * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM() * + MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); + BASE_MODULE::splitSize_ = (BASE_MODULE::batchA_ >= DB_FACTOR) ? DB_FACTOR : 1; + BASE_MODULE::splitBatchNum_ = BASE_MODULE::batchNum_ / BASE_MODULE::splitSize_; + } + } + + __aicore__ inline bool InnerEnd() + { + if ((!oddAndLargeThanL1_) || (BASE_MODULE::batchNum_ % DB_FACTOR == 0) || (BASE_MODULE::splitSize_ < DB_FACTOR)) { + return (BASE_MODULE::innerIdx_ >= BASE_MODULE::splitBatchNum_) || (BASE_MODULE::splitOuterIdx_ * BASE_MODULE::splitBatchNum_ >= BASE_MODULE::batchNum_); + } + const auto firstBatchNum = BASE_MODULE::batchNum_ / BASE_MODULE::splitSize_; + if (BASE_MODULE::splitOuterIdx_ < 1) { + return BASE_MODULE::innerIdx_ >= firstBatchNum; + } else { + return BASE_MODULE::innerIdx_ >= BASE_MODULE::batchNum_ - firstBatchNum; + } + } + + __aicore__ inline void CalcBatchOuterIdx(uint32_t& outerIdxA, uint32_t& outerIdxB) + { + if (outerLoop_ == 1 || BASE_MODULE::batchA_ == BASE_MODULE::batchB_) { + outerIdxA = BASE_MODULE::outerIdx_; + outerIdxB = BASE_MODULE::outerIdx_; + } else if (BASE_MODULE::batchA_ > BASE_MODULE::batchB_) { + outerIdxA = BASE_MODULE::outerIdx_; + outerIdxB = BASE_MODULE::outerIdx_ / outerLoop_; + } else { + outerIdxA = BASE_MODULE::outerIdx_ / outerLoop_; + outerIdxB = BASE_MODULE::outerIdx_; + } + } + + __aicore__ inline int32_t GetMainBatchBlockA() const + { + return oddAndLargeThanL1_ ? mainBatchInner_ : BASE_MODULE::batchA_; // batchNum main block in outLoop + } + + __aicore__ inline int32_t GetMainBatchBlockB() const + { + return oddAndLargeThanL1_ ? mainBatchInner_ : BASE_MODULE::batchB_; // batchNum main block in outLoop + } + + __aicore__ inline int32_t GetBiasBatchSrcOffset() const + { + return BASE_MODULE::outerIdx_ * (oddAndLargeThanL1_ ? 
mainBatchInner_ : BASE_MODULE::batchNum_) * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); + } + +private: + __aicore__ inline void CalcBatchNum(int32_t layoutBatchNumA, int32_t layoutBatchNumB, + int32_t batchNumA, int32_t batchNumB) + { + BASE_MODULE::totalBatchNum_ = batchNumA > batchNumB ? batchNumA : batchNumB; + + ASSERT(layoutBatchNumA > 0 && layoutBatchNumB > 0 && + (layoutBatchNumA % layoutBatchNumB == 0 || layoutBatchNumB % layoutBatchNumA == 0)); + int32_t aMatrixSingleBatchSize = BASE_MODULE::GetSingleSizeAlignA(); + int32_t bMatrixSingleBatchSize = BASE_MODULE::GetSingleSizeAlignB(); + if ((layoutBatchNumA * aMatrixSingleBatchSize + layoutBatchNumB * bMatrixSingleBatchSize + + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().IsBias() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreN() * sizeof(BiasT)) <= TOTAL_L1_SIZE) { + BASE_MODULE::batchA_ = layoutBatchNumA; + BASE_MODULE::batchB_ = layoutBatchNumB; + return; + } + int32_t batchNumLarge; + int32_t batchNumLess; + int32_t largeMatrixSingleBatchSize; + int32_t lessMatrixSingleBatchSize; + if (layoutBatchNumA >= layoutBatchNumB) { + batchNumLarge = layoutBatchNumA; + batchNumLess = layoutBatchNumB; + largeMatrixSingleBatchSize = aMatrixSingleBatchSize; + lessMatrixSingleBatchSize = bMatrixSingleBatchSize; + } else { + batchNumLarge = layoutBatchNumB; + batchNumLess = layoutBatchNumA; + largeMatrixSingleBatchSize = bMatrixSingleBatchSize; + lessMatrixSingleBatchSize = aMatrixSingleBatchSize; + } + CalcBatchAB(batchNumLarge, batchNumLess, largeMatrixSingleBatchSize, lessMatrixSingleBatchSize, layoutBatchNumA >= layoutBatchNumB); + } + + __aicore__ inline void CalcBatchAB(int32_t batchNumLarge, int32_t batchNumLess, int32_t largeMatrixSingleBatchSize, int32_t lessMatrixSingleBatchSize, bool isBatchALarger) + { + int32_t multiples = batchNumLarge / batchNumLess; + int32_t singleBatchSize = multiples * largeMatrixSingleBatchSize + lessMatrixSingleBatchSize + + 
MATMUL_MODULE(MatmulShapeTiling)->GetTiling().IsBias() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreN() * sizeof(BiasT); + + int32_t batchInner = TOTAL_L1_SIZE / singleBatchSize; + BASE_MODULE::inputBatchNum_ = batchNumLarge; + oddAndLargeThanL1_ = (multiples == 1) && (BASE_MODULE::inputBatchNum_ % DB_FACTOR != 0); + if (batchInner <= 0) { + outerLoop_ = 1; + while (batchInner <= 0) { + outerLoop_ += 1; + while (multiples % outerLoop_ != 0 && outerLoop_ < multiples) { + outerLoop_ += 1; + } + singleBatchSize = multiples / outerLoop_ * largeMatrixSingleBatchSize + lessMatrixSingleBatchSize + + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().IsBias() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreN() * sizeof(BiasT); + batchInner = TOTAL_L1_SIZE / singleBatchSize; + } + multiples /= outerLoop_; + } + ASSERT(batchInner > 0); + if (oddAndLargeThanL1_) { + mainBatchInner_ = batchInner; + BASE_MODULE::batchOuter_ = CeilT(batchNumLess, batchInner); + BASE_MODULE::batchA_ = batchInner; + BASE_MODULE::batchB_ = batchInner; + } else { + while (batchNumLess % batchInner != 0 && batchInner > 0) { + --batchInner; + } + mainBatchInner_ = batchInner; + BASE_MODULE::batchOuter_ = batchNumLess / batchInner * outerLoop_; + if (isBatchALarger) { + BASE_MODULE::batchA_ = multiples * batchInner; + BASE_MODULE::batchB_ = batchInner; + } else { + BASE_MODULE::batchA_ = batchInner; + BASE_MODULE::batchB_ = multiples * batchInner; + } + } + } + + __aicore__ inline void UpdateBatchNumParams() + { + BASE_MODULE::batchNum_ = BASE_MODULE::batchA_ > BASE_MODULE::batchB_ ? BASE_MODULE::batchA_ : BASE_MODULE::batchB_; + if constexpr (!IsBmmDoubleBuffer()) { + if (BASE_MODULE::batchOuter_ > 1 && BASE_MODULE::batchA_ == BASE_MODULE::batchB_) { + BASE_MODULE::splitSize_ = (BASE_MODULE::batchA_ >= DB_FACTOR) ? 
DB_FACTOR : 1; + BASE_MODULE::splitBatchNum_ = BASE_MODULE::batchNum_ / BASE_MODULE::splitSize_; + } else { + BASE_MODULE::splitSize_ = (BASE_MODULE::batchNum_ >= DB_FACTOR) && (BASE_MODULE::batchA_ % DB_FACTOR == 0) && + (BASE_MODULE::batchB_ % DB_FACTOR == 0) ? DB_FACTOR : 1; + BASE_MODULE::splitBatchNum_ = BASE_MODULE::batchNum_ / BASE_MODULE::splitSize_; + } + } + } + + int32_t outerLoop_ = 1; + bool oddAndLargeThanL1_ = false; // new logical judgment condition for handling odd batchNum && large than L1 + int32_t mainBatchInner_ = 0; // outerLoop main block +}; +} // namespace Detail +} // namespace Impl +} // namespace AscendC +#endif // IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LARGE_THAN_L1_H diff --git a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_batch_less.h b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_batch_less.h new file mode 100644 index 0000000000000000000000000000000000000000..c30ed8be0e7939597a5fff0bbe90c5a36b38fed2 --- /dev/null +++ b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_batch_less.h @@ -0,0 +1,44 @@ +/* + * This program is free software, you can redistribute it and/or modify it. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file batch_loop_batch_less.h + * \brief + */ +#ifndef IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LOOP_BATCH_LESS_THAN_L1_H +#define IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LOOP_BATCH_LESS_THAN_L1_H + +#include "batch_loop_intf.h" +#include "batch_loop_batch_base.h" + +namespace AscendC { +namespace Impl { +namespace Detail { +/* + BatchLoop is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + BatchLoop is only for internal usage, does not support extension or customized specialization! +*/ +template +class BatchLoop> + : public BatchLoopBase +{ +public: + using BASE_MODULE = AscendC::Impl::Detail::BatchLoopBase; + __aicore__ inline BatchLoop() = default; + __aicore__ inline ~BatchLoop() = default; +}; +} // namespace Detail +} // namespace Impl +} // namespace AscendC +#endif // IMPL_MATMUL_SCHEDULER_ITERATOR_BATCH_LOOP_BATCH_LOOP_BATCH_LESS_THAN_L1_H diff --git a/impl/matmul/utils/batch_matmul_utils.h b/impl/matmul/utils/batch_matmul_utils.h index 2f8f7f407dffc289f3defdc65325f25824c01caa..b9453aeafa4d308639b7ae8d47d118ece8146502 100644 --- a/impl/matmul/utils/batch_matmul_utils.h +++ b/impl/matmul/utils/batch_matmul_utils.h @@ -34,6 +34,91 @@ constexpr bool IsBmmBatchScheduler = DoMatmulNorm(MM_CFG) && template constexpr bool IsBmmSingleScheduler = DoMatmulNorm(MM_CFG) && (A_TYPE::layout == LayoutMode::NORMAL && ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1); + +struct BatchOffsetInfo +{ + int32_t modA; + int32_t divisorA; + int32_t alignA; + int32_t modB; + int32_t divisorB; + int32_t alignB; + int32_t modBias; + int32_t divisorBias; + int32_t alignBias; + bool setBiasFlag {false}; +}; +struct SplitParams +{ + int16_t axisL1Len; + int16_t kAxisL1Len; + int16_t axisL1Offset; + int16_t kAxisL1Offset; + int16_t axisL0Len; +}; + +struct BatchSchedulerContext +{ + int32_t offsetA; + int32_t offsetB; + int32_t offsetBias; + uint32_t reduceGNum; + bool 
isReduceG; + SplitParams aL0Params; + SplitParams bL0Params; +}; + +struct BmmOffset { + uint64_t offA = 0; + uint64_t offB = 0; + uint64_t offC = 0; +}; + +// It is invoked by the matmulV3 operator and cannot be removed at present +__aicore__ inline uint16_t CeilDiv(uint16_t num1, uint16_t num2) +{ + ASSERT(num2 > 0); + return (num1 + num2 - 1) / num2; +} + +// It is invoked by the matmulV3 operator and cannot be removed at present +__aicore__ inline uint16_t CeilAlign(uint16_t num1, uint16_t num2) +{ + ASSERT(num2 > 0); + return CeilDiv(num1, num2) * num2; +} + +template +__aicore__ inline uint64_t CalcNBatchoffset(uint32_t batchValue, uint32_t loopIdx, uint32_t layoutInfoN, uint32_t layoutInfoG, uint32_t layoutInfoD, uint32_t layoutInfoS) +{ + uint32_t alignedSingleCoreN = layoutInfoD; + if constexpr (INPUT_TYPE::format == CubeFormat::ND_ALIGN) { + alignedSingleCoreN = CeilAlign(layoutInfoD, AscendCUtils::GetC0Count(sizeof(typename INPUT_TYPE::T))); + } + uint64_t offset; + if constexpr (INPUT_TYPE::layout == LayoutMode::BNGS1S2 || INPUT_TYPE::layout == LayoutMode::NORMAL) { + offset = alignedSingleCoreN * layoutInfoS * batchValue * loopIdx * sizeof(typename INPUT_TYPE::T); + } else if constexpr (INPUT_TYPE::layout == LayoutMode::SBNGD) { + offset = alignedSingleCoreN * batchValue * loopIdx * sizeof(typename INPUT_TYPE::T); + } else if constexpr (INPUT_TYPE::layout == LayoutMode::BSNGD) { + uint64_t layoutBIdx = loopIdx * batchValue / (layoutInfoN * layoutInfoG); + uint64_t layoutNGOff = loopIdx * batchValue % (layoutInfoN * layoutInfoG); + offset = (layoutBIdx * alignedSingleCoreN * layoutInfoS * layoutInfoN * layoutInfoG + layoutNGOff * alignedSingleCoreN) * sizeof(typename INPUT_TYPE::T); + } + return offset; +} + +__aicore__ inline uint64_t GetBatchCNum(uint32_t batchA, uint32_t batchB, uint32_t aLayoutInfoG, uint32_t bLayoutInfoG, uint32_t cLayoutInfoG) +{ + uint32_t batchC = batchA > batchB ? 
batchA : batchB; + bool layoutGCondition = cLayoutInfoG == 1 && + (aLayoutInfoG != 1 || bLayoutInfoG != 1); + if (layoutGCondition) { + int32_t layoutG = bLayoutInfoG > aLayoutInfoG ? bLayoutInfoG : aLayoutInfoG; + batchC = batchC / layoutG; + } + return batchC; +} } // namespace AscendC #endif // IMPL_MATMUL_UTILS_BATCH_MATMUL_UTILS_H \ No newline at end of file diff --git a/impl/matmul/utils/matmul_module.h b/impl/matmul/utils/matmul_module.h index 7b937d24fb5cadc66e593a6eeb78c89e327944d4..edbd85b00318b66054d9369501235d6debdf10ae 100644 --- a/impl/matmul/utils/matmul_module.h +++ b/impl/matmul/utils/matmul_module.h @@ -163,6 +163,9 @@ MATMUL_PRIVATE_TEMPLATE::type; \ +friend typename AscendC::Impl::Detail::MatmulModuleBaseBase::type; \ +friend typename AscendC::Impl::Detail::MatmulModuleBaseBaseBase::type; \ +friend typename AscendC::Impl::Detail::MatmulModuleRoot::type; \ friend NAME #define MATMUL_ALLOW_USING_TEMPLATE_PRIVATE(NAME, ...) \ diff --git a/impl/matmul/utils/matmul_utils.h b/impl/matmul/utils/matmul_utils.h index 457ec68488c43f8b7380fc6c6300c8908016f86f..105e3ce5690fb535314ae14c0a7aac3839679b65 100644 --- a/impl/matmul/utils/matmul_utils.h +++ b/impl/matmul/utils/matmul_utils.h @@ -88,45 +88,12 @@ struct DataCopyOutParams { uint64_t cbufWorkspaceAddr = 0; }; -struct SplitParams -{ - int16_t axisL1Len; - int16_t kAxisL1Len; - int16_t axisL1Offset; - int16_t kAxisL1Offset; - int16_t axisL0Len; -}; - struct MxSplitParams : public SplitParams { int16_t kAuxMatrixL1Len; int16_t kAuxMatrixL1Offset; }; -struct BatchOffsetInfo -{ - int32_t modA; - int32_t divisorA; - int32_t alignA; - int32_t modB; - int32_t divisorB; - int32_t alignB; - int32_t modBias; - int32_t divisorBias; - int32_t alignBias; - bool setBiasFlag {false}; -}; - -struct BatchSchedulerContext -{ - int32_t offsetA; - int32_t offsetB; - int32_t offsetBias; - uint32_t reduceGNum; - bool isReduceG; - SplitParams aL0Params; - SplitParams bL0Params; -}; template __aicore__ inline constexpr 
int32_t GetC0Size() { if (sizeof(SrcT) == sizeof(float)) { @@ -574,20 +541,6 @@ __aicore__ inline T CeilAlign(T num1, T num2) return Ceil(num1, num2) * num2; } -// It is invoked by the matmulV3 operator and cannot be removed at present -__aicore__ inline uint16_t CeilDiv(uint16_t num1, uint16_t num2) -{ - ASSERT(num2 > 0); - return (num1 + num2 - 1) / num2; -} - -// It is invoked by the matmulV3 operator and cannot be removed at present -__aicore__ inline uint16_t CeilAlign(uint16_t num1, uint16_t num2) -{ - ASSERT(num2 > 0); - return CeilDiv(num1, num2) * num2; -} - template __aicore__ inline constexpr bool IsL0ACache() { diff --git a/impl/normalization/deepnorm/deepnorm_common_impl.h b/impl/normalization/deepnorm/deepnorm_common_impl.h index 33d3f8d8de08f38d8bc99ff5698f27f9b7e86657..48504b4c80c266cea2dee6d49a26c9be500338ec 100644 --- a/impl/normalization/deepnorm/deepnorm_common_impl.h +++ b/impl/normalization/deepnorm/deepnorm_common_impl.h @@ -372,7 +372,7 @@ __aicore__ inline void GetDeepNormOutputPre(const LocalTensor& xSubMean, PipeBarrier(); // all 1 tensor - Duplicate(params.tempTensorC, float(1.0), 1, 1, 1, 8); + Duplicate(params.tempTensorC, static_cast(1.0), 1, 1, 1, 8); PipeBarrier(); // SqrtX = sqrt(addX) diff --git a/impl/normalization/layernorm/layernorm_common_impl.h b/impl/normalization/layernorm/layernorm_common_impl.h index ae1aab74595bbc65981eb38c024fd4bbea8666ab..ed7753592d6b9dac37f8477ba9902af357644682 100644 --- a/impl/normalization/layernorm/layernorm_common_impl.h +++ b/impl/normalization/layernorm/layernorm_common_impl.h @@ -233,7 +233,7 @@ __aicore__ inline void ComputeMeanVariance(const LocalTensor& outputMean, eventId = GetTPipePtr()->FetchEventID(HardEvent::S_V); for (uint32_t j = 0; j < tiling.aCurLength; j++) { - float scalar = float(-1) * outputMean.GetValue(j); + float scalar = static_cast(-1) * outputMean.GetValue(j); SetFlag(eventId); WaitFlag(eventId); Adds(params.tempTensorA[j * para.rLengthWithPadding], params.tempTensorA[j * 
para.rLengthWithPadding], diff --git a/impl/normalization/layernorm/layernorm_tiling_impl.cpp b/impl/normalization/layernorm/layernorm_tiling_impl.cpp index 2ce74faa88896f25443969d1fd60bfe75c2a4df4..628d560ba11744623dfdf01c889c01334e7e21e2 100644 --- a/impl/normalization/layernorm/layernorm_tiling_impl.cpp +++ b/impl/normalization/layernorm/layernorm_tiling_impl.cpp @@ -316,7 +316,7 @@ void GetLayerNormNDTilingInfo(const ge::Shape& srcShape, const uint32_t stackBuf const uint32_t arCurLength = inputRoundSize; const uint32_t aCurLength = meanVarRoundSize; - const float rValueBack = float(1) / static_cast(rLength); + const float rValueBack = static_cast(1) / static_cast(rLength); tiling.set_aLength(aLength); tiling.set_rLength(rLength); diff --git a/impl/normalization/normalize/normalize_common_impl.h b/impl/normalization/normalize/normalize_common_impl.h index 737da6c9821d4b0358d0eabf90c761c9db737e43..ce751300058c0abe3273a7460c0a046f39ddb582 100644 --- a/impl/normalization/normalize/normalize_common_impl.h +++ b/impl/normalization/normalize/normalize_common_impl.h @@ -90,7 +90,7 @@ __aicore__ inline void GetNormalizeOutputRstd(const LocalTensor& dstRstd, Adds(dstRstd, srcVar, epsilon, MASK_PLACEHOLDER, 1, unaryParams); PipeBarrier(); // 2. 
Rsqrt(AddsX) = 1 / Sqrt(AddsX) ==> dstRstd - Duplicate(tmpTensor.tempTensorA, float(1), 1, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); + Duplicate(tmpTensor.tempTensorA, static_cast(1), 1, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); PipeBarrier(); Sqrt(dstRstd, dstRstd, MASK_PLACEHOLDER, 1, unaryParams); PipeBarrier(); diff --git a/impl/pad/pad/pad_base_impl.h b/impl/pad/pad/pad_base_impl.h index 222ec653e02f8ed92ad82137bf615f609f848afc..a8745883d611bab2a73206d5c5d99259cb882772 100644 --- a/impl/pad/pad/pad_base_impl.h +++ b/impl/pad/pad/pad_base_impl.h @@ -19,15 +19,6 @@ #include "kernel_pop_stack_buffer.h" #include "kernel_tiling/kernel_tiling.h" -#if __CCE_AICORE__ == 100 -#include "dav_c100/kernel_operator_vec_transpose_impl.h" -#elif __CCE_AICORE__ == 200 -#include "dav_m200/kernel_operator_vec_transpose_impl.h" -#elif __CCE_AICORE__ == 220 -#include "dav_c220/kernel_operator_vec_transpose_impl.h" -#include "dav_c220/kernel_operator_vec_gather_mask_impl.h" -#endif - namespace AscendC { template __aicore__ inline void DuplicateLastDimImpl(const LocalTensor &dstTensor, const LocalTensor &srcTensor, diff --git a/impl/sort/topk/topk_tiling_impl.cpp b/impl/sort/topk/topk_tiling_impl.cpp index 3f807cf248f7dfe80f5770eccdf6d09a13daef95..459a97ccfc26e8ba0c7fd8ca9d23c15e09be525d 100644 --- a/impl/sort/topk/topk_tiling_impl.cpp +++ b/impl/sort/topk/topk_tiling_impl.cpp @@ -368,9 +368,6 @@ void CheckTopKHostCommon(const char *apiName, const char *hostFuncName, if (mode == TopKMode::TOPK_NSMALL) { ASCENDC_HOST_ASSERT(inner == 32, return, "[%s][%s] In Small mode, the length of the inner axis must be 32!", apiName, hostFuncName); - } else if (mode == TopKMode::TOPK_NORMAL) { - ASCENDC_HOST_ASSERT(inner <= 4096, return, - "[%s][%s] In Normal mode, the maximum length of the inner axis is 4096!", apiName, hostFuncName); } if (socVersion == platform_ascendc::SocVersion::ASCEND310P && isInitIndex == false) { ASCENDC_HOST_ASSERT(inner <= 2048, return, @@ -381,9 +378,9 @@ 
void CheckTopKHostCommon(const char *apiName, const char *hostFuncName, "[%s][%s] The length of the outter axis must be greater than 0!", apiName, hostFuncName); uint64_t ubSize = 0; ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize); - ASCENDC_HOST_ASSERT(uint64_t(inner * outter * dataTypeSize) <= ubSize, return, + ASCENDC_HOST_ASSERT(static_cast(inner * outter * dataTypeSize) <= ubSize, return, "[%s][%s] The size of srcShape is %luB, should be less than UB size.", apiName, hostFuncName, - uint64_t(inner * outter * dataTypeSize)); + static_cast(inner * outter * dataTypeSize)); ASCENDC_HOST_ASSERT(dataTypeSize == TOPK_HALF_SIZE || dataTypeSize == TOPK_FLOAT_SIZE, return, "[%s][%s] Type size %u is unsupported!", apiName, hostFuncName, dataTypeSize); return; diff --git a/impl/transpose/confusion_transpose/confusion_transpose_base_0213.h b/impl/transpose/confusion_transpose/confusion_transpose_base_0213.h index 9adb2316bf354865923f5dfb72751822b24a40ff..9e0db9fac5093ba63ae447482bed45aa1f599c04 100644 --- a/impl/transpose/confusion_transpose/confusion_transpose_base_0213.h +++ b/impl/transpose/confusion_transpose/confusion_transpose_base_0213.h @@ -18,13 +18,6 @@ #include "kernel_pop_stack_buffer.h" #include "kernel_tiling/kernel_tiling.h" -#if __CCE_AICORE__ == 100 -#include "dav_c100/kernel_operator_vec_transpose_impl.h" -#elif __CCE_AICORE__ == 200 -#include "dav_m200/kernel_operator_vec_transpose_impl.h" -#elif __CCE_AICORE__ == 220 -#include "dav_c220/kernel_operator_vec_transpose_impl.h" -#endif namespace AscendC { const uint32_t CUBE_HALF_SIZE = CUBE_MAX_SIZE / 2; diff --git a/lib/matmul/matmul_client.h b/lib/matmul/matmul_client.h index 68d30001ff184681c84370b0593b4539573ede8e..ce936845dbdc3f5786862f0b7fbd17dbf7af9305 100644 --- a/lib/matmul/matmul_client.h +++ b/lib/matmul/matmul_client.h @@ -1080,8 +1080,8 @@ public: { static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, "IterateNBatch not support when enableMixDualMaster is 
enabled."); - static_assert(A_TYPE::layout != LayoutMode::NONE && B_TYPE::layout != LayoutMode::NONE && - A_TYPE::layout != LayoutMode::NORMAL && B_TYPE::layout != LayoutMode::NORMAL && C_TYPE::layout != LayoutMode::NORMAL, + static_assert(A_TYPE::layout != LayoutMode::NONE && B_TYPE::layout != LayoutMode::NONE && + A_TYPE::layout != LayoutMode::NORMAL && B_TYPE::layout != LayoutMode::NORMAL && C_TYPE::layout != LayoutMode::NORMAL, "BMM does not support the layout being NONE or NORMAL"); if constexpr (!ToMatmulConfig(MM_CFG).isNBatch) { return; diff --git a/tests/activation/gelu/test_operator_fast_gelu.cpp b/tests/activation/gelu/test_operator_fast_gelu.cpp index ec97dba56d061217fc444e4bd25bab50f1ea6932..d15dcfb2c4a31bd9e373677a88393ab42cce4fee 100644 --- a/tests/activation/gelu/test_operator_fast_gelu.cpp +++ b/tests/activation/gelu/test_operator_fast_gelu.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git a/tests/activation/gelu/test_operator_fast_gelu_v2.cpp b/tests/activation/gelu/test_operator_fast_gelu_v2.cpp index f4d98b554d2c2d9ab61a536a195eeed455b6023d..6974cdd1c8eacffe24fcac855d8c0337a517ed13 100644 --- a/tests/activation/gelu/test_operator_fast_gelu_v2.cpp +++ b/tests/activation/gelu/test_operator_fast_gelu_v2.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git a/tests/activation/gelu/test_operator_gelu.cpp b/tests/activation/gelu/test_operator_gelu.cpp index 387727a056f0584137ba1e6466d7dbb1807d2ba2..c668d6597bc01a76add3d9e85a9e132575bd8828 100644 --- a/tests/activation/gelu/test_operator_gelu.cpp +++ b/tests/activation/gelu/test_operator_gelu.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git 
a/tests/activation/silu/test_operator_silu.cpp b/tests/activation/silu/test_operator_silu.cpp index a001b04e73d6f689984a7ca600b8d143c9f95387..ad7eeb158f86d04cb2afbc619a9715162afa9ac8 100644 --- a/tests/activation/silu/test_operator_silu.cpp +++ b/tests/activation/silu/test_operator_silu.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git a/tests/activation/softmax/test_operator_softmax_v220.cpp b/tests/activation/softmax/test_operator_softmax_v220.cpp index 1261bd042ad621903ef3f8abe3d84aaaf0b9fc9c..c7eacc538ecfb0ac2c88cdf6e36cd7135dbf8b70 100644 --- a/tests/activation/softmax/test_operator_softmax_v220.cpp +++ b/tests/activation/softmax/test_operator_softmax_v220.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include "kernel_utils.h" #include diff --git a/tests/activation/softmax/test_operator_softmax_v300.cpp b/tests/activation/softmax/test_operator_softmax_v300.cpp index d154acdf180060c01319396dd77b909bc2ee26a4..c8e2e781c7164dd4414e97c61ed576e74ff19fd3 100644 --- a/tests/activation/softmax/test_operator_softmax_v300.cpp +++ b/tests/activation/softmax/test_operator_softmax_v300.cpp @@ -3,7 +3,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include "kernel_utils.h" #include diff --git a/tests/activation/softmax/test_operator_softmaxflashv3_v220.cpp b/tests/activation/softmax/test_operator_softmaxflashv3_v220.cpp index c93705e4926940553be53bd8e7c0b9ee402638cb..84f0888f9c907f1c8d6391470f5e83aa0fe9ffc1 100644 --- a/tests/activation/softmax/test_operator_softmaxflashv3_v220.cpp +++ b/tests/activation/softmax/test_operator_softmaxflashv3_v220.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include "kernel_utils.h" 
#include diff --git a/tests/activation/swish/test_operator_swish.cpp b/tests/activation/swish/test_operator_swish.cpp index 9e9103f3cd5519a781e54d18e6756975a75f4ea0..e37a5b6612c488535dc1e24f7d7ff7f9565fb765 100644 --- a/tests/activation/swish/test_operator_swish.cpp +++ b/tests/activation/swish/test_operator_swish.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git a/tests/common/k3_pvwrap.h b/tests/common/k3_pvwrap.h index 4a34c8d9b686b6de76eb1988611a4e1355600c9e..69f61199697714e118891de0bf49a06616a57d9d 100644 --- a/tests/common/k3_pvwrap.h +++ b/tests/common/k3_pvwrap.h @@ -30,7 +30,7 @@ inline int svSize(svOpenArrayHandle buf, uint32_t size) // Constants const uint32_t PV_MAX_STEP = 1000000; const uint32_t PV_STEP_TIME_OUT = 1000; -const uint64_t PV_LAST_STEP = uint64_t(-1); +const uint64_t PV_LAST_STEP = static_cast(-1); // ------------------------------------------------------------------------------------------------- // APIs for SV, assume no multi-core, so no core_id arg needed diff --git a/tests/filter/dropout/test_operator_dropout.cpp b/tests/filter/dropout/test_operator_dropout.cpp index 7e7b9f41252a6341455e5fe70a51ede1f6e5309f..32dd61b41013286ea126f13ac5a0d7dd1fb1315a 100644 --- a/tests/filter/dropout/test_operator_dropout.cpp +++ b/tests/filter/dropout/test_operator_dropout.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git a/tests/index/arithprogression/test_operator_arithprogression.cpp b/tests/index/arithprogression/test_operator_arithprogression.cpp index ca52d8a259cce0e2604ac1478a1a2e3c02441c64..9ac8d53f59e46e767d4ef600b51a272cf4b84a8b 100644 --- a/tests/index/arithprogression/test_operator_arithprogression.cpp +++ b/tests/index/arithprogression/test_operator_arithprogression.cpp @@ -9,7 +9,7 @@ */ #include #define 
private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git a/tests/math/exp/test_operator_exphighprecision.cpp b/tests/math/exp/test_operator_exphighprecision.cpp index 882199b26e51200bc12152568d5b1e52212662b7..e0bbe7a523c90aca86eba7455af634cf719380f0 100644 --- a/tests/math/exp/test_operator_exphighprecision.cpp +++ b/tests/math/exp/test_operator_exphighprecision.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" diff --git a/tests/matmul/iterator/test_batch_loop.cpp b/tests/matmul/iterator/test_batch_loop.cpp index e74d5f0871c45b0ced7b2fc2d458326636d323bb..f15462c9046a951c83edb28d416d6de679933a9a 100644 --- a/tests/matmul/iterator/test_batch_loop.cpp +++ b/tests/matmul/iterator/test_batch_loop.cpp @@ -23,7 +23,7 @@ #include "impl/matmul/policy/matmul_private_modules.h" #include "impl/matmul/param/matmul_tensor_info.h" #include "impl/matmul/param/matmul_shape_tiling.h" -#include "impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h" +#include "impl/matmul/scheduler/iterator/batch_loop/batch_loop.h" using namespace std; using namespace AscendC; diff --git a/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler.cpp b/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler.cpp index 435ceaa1a10afef839a7f264c37acfe6d6641ffb..eabdf635e10554972e030f1f2da53257e9cc331c 100644 --- a/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler.cpp +++ b/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler.cpp @@ -140,6 +140,10 @@ public: return 0; } + __aicore__ inline void CalcBatchOuterIdx(uint32_t& outerIdxA, uint32_t& outerIdxB) + { + } + private: uint32_t outerIdx_; uint32_t splitIdx_; diff --git a/tests/matmul/test_matmul_channel_split.cpp b/tests/matmul/test_matmul_channel_split.cpp index eef8b04566532cfe7d36e543e1378f5c14bc2846..f36face05f90a160247a0c87a0fae06c6c66fedb 100644 --- 
a/tests/matmul/test_matmul_channel_split.cpp +++ b/tests/matmul/test_matmul_channel_split.cpp @@ -260,8 +260,12 @@ __aicore__ inline void main_kernel_matmul_channel_split(GM_ADDR aGM, GM_ADDR bGM class TEST_KERNEL_MATMUL_CHANNEL_SPLIT : public testing::Test { protected: - void SetUp() {} - void TearDown() {} + void SetUp() { + AscendC::SetGCoreType(1); + } + void TearDown() { + AscendC::SetGCoreType(0); + } }; #define KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL_CHANNEL_SPLIT, tilingParams, A_Pos, B_Pos, C_Pos, BIAS_Pos, A_Format, B_Format, C_Format, BIAS_Format, \ diff --git a/tests/matmul/test_matmul_triangular.cpp b/tests/matmul/test_matmul_triangular.cpp index 82fd851f3c94728454bc890abe9afb3883d8fcbf..f03457260bddd143eba3789b26d414578dfd6397 100644 --- a/tests/matmul/test_matmul_triangular.cpp +++ b/tests/matmul/test_matmul_triangular.cpp @@ -212,8 +212,12 @@ __aicore__ inline void main_kernel_matmul(GM_ADDR aGM, GM_ADDR bGM, GM_ADDR cGM, class TEST_KERNEL_TRIAN_MATMUL : public testing::Test { protected: - void SetUp() {} - void TearDown() {} + void SetUp() { + AscendC::SetGCoreType(1); + } + void TearDown() { + AscendC::SetGCoreType(0); + } }; #define KERNEL_TRIAN_MATMUL_TESTCASE(TEST_KERNEL_TRIAN_MATMUL, tilingParams, A_Pos, B_Pos, C_Pos, BIAS_Pos, A_Format, B_Format, C_Format, BIAS_Format, \ diff --git a/tests/normalization/batchnorm/test_operator_batchnorm.cpp b/tests/normalization/batchnorm/test_operator_batchnorm.cpp index e255c92698306af584bc1ba0cc8b7c58ead086ec..facee7f5e7df1fa866da2428d951fddcfa3dd670 100644 --- a/tests/normalization/batchnorm/test_operator_batchnorm.cpp +++ b/tests/normalization/batchnorm/test_operator_batchnorm.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git a/tests/normalization/deepnorm/test_operator_deepnorm.cpp b/tests/normalization/deepnorm/test_operator_deepnorm.cpp index 
bbc72b7f5227cac56de1d9388b540eb27ae73b2d..c6c1e9434a4fed95113083bceed6e68bfd001552 100644 --- a/tests/normalization/deepnorm/test_operator_deepnorm.cpp +++ b/tests/normalization/deepnorm/test_operator_deepnorm.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" diff --git a/tests/normalization/groupnorm/test_operator_groupnorm.cpp b/tests/normalization/groupnorm/test_operator_groupnorm.cpp index 7dd522b972f24d7d4f178682dd2f454e4547dbcb..71f98669f6fd19bec6483fda46ef9001c95e2c13 100644 --- a/tests/normalization/groupnorm/test_operator_groupnorm.cpp +++ b/tests/normalization/groupnorm/test_operator_groupnorm.cpp @@ -15,7 +15,7 @@ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git a/tests/normalization/layernorm/test_operator_layernorm.cpp b/tests/normalization/layernorm/test_operator_layernorm.cpp index 5875945b7fe71017d333fb3123530abe89257c8f..a3bc7836a767bb7dd097935fc32b3e4894ba57eb 100644 --- a/tests/normalization/layernorm/test_operator_layernorm.cpp +++ b/tests/normalization/layernorm/test_operator_layernorm.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git a/tests/normalization/layernorm/test_operator_layernormgrad.cpp b/tests/normalization/layernorm/test_operator_layernormgrad.cpp index fab91ded70bd33167e92144e1895067c433b7541..81765f38a107560a4ae16e8f73eab92a01561ea6 100644 --- a/tests/normalization/layernorm/test_operator_layernormgrad.cpp +++ b/tests/normalization/layernorm/test_operator_layernormgrad.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git a/tests/normalization/layernorm/test_operator_layernormgradbeta.cpp 
b/tests/normalization/layernorm/test_operator_layernormgradbeta.cpp index 21dd3578e933f8f8532571601022a757841441fc..30d827cb4493f7cd7418bfef2cf8abd9fe89d16d 100644 --- a/tests/normalization/layernorm/test_operator_layernormgradbeta.cpp +++ b/tests/normalization/layernorm/test_operator_layernormgradbeta.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git a/tests/normalization/layernormV2/test_operator_layernormV2.cpp b/tests/normalization/layernormV2/test_operator_layernormV2.cpp index 0af89bd937016bc9d6292b3105f5ccb0e851be3e..fe25a875c04fac7e7ecc5b2eb4e5186db2f0d019 100644 --- a/tests/normalization/layernormV2/test_operator_layernormV2.cpp +++ b/tests/normalization/layernormV2/test_operator_layernormV2.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git a/tests/normalization/normalize/test_operator_normalize.cpp b/tests/normalization/normalize/test_operator_normalize.cpp index fd975d90e6280eafe5326c407f9b53c6bd6c3253..17751b1d2b5a3e3f52f5f5e0e5aeb80c97ed31c5 100644 --- a/tests/normalization/normalize/test_operator_normalize.cpp +++ b/tests/normalization/normalize/test_operator_normalize.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git a/tests/normalization/rmsnorm/test_operator_rmsnorm.cpp b/tests/normalization/rmsnorm/test_operator_rmsnorm.cpp index 42d7072f77d66d95777e09a63ed4a4d94e2896fb..6cf9fb734b8288cd6e7658e14b79552baa6700f9 100644 --- a/tests/normalization/rmsnorm/test_operator_rmsnorm.cpp +++ b/tests/normalization/rmsnorm/test_operator_rmsnorm.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" using namespace std; diff --git 
a/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp b/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp index c5b08ec14784671fd8c33d8ef78da54de7f2366e..1411599de9f87ad10ec3f6fb26f9104701a513e2 100644 --- a/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp +++ b/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include using namespace std; diff --git a/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp b/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp index 654a40a667a8231fb4a12c60adfcbaf5dd40a81c..8833226c845bf6d758bfe39abe025742644ca1e5 100644 --- a/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp +++ b/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include using namespace std; diff --git a/tests/pad/pad/test_operator_pad.cpp b/tests/pad/pad/test_operator_pad.cpp index 8bed6873f30260fd17816b70a1825cb3c9a05b17..48b75a9d7df458fa969f8bd382bc8e018f6b13b6 100644 --- a/tests/pad/pad/test_operator_pad.cpp +++ b/tests/pad/pad/test_operator_pad.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include "kernel_utils.h" #include @@ -156,7 +156,7 @@ class UnPadTestsuite : public testing::Test, protected: void SetUp() { - AscendC::SetGCoreType(1); + AscendC::SetGCoreType(2); } void TearDown() { @@ -354,7 +354,7 @@ class PadTestsuite : public testing::Test, protected: void SetUp() { - AscendC::SetGCoreType(1); + AscendC::SetGCoreType(2); } void TearDown() { diff --git a/tests/quantization/antiquant/test_ascend_antiquant.cpp b/tests/quantization/antiquant/test_ascend_antiquant.cpp index 
105d9151283832a4a330ea8a35e8d870a0e1a398..af9b3a6c882c62f0318fbb4ee6e2385dc05509c1 100644 --- a/tests/quantization/antiquant/test_ascend_antiquant.cpp +++ b/tests/quantization/antiquant/test_ascend_antiquant.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include "kernel_utils.h" diff --git a/tests/quantization/antiquant/test_ascend_antiquant_scalar.cpp b/tests/quantization/antiquant/test_ascend_antiquant_scalar.cpp index f631cf0e31203ba3e6307d60cefc5843da5abb85..1128ed3d8d1f9a095bc81cf01ce66c34408f8ca0 100644 --- a/tests/quantization/antiquant/test_ascend_antiquant_scalar.cpp +++ b/tests/quantization/antiquant/test_ascend_antiquant_scalar.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include "kernel_utils.h" diff --git a/tests/quantization/antiquant/test_ascend_antiquant_weight.cpp b/tests/quantization/antiquant/test_ascend_antiquant_weight.cpp index d66a237374e4205ea6b590e98e738b2ad3bc9da5..1ebd44718ab6fd3394fc7cd12f315ccba81a933d 100644 --- a/tests/quantization/antiquant/test_ascend_antiquant_weight.cpp +++ b/tests/quantization/antiquant/test_ascend_antiquant_weight.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include "kernel_operator_intf.h" #include "kernel_utils.h" diff --git a/tests/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp b/tests/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp index 571c86edb00d98681df8db35ce553bcf87d52914..d4641a89de006e054a7a4a8559298614256cf4ba 100644 --- a/tests/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp +++ b/tests/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include 
"kernel_utils.h" diff --git a/tests/reduce/mean/test_operator_mean.cpp b/tests/reduce/mean/test_operator_mean.cpp index eb96f1846f6254d1555e14431f06b7cb502963e3..cf457c2fb7ae83c9a1d5c18706c3b4c21374a222 100644 --- a/tests/reduce/mean/test_operator_mean.cpp +++ b/tests/reduce/mean/test_operator_mean.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include diff --git a/tests/reduce/sum/test_operator_sum.cpp b/tests/reduce/sum/test_operator_sum.cpp index 015bcf708c1472e8d0cf839e5c08c45d6eeec7f3..6ca219166e4b32853586f090f8e0f2db09e4169f 100644 --- a/tests/reduce/sum/test_operator_sum.cpp +++ b/tests/reduce/sum/test_operator_sum.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include "kernel_utils.h" #include diff --git a/tests/sort/topk/test_operator_topk.cpp b/tests/sort/topk/test_operator_topk.cpp index a1f80967198878df07c453eb10f2e1f2ecccc334..ab8a14f6e0cadf4e176031104a94a9dc7366e81c 100644 --- a/tests/sort/topk/test_operator_topk.cpp +++ b/tests/sort/topk/test_operator_topk.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include using namespace std; diff --git a/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp b/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp index c0624cc1485f1048d5b197d97b4ee116fcdbd38e..f3aa42cd9f07db2927ec0cf7cc95689247a7ef8e 100644 --- a/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp +++ b/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp @@ -9,7 +9,7 @@ */ #include #define private public -#define protect public +#define protected public #include "kernel_operator.h" #include "kernel_utils.h" #include